1/*
2 * Copyright (c) 1997, 2017, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25#include "precompiled.hpp"
26#include "asm/assembler.hpp"
27#include "asm/assembler.inline.hpp"
28#include "compiler/disassembler.hpp"
29#include "gc/shared/cardTableModRefBS.hpp"
30#include "gc/shared/collectedHeap.inline.hpp"
31#include "interpreter/interpreter.hpp"
32#include "memory/resourceArea.hpp"
33#include "memory/universe.hpp"
34#include "oops/klass.inline.hpp"
35#include "prims/methodHandles.hpp"
36#include "runtime/biasedLocking.hpp"
37#include "runtime/interfaceSupport.hpp"
38#include "runtime/objectMonitor.hpp"
39#include "runtime/os.hpp"
40#include "runtime/sharedRuntime.hpp"
41#include "runtime/stubRoutines.hpp"
42#include "runtime/thread.hpp"
43#include "utilities/macros.hpp"
44#if INCLUDE_ALL_GCS
45#include "gc/g1/g1CollectedHeap.inline.hpp"
46#include "gc/g1/g1SATBCardTableModRefBS.hpp"
47#include "gc/g1/heapRegion.hpp"
48#endif // INCLUDE_ALL_GCS
49#include "crc32c.h"
50#ifdef COMPILER2
51#include "opto/intrinsicnode.hpp"
52#endif
53
54#ifdef PRODUCT
55#define BLOCK_COMMENT(str) /* nothing */
56#define STOP(error) stop(error)
57#else
58#define BLOCK_COMMENT(str) block_comment(str)
59#define STOP(error) block_comment(error); stop(error)
60#endif
61
62#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
63
64#ifdef ASSERT
65bool AbstractAssembler::pd_check_instruction_mark() { return true; }
66#endif
67
68static Assembler::Condition reverse[] = {
69    Assembler::noOverflow     /* overflow      = 0x0 */ ,
70    Assembler::overflow       /* noOverflow    = 0x1 */ ,
71    Assembler::aboveEqual     /* carrySet      = 0x2, below         = 0x2 */ ,
72    Assembler::below          /* aboveEqual    = 0x3, carryClear    = 0x3 */ ,
73    Assembler::notZero        /* zero          = 0x4, equal         = 0x4 */ ,
74    Assembler::zero           /* notZero       = 0x5, notEqual      = 0x5 */ ,
75    Assembler::above          /* belowEqual    = 0x6 */ ,
76    Assembler::belowEqual     /* above         = 0x7 */ ,
77    Assembler::positive       /* negative      = 0x8 */ ,
78    Assembler::negative       /* positive      = 0x9 */ ,
79    Assembler::noParity       /* parity        = 0xa */ ,
80    Assembler::parity         /* noParity      = 0xb */ ,
81    Assembler::greaterEqual   /* less          = 0xc */ ,
82    Assembler::less           /* greaterEqual  = 0xd */ ,
83    Assembler::greater        /* lessEqual     = 0xe */ ,
    Assembler::lessEqual      /* greater       = 0xf */
};
87
88
89// Implementation of MacroAssembler
90
// First, all the routines that have distinct 32/64-bit versions,
// unless the difference is trivial (a line or so).
93
94#ifndef _LP64
95
96// 32bit versions
97
98Address MacroAssembler::as_Address(AddressLiteral adr) {
99  return Address(adr.target(), adr.rspec());
100}
101
102Address MacroAssembler::as_Address(ArrayAddress adr) {
103  return Address::make_array(adr);
104}
105
106void MacroAssembler::call_VM_leaf_base(address entry_point,
107                                       int number_of_arguments) {
108  call(RuntimeAddress(entry_point));
109  increment(rsp, number_of_arguments * wordSize);
110}
111
112void MacroAssembler::cmpklass(Address src1, Metadata* obj) {
113  cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
114}
115
116void MacroAssembler::cmpklass(Register src1, Metadata* obj) {
117  cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
118}
119
120void MacroAssembler::cmpoop(Address src1, jobject obj) {
121  cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
122}
123
124void MacroAssembler::cmpoop(Register src1, jobject obj) {
125  cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
126}
127
128void MacroAssembler::extend_sign(Register hi, Register lo) {
129  // According to Intel Doc. AP-526, "Integer Divide", p.18.
130  if (VM_Version::is_P6() && hi == rdx && lo == rax) {
131    cdql();
132  } else {
133    movl(hi, lo);
134    sarl(hi, 31);
135  }
136}
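
// In C++ terms (illustrative only): the fallback computes hi = lo >> 31
// arithmetically, i.e. hi becomes 0 or -1 according to the sign of lo, which is
// exactly what cdq leaves in edx for the edx:eax pair.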
137
138void MacroAssembler::jC2(Register tmp, Label& L) {
139  // set parity bit if FPU flag C2 is set (via rax)
140  save_rax(tmp);
141  fwait(); fnstsw_ax();
142  sahf();
143  restore_rax(tmp);
144  // branch
145  jcc(Assembler::parity, L);
146}
147
148void MacroAssembler::jnC2(Register tmp, Label& L) {
149  // set parity bit if FPU flag C2 is set (via rax)
150  save_rax(tmp);
151  fwait(); fnstsw_ax();
152  sahf();
153  restore_rax(tmp);
154  // branch
155  jcc(Assembler::noParity, L);
156}
157
// 32-bit can do a case-table jump in one instruction, but we no longer allow
// the base to be installed in the Address class.
160void MacroAssembler::jump(ArrayAddress entry) {
161  jmp(as_Address(entry));
162}
163
164// Note: y_lo will be destroyed
165void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
166  // Long compare for Java (semantics as described in JVM spec.)
167  Label high, low, done;
168
169  cmpl(x_hi, y_hi);
170  jcc(Assembler::less, low);
171  jcc(Assembler::greater, high);
172  // x_hi is the return register
173  xorl(x_hi, x_hi);
174  cmpl(x_lo, y_lo);
175  jcc(Assembler::below, low);
176  jcc(Assembler::equal, done);
177
178  bind(high);
179  xorl(x_hi, x_hi);
180  increment(x_hi);
181  jmp(done);
182
183  bind(low);
184  xorl(x_hi, x_hi);
185  decrementl(x_hi);
186
187  bind(done);
188}
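
// A plain C++ model of the Java lcmp semantics implemented above (sketch only;
// jlong/jint stand for the VM's 64-/32-bit integer typedefs):
//
//   static jint lcmp_sketch(jlong x, jlong y) {
//     return (x < y) ? -1 : (x > y) ? 1 : 0;
//   }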
189
190void MacroAssembler::lea(Register dst, AddressLiteral src) {
191    mov_literal32(dst, (int32_t)src.target(), src.rspec());
192}
193
194void MacroAssembler::lea(Address dst, AddressLiteral adr) {
195  // leal(dst, as_Address(adr));
196  // see note in movl as to why we must use a move
197  mov_literal32(dst, (int32_t) adr.target(), adr.rspec());
198}
199
200void MacroAssembler::leave() {
201  mov(rsp, rbp);
202  pop(rbp);
203}
204
205void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
206  // Multiplication of two Java long values stored on the stack
207  // as illustrated below. Result is in rdx:rax.
208  //
209  // rsp ---> [  ??  ] \               \
210  //            ....    | y_rsp_offset  |
211  //          [ y_lo ] /  (in bytes)    | x_rsp_offset
212  //          [ y_hi ]                  | (in bytes)
213  //            ....                    |
214  //          [ x_lo ]                 /
215  //          [ x_hi ]
216  //            ....
217  //
218  // Basic idea: lo(result) = lo(x_lo * y_lo)
219  //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
220  Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
221  Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
222  Label quick;
223  // load x_hi, y_hi and check if quick
224  // multiplication is possible
225  movl(rbx, x_hi);
226  movl(rcx, y_hi);
227  movl(rax, rbx);
228  orl(rbx, rcx);                                 // rbx, = 0 <=> x_hi = 0 and y_hi = 0
229  jcc(Assembler::zero, quick);                   // if rbx, = 0 do quick multiply
230  // do full multiplication
231  // 1st step
232  mull(y_lo);                                    // x_hi * y_lo
233  movl(rbx, rax);                                // save lo(x_hi * y_lo) in rbx,
234  // 2nd step
235  movl(rax, x_lo);
236  mull(rcx);                                     // x_lo * y_hi
237  addl(rbx, rax);                                // add lo(x_lo * y_hi) to rbx,
238  // 3rd step
239  bind(quick);                                   // note: rbx, = 0 if quick multiply!
240  movl(rax, x_lo);
241  mull(y_lo);                                    // x_lo * y_lo
242  addl(rdx, rbx);                                // correct hi(x_lo * y_lo)
243}
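
// The same decomposition in portable C++ (a sketch with hypothetical names,
// not code the VM calls): only the low 32 bits of the cross products can
// affect the truncated 64-bit result.
//
//   static uint64_t lmul_sketch(uint64_t x, uint64_t y) {
//     uint32_t x_lo = (uint32_t)x, x_hi = (uint32_t)(x >> 32);
//     uint32_t y_lo = (uint32_t)y, y_hi = (uint32_t)(y >> 32);
//     uint64_t lo_product    = (uint64_t)x_lo * y_lo;        // rdx:rax after mull
//     uint32_t hi_correction = x_hi * y_lo + x_lo * y_hi;    // added into rdx
//     return lo_product + ((uint64_t)hi_correction << 32);
//   }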
244
245void MacroAssembler::lneg(Register hi, Register lo) {
246  negl(lo);
247  adcl(hi, 0);
248  negl(hi);
249}
250
251void MacroAssembler::lshl(Register hi, Register lo) {
252  // Java shift left long support (semantics as described in JVM spec., p.305)
253  // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
254  // shift value is in rcx !
255  assert(hi != rcx, "must not use rcx");
256  assert(lo != rcx, "must not use rcx");
257  const Register s = rcx;                        // shift count
258  const int      n = BitsPerWord;
259  Label L;
260  andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
261  cmpl(s, n);                                    // if (s < n)
262  jcc(Assembler::less, L);                       // else (s >= n)
263  movl(hi, lo);                                  // x := x << n
264  xorl(lo, lo);
265  // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
266  bind(L);                                       // s (mod n) < n
267  shldl(hi, lo);                                 // x := x << s
268  shll(lo);
269}
270
271
272void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
273  // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
274  // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
275  assert(hi != rcx, "must not use rcx");
276  assert(lo != rcx, "must not use rcx");
277  const Register s = rcx;                        // shift count
278  const int      n = BitsPerWord;
279  Label L;
280  andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
281  cmpl(s, n);                                    // if (s < n)
282  jcc(Assembler::less, L);                       // else (s >= n)
283  movl(lo, hi);                                  // x := x >> n
284  if (sign_extension) sarl(hi, 31);
285  else                xorl(hi, hi);
286  // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
287  bind(L);                                       // s (mod n) < n
288  shrdl(lo, hi);                                 // x := x >> s
289  if (sign_extension) sarl(hi);
290  else                shrl(hi);
291}
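
// For reference, the shift-count >= 32 handling in C++ form (a sketch with
// hypothetical names): move the high word down, regenerate the high word, then
// shift by the remaining count.  The explicit "s -= 32" is omitted in the
// assembly above because x86 shift instructions mask the count to 5 bits anyway.
//
//   static void lshr_sketch(uint32_t& hi, uint32_t& lo, int s, bool sign_extension) {
//     s &= 0x3f;
//     if (s >= 32) {
//       lo = hi;
//       hi = sign_extension ? (uint32_t)((int32_t)hi >> 31) : 0;
//       s -= 32;
//     }
//     lo = (lo >> s) | (s ? (hi << (32 - s)) : 0);                // shrd
//     hi = sign_extension ? (uint32_t)((int32_t)hi >> s) : (hi >> s);
//   }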
292
293void MacroAssembler::movoop(Register dst, jobject obj) {
294  mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
295}
296
297void MacroAssembler::movoop(Address dst, jobject obj) {
298  mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
299}
300
301void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
302  mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
303}
304
305void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
306  mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
307}
308
309void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
310  // scratch register is not used,
311  // it is defined to match parameters of 64-bit version of this method.
312  if (src.is_lval()) {
313    mov_literal32(dst, (intptr_t)src.target(), src.rspec());
314  } else {
315    movl(dst, as_Address(src));
316  }
317}
318
319void MacroAssembler::movptr(ArrayAddress dst, Register src) {
320  movl(as_Address(dst), src);
321}
322
323void MacroAssembler::movptr(Register dst, ArrayAddress src) {
324  movl(dst, as_Address(src));
325}
326
327// src should NEVER be a real pointer. Use AddressLiteral for true pointers
328void MacroAssembler::movptr(Address dst, intptr_t src) {
329  movl(dst, src);
330}
331
332
333void MacroAssembler::pop_callee_saved_registers() {
334  pop(rcx);
335  pop(rdx);
336  pop(rdi);
337  pop(rsi);
338}
339
340void MacroAssembler::pop_fTOS() {
341  fld_d(Address(rsp, 0));
342  addl(rsp, 2 * wordSize);
343}
344
345void MacroAssembler::push_callee_saved_registers() {
346  push(rsi);
347  push(rdi);
348  push(rdx);
349  push(rcx);
350}
351
352void MacroAssembler::push_fTOS() {
353  subl(rsp, 2 * wordSize);
354  fstp_d(Address(rsp, 0));
355}
356
357
358void MacroAssembler::pushoop(jobject obj) {
359  push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
360}
361
362void MacroAssembler::pushklass(Metadata* obj) {
363  push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate());
364}
365
366void MacroAssembler::pushptr(AddressLiteral src) {
367  if (src.is_lval()) {
368    push_literal32((int32_t)src.target(), src.rspec());
369  } else {
370    pushl(as_Address(src));
371  }
372}
373
374void MacroAssembler::set_word_if_not_zero(Register dst) {
375  xorl(dst, dst);
376  set_byte_if_not_zero(dst);
377}
378
379static void pass_arg0(MacroAssembler* masm, Register arg) {
380  masm->push(arg);
381}
382
383static void pass_arg1(MacroAssembler* masm, Register arg) {
384  masm->push(arg);
385}
386
387static void pass_arg2(MacroAssembler* masm, Register arg) {
388  masm->push(arg);
389}
390
391static void pass_arg3(MacroAssembler* masm, Register arg) {
392  masm->push(arg);
393}
394
395#ifndef PRODUCT
396extern "C" void findpc(intptr_t x);
397#endif
398
void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
  // In order to get locks to work, we need to fake an in_VM state
  JavaThread* thread = JavaThread::current();
  JavaThreadState saved_state = thread->thread_state();
  thread->set_thread_state(_thread_in_vm);
  if (ShowMessageBoxOnError) {
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
    // To see where a verify_oop failed, get $ebx+40/X for this frame.
    // This is the value of eip which points to where verify_oop will return.
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip);
      BREAKPOINT;
    }
  } else {
    ttyLocker ttyl;
    ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", msg);
  }
  // Don't assert while holding the ttyLock
  assert(false, "DEBUG MESSAGE: %s", msg);
  ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
}
426
427void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) {
428  ttyLocker ttyl;
429  FlagSetting fs(Debugging, true);
430  tty->print_cr("eip = 0x%08x", eip);
431#ifndef PRODUCT
432  if ((WizardMode || Verbose) && PrintMiscellaneous) {
433    tty->cr();
434    findpc(eip);
435    tty->cr();
436  }
437#endif
438#define PRINT_REG(rax) \
439  { tty->print("%s = ", #rax); os::print_location(tty, rax); }
440  PRINT_REG(rax);
441  PRINT_REG(rbx);
442  PRINT_REG(rcx);
443  PRINT_REG(rdx);
444  PRINT_REG(rdi);
445  PRINT_REG(rsi);
446  PRINT_REG(rbp);
447  PRINT_REG(rsp);
448#undef PRINT_REG
  // Print some words near top of stack.
450  int* dump_sp = (int*) rsp;
451  for (int col1 = 0; col1 < 8; col1++) {
452    tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
453    os::print_location(tty, *dump_sp++);
454  }
455  for (int row = 0; row < 16; row++) {
456    tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
457    for (int col = 0; col < 8; col++) {
458      tty->print(" 0x%08x", *dump_sp++);
459    }
460    tty->cr();
461  }
462  // Print some instructions around pc:
463  Disassembler::decode((address)eip-64, (address)eip);
464  tty->print_cr("--------");
465  Disassembler::decode((address)eip, (address)eip+32);
466}
467
468void MacroAssembler::stop(const char* msg) {
469  ExternalAddress message((address)msg);
470  // push address of message
471  pushptr(message.addr());
472  { Label L; call(L, relocInfo::none); bind(L); }     // push eip
473  pusha();                                            // push registers
474  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
475  hlt();
476}
477
478void MacroAssembler::warn(const char* msg) {
479  push_CPU_state();
480
481  ExternalAddress message((address) msg);
482  // push address of message
483  pushptr(message.addr());
484
485  call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
486  addl(rsp, wordSize);       // discard argument
487  pop_CPU_state();
488}
489
490void MacroAssembler::print_state() {
491  { Label L; call(L, relocInfo::none); bind(L); }     // push eip
492  pusha();                                            // push registers
493
494  push_CPU_state();
495  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32)));
496  pop_CPU_state();
497
498  popa();
499  addl(rsp, wordSize);
500}
501
502#else // _LP64
503
504// 64 bit versions
505
506Address MacroAssembler::as_Address(AddressLiteral adr) {
507  // amd64 always does this as a pc-rel
508  // we can be absolute or disp based on the instruction type
509  // jmp/call are displacements others are absolute
510  assert(!adr.is_lval(), "must be rval");
511  assert(reachable(adr), "must be");
512  return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc());
513
514}
515
516Address MacroAssembler::as_Address(ArrayAddress adr) {
517  AddressLiteral base = adr.base();
518  lea(rscratch1, base);
519  Address index = adr.index();
520  assert(index._disp == 0, "must not have disp"); // maybe it can?
521  Address array(rscratch1, index._index, index._scale, index._disp);
522  return array;
523}
524
525void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
526  Label L, E;
527
528#ifdef _WIN64
  // Windows always allocates space for its register args
530  assert(num_args <= 4, "only register arguments supported");
531  subq(rsp,  frame::arg_reg_save_area_bytes);
532#endif
533
534  // Align stack if necessary
535  testl(rsp, 15);
536  jcc(Assembler::zero, L);
537
538  subq(rsp, 8);
539  {
540    call(RuntimeAddress(entry_point));
541  }
542  addq(rsp, 8);
543  jmp(E);
544
545  bind(L);
546  {
547    call(RuntimeAddress(entry_point));
548  }
549
550  bind(E);
551
552#ifdef _WIN64
553  // restore stack pointer
554  addq(rsp, frame::arg_reg_save_area_bytes);
555#endif
556
557}
558
559void MacroAssembler::cmp64(Register src1, AddressLiteral src2) {
560  assert(!src2.is_lval(), "should use cmpptr");
561
562  if (reachable(src2)) {
563    cmpq(src1, as_Address(src2));
564  } else {
565    lea(rscratch1, src2);
566    Assembler::cmpq(src1, Address(rscratch1, 0));
567  }
568}
569
570int MacroAssembler::corrected_idivq(Register reg) {
571  // Full implementation of Java ldiv and lrem; checks for special
572  // case as described in JVM spec., p.243 & p.271.  The function
573  // returns the (pc) offset of the idivl instruction - may be needed
574  // for implicit exceptions.
575  //
576  //         normal case                           special case
577  //
578  // input : rax: dividend                         min_long
579  //         reg: divisor   (may not be eax/edx)   -1
580  //
581  // output: rax: quotient  (= rax idiv reg)       min_long
582  //         rdx: remainder (= rax irem reg)       0
583  assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
584  static const int64_t min_long = 0x8000000000000000;
585  Label normal_case, special_case;
586
587  // check for special case
588  cmp64(rax, ExternalAddress((address) &min_long));
589  jcc(Assembler::notEqual, normal_case);
590  xorl(rdx, rdx); // prepare rdx for possible special case (where
591                  // remainder = 0)
592  cmpq(reg, -1);
593  jcc(Assembler::equal, special_case);
594
595  // handle normal case
596  bind(normal_case);
597  cdqq();
598  int idivq_offset = offset();
599  idivq(reg);
600
601  // normal and special case exit
602  bind(special_case);
603
604  return idivq_offset;
605}
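
// The special case in C++ terms (illustrative sketch): x86 idiv raises a
// divide error when min_jlong is divided by -1, so the JVM-defined results
// are produced without executing the instruction.
//
//   static void corrected_ldiv_sketch(int64_t dividend, int64_t divisor,
//                                     int64_t* q, int64_t* r) {
//     if (dividend == INT64_MIN && divisor == -1) {
//       *q = INT64_MIN;   // quotient  = min_long
//       *r = 0;           // remainder = 0
//     } else {
//       *q = dividend / divisor;
//       *r = dividend % divisor;
//     }
//   }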
606
607void MacroAssembler::decrementq(Register reg, int value) {
608  if (value == min_jint) { subq(reg, value); return; }
609  if (value <  0) { incrementq(reg, -value); return; }
610  if (value == 0) {                        ; return; }
611  if (value == 1 && UseIncDec) { decq(reg) ; return; }
612  /* else */      { subq(reg, value)       ; return; }
613}
614
615void MacroAssembler::decrementq(Address dst, int value) {
616  if (value == min_jint) { subq(dst, value); return; }
617  if (value <  0) { incrementq(dst, -value); return; }
618  if (value == 0) {                        ; return; }
619  if (value == 1 && UseIncDec) { decq(dst) ; return; }
620  /* else */      { subq(dst, value)       ; return; }
621}
622
623void MacroAssembler::incrementq(AddressLiteral dst) {
624  if (reachable(dst)) {
625    incrementq(as_Address(dst));
626  } else {
627    lea(rscratch1, dst);
628    incrementq(Address(rscratch1, 0));
629  }
630}
631
632void MacroAssembler::incrementq(Register reg, int value) {
633  if (value == min_jint) { addq(reg, value); return; }
634  if (value <  0) { decrementq(reg, -value); return; }
635  if (value == 0) {                        ; return; }
636  if (value == 1 && UseIncDec) { incq(reg) ; return; }
637  /* else */      { addq(reg, value)       ; return; }
638}
639
640void MacroAssembler::incrementq(Address dst, int value) {
641  if (value == min_jint) { addq(dst, value); return; }
642  if (value <  0) { decrementq(dst, -value); return; }
643  if (value == 0) {                        ; return; }
644  if (value == 1 && UseIncDec) { incq(dst) ; return; }
645  /* else */      { addq(dst, value)       ; return; }
646}
647
// 32-bit can do a case-table jump in one instruction, but we no longer allow
// the base to be installed in the Address class.
650void MacroAssembler::jump(ArrayAddress entry) {
651  lea(rscratch1, entry.base());
652  Address dispatch = entry.index();
653  assert(dispatch._base == noreg, "must be");
654  dispatch._base = rscratch1;
655  jmp(dispatch);
656}
657
658void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
659  ShouldNotReachHere(); // 64bit doesn't use two regs
660  cmpq(x_lo, y_lo);
661}
662
663void MacroAssembler::lea(Register dst, AddressLiteral src) {
664    mov_literal64(dst, (intptr_t)src.target(), src.rspec());
665}
666
667void MacroAssembler::lea(Address dst, AddressLiteral adr) {
668  mov_literal64(rscratch1, (intptr_t)adr.target(), adr.rspec());
669  movptr(dst, rscratch1);
670}
671
672void MacroAssembler::leave() {
673  // %%% is this really better? Why not on 32bit too?
674  emit_int8((unsigned char)0xC9); // LEAVE
675}
676
677void MacroAssembler::lneg(Register hi, Register lo) {
678  ShouldNotReachHere(); // 64bit doesn't use two regs
679  negq(lo);
680}
681
682void MacroAssembler::movoop(Register dst, jobject obj) {
683  mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
684}
685
686void MacroAssembler::movoop(Address dst, jobject obj) {
687  mov_literal64(rscratch1, (intptr_t)obj, oop_Relocation::spec_for_immediate());
688  movq(dst, rscratch1);
689}
690
691void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
692  mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
693}
694
695void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
696  mov_literal64(rscratch1, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
697  movq(dst, rscratch1);
698}
699
700void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
701  if (src.is_lval()) {
702    mov_literal64(dst, (intptr_t)src.target(), src.rspec());
703  } else {
704    if (reachable(src)) {
705      movq(dst, as_Address(src));
706    } else {
707      lea(scratch, src);
708      movq(dst, Address(scratch, 0));
709    }
710  }
711}
712
713void MacroAssembler::movptr(ArrayAddress dst, Register src) {
714  movq(as_Address(dst), src);
715}
716
717void MacroAssembler::movptr(Register dst, ArrayAddress src) {
718  movq(dst, as_Address(src));
719}
720
721// src should NEVER be a real pointer. Use AddressLiteral for true pointers
722void MacroAssembler::movptr(Address dst, intptr_t src) {
723  mov64(rscratch1, src);
724  movq(dst, rscratch1);
725}
726
727// These are mostly for initializing NULL
728void MacroAssembler::movptr(Address dst, int32_t src) {
729  movslq(dst, src);
730}
731
732void MacroAssembler::movptr(Register dst, int32_t src) {
733  mov64(dst, (intptr_t)src);
734}
735
736void MacroAssembler::pushoop(jobject obj) {
737  movoop(rscratch1, obj);
738  push(rscratch1);
739}
740
741void MacroAssembler::pushklass(Metadata* obj) {
742  mov_metadata(rscratch1, obj);
743  push(rscratch1);
744}
745
746void MacroAssembler::pushptr(AddressLiteral src) {
747  lea(rscratch1, src);
748  if (src.is_lval()) {
749    push(rscratch1);
750  } else {
751    pushq(Address(rscratch1, 0));
752  }
753}
754
755void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
756  // we must set sp to zero to clear frame
757  movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
758  // must clear fp, so that compiled frames are not confused; it is
759  // possible that we need it only for debugging
760  if (clear_fp) {
761    movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
762  }
763
764  // Always clear the pc because it could have been set by make_walkable()
765  movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
766}
767
768void MacroAssembler::set_last_Java_frame(Register last_java_sp,
769                                         Register last_java_fp,
770                                         address  last_java_pc) {
771  // determine last_java_sp register
772  if (!last_java_sp->is_valid()) {
773    last_java_sp = rsp;
774  }
775
776  // last_java_fp is optional
777  if (last_java_fp->is_valid()) {
778    movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()),
779           last_java_fp);
780  }
781
782  // last_java_pc is optional
783  if (last_java_pc != NULL) {
784    Address java_pc(r15_thread,
785                    JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
786    lea(rscratch1, InternalAddress(last_java_pc));
787    movptr(java_pc, rscratch1);
788  }
789
790  movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
791}
792
793static void pass_arg0(MacroAssembler* masm, Register arg) {
794  if (c_rarg0 != arg ) {
795    masm->mov(c_rarg0, arg);
796  }
797}
798
799static void pass_arg1(MacroAssembler* masm, Register arg) {
800  if (c_rarg1 != arg ) {
801    masm->mov(c_rarg1, arg);
802  }
803}
804
805static void pass_arg2(MacroAssembler* masm, Register arg) {
806  if (c_rarg2 != arg ) {
807    masm->mov(c_rarg2, arg);
808  }
809}
810
811static void pass_arg3(MacroAssembler* masm, Register arg) {
812  if (c_rarg3 != arg ) {
813    masm->mov(c_rarg3, arg);
814  }
815}
816
817void MacroAssembler::stop(const char* msg) {
818  address rip = pc();
819  pusha(); // get regs on stack
820  lea(c_rarg0, ExternalAddress((address) msg));
821  lea(c_rarg1, InternalAddress(rip));
822  movq(c_rarg2, rsp); // pass pointer to regs array
823  andq(rsp, -16); // align stack as required by ABI
824  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
825  hlt();
826}
827
828void MacroAssembler::warn(const char* msg) {
829  push(rbp);
830  movq(rbp, rsp);
831  andq(rsp, -16);     // align stack as required by push_CPU_state and call
832  push_CPU_state();   // keeps alignment at 16 bytes
833  lea(c_rarg0, ExternalAddress((address) msg));
834  call_VM_leaf(CAST_FROM_FN_PTR(address, warning), c_rarg0);
835  pop_CPU_state();
836  mov(rsp, rbp);
837  pop(rbp);
838}
839
840void MacroAssembler::print_state() {
841  address rip = pc();
842  pusha();            // get regs on stack
843  push(rbp);
844  movq(rbp, rsp);
845  andq(rsp, -16);     // align stack as required by push_CPU_state and call
846  push_CPU_state();   // keeps alignment at 16 bytes
847
848  lea(c_rarg0, InternalAddress(rip));
849  lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
850  call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);
851
852  pop_CPU_state();
853  mov(rsp, rbp);
854  pop(rbp);
855  popa();
856}
857
858#ifndef PRODUCT
859extern "C" void findpc(intptr_t x);
860#endif
861
862void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
  // In order to get locks to work, we need to fake an in_VM state
864  if (ShowMessageBoxOnError) {
865    JavaThread* thread = JavaThread::current();
866    JavaThreadState saved_state = thread->thread_state();
867    thread->set_thread_state(_thread_in_vm);
868#ifndef PRODUCT
869    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
870      ttyLocker ttyl;
871      BytecodeCounter::print();
872    }
873#endif
874    // To see where a verify_oop failed, get $ebx+40/X for this frame.
875    // XXX correct this offset for amd64
876    // This is the value of eip which points to where verify_oop will return.
877    if (os::message_box(msg, "Execution stopped, print registers?")) {
878      print_state64(pc, regs);
879      BREAKPOINT;
880      assert(false, "start up GDB");
881    }
882    ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
883  } else {
884    ttyLocker ttyl;
885    ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
886                    msg);
887    assert(false, "DEBUG MESSAGE: %s", msg);
888  }
889}
890
891void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
892  ttyLocker ttyl;
893  FlagSetting fs(Debugging, true);
894  tty->print_cr("rip = 0x%016lx", pc);
895#ifndef PRODUCT
896  tty->cr();
897  findpc(pc);
898  tty->cr();
899#endif
900#define PRINT_REG(rax, value) \
901  { tty->print("%s = ", #rax); os::print_location(tty, value); }
902  PRINT_REG(rax, regs[15]);
903  PRINT_REG(rbx, regs[12]);
904  PRINT_REG(rcx, regs[14]);
905  PRINT_REG(rdx, regs[13]);
906  PRINT_REG(rdi, regs[8]);
907  PRINT_REG(rsi, regs[9]);
908  PRINT_REG(rbp, regs[10]);
909  PRINT_REG(rsp, regs[11]);
910  PRINT_REG(r8 , regs[7]);
911  PRINT_REG(r9 , regs[6]);
912  PRINT_REG(r10, regs[5]);
913  PRINT_REG(r11, regs[4]);
914  PRINT_REG(r12, regs[3]);
915  PRINT_REG(r13, regs[2]);
916  PRINT_REG(r14, regs[1]);
917  PRINT_REG(r15, regs[0]);
918#undef PRINT_REG
  // Print some words near top of stack.
920  int64_t* rsp = (int64_t*) regs[11];
921  int64_t* dump_sp = rsp;
922  for (int col1 = 0; col1 < 8; col1++) {
923    tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (int64_t)dump_sp);
924    os::print_location(tty, *dump_sp++);
925  }
926  for (int row = 0; row < 25; row++) {
927    tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (int64_t)dump_sp);
928    for (int col = 0; col < 4; col++) {
929      tty->print(" 0x%016lx", *dump_sp++);
930    }
931    tty->cr();
932  }
933  // Print some instructions around pc:
934  Disassembler::decode((address)pc-64, (address)pc);
935  tty->print_cr("--------");
936  Disassembler::decode((address)pc, (address)pc+32);
937}
938
939#endif // _LP64
940
941// Now versions that are common to 32/64 bit
942
943void MacroAssembler::addptr(Register dst, int32_t imm32) {
944  LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
945}
946
947void MacroAssembler::addptr(Register dst, Register src) {
948  LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
949}
950
951void MacroAssembler::addptr(Address dst, Register src) {
952  LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
953}
954
955void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src) {
956  if (reachable(src)) {
957    Assembler::addsd(dst, as_Address(src));
958  } else {
959    lea(rscratch1, src);
960    Assembler::addsd(dst, Address(rscratch1, 0));
961  }
962}
963
964void MacroAssembler::addss(XMMRegister dst, AddressLiteral src) {
965  if (reachable(src)) {
966    addss(dst, as_Address(src));
967  } else {
968    lea(rscratch1, src);
969    addss(dst, Address(rscratch1, 0));
970  }
971}
972
973void MacroAssembler::addpd(XMMRegister dst, AddressLiteral src) {
974  if (reachable(src)) {
975    Assembler::addpd(dst, as_Address(src));
976  } else {
977    lea(rscratch1, src);
978    Assembler::addpd(dst, Address(rscratch1, 0));
979  }
980}
981
982void MacroAssembler::align(int modulus) {
983  align(modulus, offset());
984}
985
986void MacroAssembler::align(int modulus, int target) {
987  if (target % modulus != 0) {
988    nop(modulus - (target % modulus));
989  }
990}
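
// Worked example: with modulus == 16 and offset() == 0x3d, 16 - (0x3d % 16) == 3,
// so three nop bytes are emitted and the next instruction starts at 0x40.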
991
992void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src) {
993  // Used in sign-masking with aligned address.
994  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
995  if (reachable(src)) {
996    Assembler::andpd(dst, as_Address(src));
997  } else {
998    lea(rscratch1, src);
999    Assembler::andpd(dst, Address(rscratch1, 0));
1000  }
1001}
1002
1003void MacroAssembler::andps(XMMRegister dst, AddressLiteral src) {
1004  // Used in sign-masking with aligned address.
1005  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1006  if (reachable(src)) {
1007    Assembler::andps(dst, as_Address(src));
1008  } else {
1009    lea(rscratch1, src);
1010    Assembler::andps(dst, Address(rscratch1, 0));
1011  }
1012}
1013
1014void MacroAssembler::andptr(Register dst, int32_t imm32) {
1015  LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
1016}
1017
1018void MacroAssembler::atomic_incl(Address counter_addr) {
1019  if (os::is_MP())
1020    lock();
1021  incrementl(counter_addr);
1022}
1023
1024void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register scr) {
1025  if (reachable(counter_addr)) {
1026    atomic_incl(as_Address(counter_addr));
1027  } else {
1028    lea(scr, counter_addr);
1029    atomic_incl(Address(scr, 0));
1030  }
1031}
1032
1033#ifdef _LP64
1034void MacroAssembler::atomic_incq(Address counter_addr) {
1035  if (os::is_MP())
1036    lock();
1037  incrementq(counter_addr);
1038}
1039
1040void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register scr) {
1041  if (reachable(counter_addr)) {
1042    atomic_incq(as_Address(counter_addr));
1043  } else {
1044    lea(scr, counter_addr);
1045    atomic_incq(Address(scr, 0));
1046  }
1047}
1048#endif
1049
// Writes to successive stack pages until the given offset is reached, checking
// for stack overflow + shadow pages.  This clobbers tmp.
1052void MacroAssembler::bang_stack_size(Register size, Register tmp) {
1053  movptr(tmp, rsp);
1054  // Bang stack for total size given plus shadow page size.
1055  // Bang one page at a time because large size can bang beyond yellow and
1056  // red zones.
1057  Label loop;
1058  bind(loop);
1059  movl(Address(tmp, (-os::vm_page_size())), size );
1060  subptr(tmp, os::vm_page_size());
1061  subl(size, os::vm_page_size());
1062  jcc(Assembler::greater, loop);
1063
1064  // Bang down shadow pages too.
1065  // At this point, (tmp-0) is the last address touched, so don't
1066  // touch it again.  (It was touched as (tmp-pagesize) but then tmp
1067  // was post-decremented.)  Skip this address by starting at i=1, and
1068  // touch a few more pages below.  N.B.  It is important to touch all
1069  // the way down including all pages in the shadow zone.
1070  for (int i = 1; i < ((int)JavaThread::stack_shadow_zone_size() / os::vm_page_size()); i++) {
    // this could be any sized move but it can be a debugging crumb
1072    // so the bigger the better.
1073    movptr(Address(tmp, (-i*os::vm_page_size())), size );
1074  }
1075}
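
// Roughly equivalent C++ (a sketch with hypothetical names; the point is that
// every page in the requested range plus the shadow zone gets written once):
//
//   static void bang_stack_size_sketch(char* sp, intptr_t size, intptr_t page, intptr_t shadow) {
//     char* tmp = sp;
//     for (intptr_t remaining = size; remaining > 0; remaining -= page) {
//       tmp -= page;
//       *(volatile intptr_t*)tmp = remaining;              // one store per page
//     }
//     for (intptr_t i = 1; i < shadow / page; i++) {
//       *(volatile intptr_t*)(tmp - i * page) = size;      // shadow pages too
//     }
//   }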
1076
1077void MacroAssembler::reserved_stack_check() {
1078    // testing if reserved zone needs to be enabled
1079    Label no_reserved_zone_enabling;
1080    Register thread = NOT_LP64(rsi) LP64_ONLY(r15_thread);
1081    NOT_LP64(get_thread(rsi);)
1082
1083    cmpptr(rsp, Address(thread, JavaThread::reserved_stack_activation_offset()));
1084    jcc(Assembler::below, no_reserved_zone_enabling);
1085
1086    call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), thread);
1087    jump(RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
1088    should_not_reach_here();
1089
1090    bind(no_reserved_zone_enabling);
1091}
1092
1093int MacroAssembler::biased_locking_enter(Register lock_reg,
1094                                         Register obj_reg,
1095                                         Register swap_reg,
1096                                         Register tmp_reg,
1097                                         bool swap_reg_contains_mark,
1098                                         Label& done,
1099                                         Label* slow_case,
1100                                         BiasedLockingCounters* counters) {
1101  assert(UseBiasedLocking, "why call this otherwise?");
1102  assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
1103  assert(tmp_reg != noreg, "tmp_reg must be supplied");
1104  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
1105  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
1106  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
1107  NOT_LP64( Address saved_mark_addr(lock_reg, 0); )
1108
1109  if (PrintBiasedLockingStatistics && counters == NULL) {
1110    counters = BiasedLocking::counters();
1111  }
1112  // Biased locking
1113  // See whether the lock is currently biased toward our thread and
1114  // whether the epoch is still valid
1115  // Note that the runtime guarantees sufficient alignment of JavaThread
1116  // pointers to allow age to be placed into low bits
1117  // First check to see whether biasing is even enabled for this object
1118  Label cas_label;
1119  int null_check_offset = -1;
1120  if (!swap_reg_contains_mark) {
1121    null_check_offset = offset();
1122    movptr(swap_reg, mark_addr);
1123  }
1124  movptr(tmp_reg, swap_reg);
1125  andptr(tmp_reg, markOopDesc::biased_lock_mask_in_place);
1126  cmpptr(tmp_reg, markOopDesc::biased_lock_pattern);
1127  jcc(Assembler::notEqual, cas_label);
1128  // The bias pattern is present in the object's header. Need to check
1129  // whether the bias owner and the epoch are both still current.
1130#ifndef _LP64
1131  // Note that because there is no current thread register on x86_32 we
1132  // need to store off the mark word we read out of the object to
1133  // avoid reloading it and needing to recheck invariants below. This
1134  // store is unfortunate but it makes the overall code shorter and
1135  // simpler.
1136  movptr(saved_mark_addr, swap_reg);
1137#endif
1138  if (swap_reg_contains_mark) {
1139    null_check_offset = offset();
1140  }
1141  load_prototype_header(tmp_reg, obj_reg);
1142#ifdef _LP64
1143  orptr(tmp_reg, r15_thread);
1144  xorptr(tmp_reg, swap_reg);
1145  Register header_reg = tmp_reg;
1146#else
1147  xorptr(tmp_reg, swap_reg);
1148  get_thread(swap_reg);
1149  xorptr(swap_reg, tmp_reg);
1150  Register header_reg = swap_reg;
1151#endif
1152  andptr(header_reg, ~((int) markOopDesc::age_mask_in_place));
1153  if (counters != NULL) {
1154    cond_inc32(Assembler::zero,
1155               ExternalAddress((address) counters->biased_lock_entry_count_addr()));
1156  }
1157  jcc(Assembler::equal, done);
1158
1159  Label try_revoke_bias;
1160  Label try_rebias;
1161
1162  // At this point we know that the header has the bias pattern and
1163  // that we are not the bias owner in the current epoch. We need to
1164  // figure out more details about the state of the header in order to
1165  // know what operations can be legally performed on the object's
1166  // header.
1167
1168  // If the low three bits in the xor result aren't clear, that means
1169  // the prototype header is no longer biased and we have to revoke
1170  // the bias on this object.
1171  testptr(header_reg, markOopDesc::biased_lock_mask_in_place);
1172  jccb(Assembler::notZero, try_revoke_bias);
1173
1174  // Biasing is still enabled for this data type. See whether the
1175  // epoch of the current bias is still valid, meaning that the epoch
1176  // bits of the mark word are equal to the epoch bits of the
1177  // prototype header. (Note that the prototype header's epoch bits
1178  // only change at a safepoint.) If not, attempt to rebias the object
1179  // toward the current thread. Note that we must be absolutely sure
1180  // that the current epoch is invalid in order to do this because
1181  // otherwise the manipulations it performs on the mark word are
1182  // illegal.
1183  testptr(header_reg, markOopDesc::epoch_mask_in_place);
1184  jccb(Assembler::notZero, try_rebias);
1185
1186  // The epoch of the current bias is still valid but we know nothing
1187  // about the owner; it might be set or it might be clear. Try to
1188  // acquire the bias of the object using an atomic operation. If this
1189  // fails we will go in to the runtime to revoke the object's bias.
1190  // Note that we first construct the presumed unbiased header so we
1191  // don't accidentally blow away another thread's valid bias.
1192  NOT_LP64( movptr(swap_reg, saved_mark_addr); )
1193  andptr(swap_reg,
1194         markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
1195#ifdef _LP64
1196  movptr(tmp_reg, swap_reg);
1197  orptr(tmp_reg, r15_thread);
1198#else
1199  get_thread(tmp_reg);
1200  orptr(tmp_reg, swap_reg);
1201#endif
1202  if (os::is_MP()) {
1203    lock();
1204  }
1205  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
1206  // If the biasing toward our thread failed, this means that
1207  // another thread succeeded in biasing it toward itself and we
1208  // need to revoke that bias. The revocation will occur in the
1209  // interpreter runtime in the slow case.
1210  if (counters != NULL) {
1211    cond_inc32(Assembler::zero,
1212               ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
1213  }
1214  if (slow_case != NULL) {
1215    jcc(Assembler::notZero, *slow_case);
1216  }
1217  jmp(done);
1218
1219  bind(try_rebias);
1220  // At this point we know the epoch has expired, meaning that the
1221  // current "bias owner", if any, is actually invalid. Under these
1222  // circumstances _only_, we are allowed to use the current header's
1223  // value as the comparison value when doing the cas to acquire the
1224  // bias in the current epoch. In other words, we allow transfer of
1225  // the bias from one thread to another directly in this situation.
1226  //
1227  // FIXME: due to a lack of registers we currently blow away the age
1228  // bits in this situation. Should attempt to preserve them.
1229  load_prototype_header(tmp_reg, obj_reg);
1230#ifdef _LP64
1231  orptr(tmp_reg, r15_thread);
1232#else
1233  get_thread(swap_reg);
1234  orptr(tmp_reg, swap_reg);
1235  movptr(swap_reg, saved_mark_addr);
1236#endif
1237  if (os::is_MP()) {
1238    lock();
1239  }
1240  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
1241  // If the biasing toward our thread failed, then another thread
1242  // succeeded in biasing it toward itself and we need to revoke that
1243  // bias. The revocation will occur in the runtime in the slow case.
1244  if (counters != NULL) {
1245    cond_inc32(Assembler::zero,
1246               ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
1247  }
1248  if (slow_case != NULL) {
1249    jcc(Assembler::notZero, *slow_case);
1250  }
1251  jmp(done);
1252
1253  bind(try_revoke_bias);
1254  // The prototype mark in the klass doesn't have the bias bit set any
1255  // more, indicating that objects of this data type are not supposed
1256  // to be biased any more. We are going to try to reset the mark of
1257  // this object to the prototype value and fall through to the
1258  // CAS-based locking scheme. Note that if our CAS fails, it means
1259  // that another thread raced us for the privilege of revoking the
1260  // bias of this particular object, so it's okay to continue in the
1261  // normal locking code.
1262  //
1263  // FIXME: due to a lack of registers we currently blow away the age
1264  // bits in this situation. Should attempt to preserve them.
1265  NOT_LP64( movptr(swap_reg, saved_mark_addr); )
1266  load_prototype_header(tmp_reg, obj_reg);
1267  if (os::is_MP()) {
1268    lock();
1269  }
1270  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
1271  // Fall through to the normal CAS-based lock, because no matter what
1272  // the result of the above CAS, some thread must have succeeded in
1273  // removing the bias bit from the object's header.
1274  if (counters != NULL) {
1275    cond_inc32(Assembler::zero,
1276               ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
1277  }
1278
1279  bind(cas_label);
1280
1281  return null_check_offset;
1282}
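
// The two tests at the top of this routine, written out as C++-style
// pseudocode (sketch; bit-field constants as defined in markOop.hpp):
//
//   bool has_bias_pattern = (mark & biased_lock_mask_in_place) == biased_lock_pattern;
//   bool bias_is_ours     = ((mark ^ (prototype_header | (intptr_t)thread))
//                            & ~(intptr_t)age_mask_in_place) == 0;
//
// Only when has_bias_pattern is false do we fall through to the CAS-based path.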
1283
1284void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
1285  assert(UseBiasedLocking, "why call this otherwise?");
1286
1287  // Check for biased locking unlock case, which is a no-op
1288  // Note: we do not have to check the thread ID for two reasons.
1289  // First, the interpreter checks for IllegalMonitorStateException at
1290  // a higher level. Second, if the bias was revoked while we held the
1291  // lock, the object could not be rebiased toward another thread, so
1292  // the bias bit would be clear.
1293  movptr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
1294  andptr(temp_reg, markOopDesc::biased_lock_mask_in_place);
1295  cmpptr(temp_reg, markOopDesc::biased_lock_pattern);
1296  jcc(Assembler::equal, done);
1297}
1298
1299#ifdef COMPILER2
1300
1301#if INCLUDE_RTM_OPT
1302
1303// Update rtm_counters based on abort status
1304// input: abort_status
1305//        rtm_counters (RTMLockingCounters*)
1306// flags are killed
1307void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
1308
1309  atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
1310  if (PrintPreciseRTMLockingStatistics) {
1311    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
1312      Label check_abort;
1313      testl(abort_status, (1<<i));
1314      jccb(Assembler::equal, check_abort);
1315      atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
1316      bind(check_abort);
1317    }
1318  }
1319}
1320
1321// Branch if (random & (count-1) != 0), count is 2^n
1322// tmp, scr and flags are killed
1323void MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
1324  assert(tmp == rax, "");
1325  assert(scr == rdx, "");
1326  rdtsc(); // modifies EDX:EAX
1327  andptr(tmp, count-1);
1328  jccb(Assembler::notZero, brLabel);
1329}
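
// Equivalently (sketch): treat the low bits of the time-stamp counter as a
// cheap pseudo-random number and branch when (tsc & (count - 1)) != 0, so the
// fall-through path is taken roughly once per 'count' calls (count is 2^n).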
1330
1331// Perform abort ratio calculation, set no_rtm bit if high ratio
1332// input:  rtm_counters_Reg (RTMLockingCounters* address)
1333// tmpReg, rtm_counters_Reg and flags are killed
1334void MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
1335                                                 Register rtm_counters_Reg,
1336                                                 RTMLockingCounters* rtm_counters,
1337                                                 Metadata* method_data) {
1338  Label L_done, L_check_always_rtm1, L_check_always_rtm2;
1339
1340  if (RTMLockingCalculationDelay > 0) {
1341    // Delay calculation
1342    movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
1343    testptr(tmpReg, tmpReg);
1344    jccb(Assembler::equal, L_done);
1345  }
1346  // Abort ratio calculation only if abort_count > RTMAbortThreshold
1347  //   Aborted transactions = abort_count * 100
1348  //   All transactions = total_count *  RTMTotalCountIncrRate
1349  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
1350
1351  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
1352  cmpptr(tmpReg, RTMAbortThreshold);
1353  jccb(Assembler::below, L_check_always_rtm2);
1354  imulptr(tmpReg, tmpReg, 100);
1355
1356  Register scrReg = rtm_counters_Reg;
1357  movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
1358  imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
1359  imulptr(scrReg, scrReg, RTMAbortRatio);
1360  cmpptr(tmpReg, scrReg);
1361  jccb(Assembler::below, L_check_always_rtm1);
1362  if (method_data != NULL) {
1363    // set rtm_state to "no rtm" in MDO
1364    mov_metadata(tmpReg, method_data);
1365    if (os::is_MP()) {
1366      lock();
1367    }
1368    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
1369  }
1370  jmpb(L_done);
1371  bind(L_check_always_rtm1);
1372  // Reload RTMLockingCounters* address
1373  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
1374  bind(L_check_always_rtm2);
1375  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
1376  cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
1377  jccb(Assembler::below, L_done);
1378  if (method_data != NULL) {
1379    // set rtm_state to "always rtm" in MDO
1380    mov_metadata(tmpReg, method_data);
1381    if (os::is_MP()) {
1382      lock();
1383    }
1384    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
1385  }
1386  bind(L_done);
1387}
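
// The decision above in plain C++ (sketch; assumes RTMAbortRatio is expressed
// as a percentage, which is why the abort count is scaled by 100):
//
//   uintx aborted = abort_count * 100;
//   uintx total   = total_count * RTMTotalCountIncrRate;
//   if (abort_count >= RTMAbortThreshold && aborted >= total * RTMAbortRatio) {
//     /* set the MDO's rtm_state to NoRTM: too many aborts, give up on RTM  */
//   } else if (total_count >= RTMLockingThreshold / RTMTotalCountIncrRate) {
//     /* set the MDO's rtm_state to UseRTM: enough clean history, always RTM */
//   }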
1388
1389// Update counters and perform abort ratio calculation
1390// input:  abort_status_Reg
1391// rtm_counters_Reg, flags are killed
1392void MacroAssembler::rtm_profiling(Register abort_status_Reg,
1393                                   Register rtm_counters_Reg,
1394                                   RTMLockingCounters* rtm_counters,
1395                                   Metadata* method_data,
1396                                   bool profile_rtm) {
1397
1398  assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
1399  // update rtm counters based on rax value at abort
1400  // reads abort_status_Reg, updates flags
1401  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
1402  rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
1403  if (profile_rtm) {
1404    // Save abort status because abort_status_Reg is used by following code.
1405    if (RTMRetryCount > 0) {
1406      push(abort_status_Reg);
1407    }
1408    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
1409    rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
1410    // restore abort status
1411    if (RTMRetryCount > 0) {
1412      pop(abort_status_Reg);
1413    }
1414  }
1415}
1416
1417// Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
1418// inputs: retry_count_Reg
1419//       : abort_status_Reg
1420// output: retry_count_Reg decremented by 1
1421// flags are killed
1422void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
1423  Label doneRetry;
1424  assert(abort_status_Reg == rax, "");
1425  // The abort reason bits are in eax (see all states in rtmLocking.hpp)
1426  // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
1427  // if reason is in 0x6 and retry count != 0 then retry
1428  andptr(abort_status_Reg, 0x6);
1429  jccb(Assembler::zero, doneRetry);
1430  testl(retry_count_Reg, retry_count_Reg);
1431  jccb(Assembler::zero, doneRetry);
1432  pause();
1433  decrementl(retry_count_Reg);
1434  jmp(retryLabel);
1435  bind(doneRetry);
1436}
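
// In outline (sketch): retry only while retries remain and the abort status
// carries the "can retry" (0x2) or "memory conflict" (0x4) bits.
//
//   if ((abort_status & 0x6) != 0 && retry_count != 0) {
//     pause();
//     retry_count--;
//     goto retry;      // i.e. jmp(retryLabel)
//   }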
1437
1438// Spin and retry if lock is busy,
1439// inputs: box_Reg (monitor address)
1440//       : retry_count_Reg
1441// output: retry_count_Reg decremented by 1
1442//       : clear z flag if retry count exceeded
1443// tmp_Reg, scr_Reg, flags are killed
1444void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
1445                                            Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
1446  Label SpinLoop, SpinExit, doneRetry;
1447  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
1448
1449  testl(retry_count_Reg, retry_count_Reg);
1450  jccb(Assembler::zero, doneRetry);
1451  decrementl(retry_count_Reg);
1452  movptr(scr_Reg, RTMSpinLoopCount);
1453
1454  bind(SpinLoop);
1455  pause();
1456  decrementl(scr_Reg);
1457  jccb(Assembler::lessEqual, SpinExit);
1458  movptr(tmp_Reg, Address(box_Reg, owner_offset));
1459  testptr(tmp_Reg, tmp_Reg);
1460  jccb(Assembler::notZero, SpinLoop);
1461
1462  bind(SpinExit);
1463  jmp(retryLabel);
1464  bind(doneRetry);
1465  incrementl(retry_count_Reg); // clear z flag
1466}
1467
1468// Use RTM for normal stack locks
1469// Input: objReg (object to lock)
1470void MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
1471                                       Register retry_on_abort_count_Reg,
1472                                       RTMLockingCounters* stack_rtm_counters,
1473                                       Metadata* method_data, bool profile_rtm,
1474                                       Label& DONE_LABEL, Label& IsInflated) {
1475  assert(UseRTMForStackLocks, "why call this otherwise?");
1476  assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
1477  assert(tmpReg == rax, "");
1478  assert(scrReg == rdx, "");
1479  Label L_rtm_retry, L_decrement_retry, L_on_abort;
1480
1481  if (RTMRetryCount > 0) {
1482    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
1483    bind(L_rtm_retry);
1484  }
1485  movptr(tmpReg, Address(objReg, 0));
1486  testptr(tmpReg, markOopDesc::monitor_value);  // inflated vs stack-locked|neutral|biased
1487  jcc(Assembler::notZero, IsInflated);
1488
1489  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1490    Label L_noincrement;
1491    if (RTMTotalCountIncrRate > 1) {
1492      // tmpReg, scrReg and flags are killed
1493      branch_on_random_using_rdtsc(tmpReg, scrReg, (int)RTMTotalCountIncrRate, L_noincrement);
1494    }
1495    assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
1496    atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
1497    bind(L_noincrement);
1498  }
1499  xbegin(L_on_abort);
1500  movptr(tmpReg, Address(objReg, 0));       // fetch markword
1501  andptr(tmpReg, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
1502  cmpptr(tmpReg, markOopDesc::unlocked_value);            // bits = 001 unlocked
1503  jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
1504
1505  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
1506  if (UseRTMXendForLockBusy) {
1507    xend();
1508    movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
1509    jmp(L_decrement_retry);
1510  }
1511  else {
1512    xabort(0);
1513  }
1514  bind(L_on_abort);
1515  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1516    rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
1517  }
1518  bind(L_decrement_retry);
1519  if (RTMRetryCount > 0) {
1520    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
1521    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
1522  }
1523}
1524
1525// Use RTM for inflating locks
1526// inputs: objReg (object to lock)
1527//         boxReg (on-stack box address (displaced header location) - KILLED)
1528//         tmpReg (ObjectMonitor address + markOopDesc::monitor_value)
1529void MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
1530                                          Register scrReg, Register retry_on_busy_count_Reg,
1531                                          Register retry_on_abort_count_Reg,
1532                                          RTMLockingCounters* rtm_counters,
1533                                          Metadata* method_data, bool profile_rtm,
1534                                          Label& DONE_LABEL) {
1535  assert(UseRTMLocking, "why call this otherwise?");
1536  assert(tmpReg == rax, "");
1537  assert(scrReg == rdx, "");
1538  Label L_rtm_retry, L_decrement_retry, L_on_abort;
1539  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
1540
1541  // Without cast to int32_t a movptr will destroy r10 which is typically obj
1542  movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
1543  movptr(boxReg, tmpReg); // Save ObjectMonitor address
1544
1545  if (RTMRetryCount > 0) {
1546    movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
1547    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
1548    bind(L_rtm_retry);
1549  }
1550  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1551    Label L_noincrement;
1552    if (RTMTotalCountIncrRate > 1) {
1553      // tmpReg, scrReg and flags are killed
1554      branch_on_random_using_rdtsc(tmpReg, scrReg, (int)RTMTotalCountIncrRate, L_noincrement);
1555    }
1556    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
1557    atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
1558    bind(L_noincrement);
1559  }
1560  xbegin(L_on_abort);
1561  movptr(tmpReg, Address(objReg, 0));
1562  movptr(tmpReg, Address(tmpReg, owner_offset));
1563  testptr(tmpReg, tmpReg);
1564  jcc(Assembler::zero, DONE_LABEL);
1565  if (UseRTMXendForLockBusy) {
1566    xend();
1567    jmp(L_decrement_retry);
1568  }
1569  else {
1570    xabort(0);
1571  }
1572  bind(L_on_abort);
1573  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
1574  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1575    rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
1576  }
1577  if (RTMRetryCount > 0) {
1578    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
1579    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
1580  }
1581
1582  movptr(tmpReg, Address(boxReg, owner_offset)) ;
1583  testptr(tmpReg, tmpReg) ;
1584  jccb(Assembler::notZero, L_decrement_retry) ;
1585
1586  // Appears unlocked - try to swing _owner from null to non-null.
1587  // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
1588#ifdef _LP64
1589  Register threadReg = r15_thread;
1590#else
1591  get_thread(scrReg);
1592  Register threadReg = scrReg;
1593#endif
1594  if (os::is_MP()) {
1595    lock();
1596  }
1597  cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
1598
1599  if (RTMRetryCount > 0) {
1600    // If the CAS succeeded we are done, otherwise retry.
1601    jccb(Assembler::equal, DONE_LABEL) ;
1602    bind(L_decrement_retry);
1603    // Spin and retry if lock is busy.
1604    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
1605  }
1606  else {
1607    bind(L_decrement_retry);
1608  }
1609}
1610
1611#endif //  INCLUDE_RTM_OPT
1612
1613// Fast_Lock and Fast_Unlock used by C2
1614
1615// Because the transitions from emitted code to the runtime
1616// monitorenter/exit helper stubs are so slow it's critical that
1617// we inline both the stack-locking fast-path and the inflated fast path.
1618//
1619// See also: cmpFastLock and cmpFastUnlock.
1620//
1621// What follows is a specialized inline transliteration of the code
1622// in slow_enter() and slow_exit().  If we're concerned about I$ bloat
1623// another option would be to emit TrySlowEnter and TrySlowExit methods
1624// at startup-time.  These methods would accept arguments as
1625// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
1626// indications in the icc.ZFlag.  Fast_Lock and Fast_Unlock would simply
1627// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
1628// In practice, however, the # of lock sites is bounded and is usually small.
1629// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
1630// if the processor uses simple bimodal branch predictors keyed by EIP,
1631// since the helper routines would be called from multiple synchronization
1632// sites.
1633//
1634// An even better approach would be write "MonitorEnter()" and "MonitorExit()"
1635// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
1636// to those specialized methods.  That'd give us a mostly platform-independent
1637// implementation that the JITs could optimize and inline at their pleasure.
1638// Done correctly, the only time we'd need to cross to native code would be
1639// to park() or unpark() threads.  We'd also need a few more unsafe operators
1640// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
1641// (b) provide explicit barriers or fence operations.
1642//
1643// TODO:
1644//
1645// *  Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr).
1646//    This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals.
1647//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
1648//    the lock operators would typically be faster than reifying Self.
1649//
1650// *  Ideally I'd define the primitives as:
1651//       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
1652//       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
1653//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
1654//    Instead, we're stuck with the rather awkward and brittle register assignments below.
1655//    Furthermore the register assignments are overconstrained, possibly resulting in
1656//    sub-optimal code near the synchronization site.
1657//
1658// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
1659//    Alternately, use a better sp-proximity test.
1660//
1661// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
1662//    Either one is sufficient to uniquely identify a thread.
1663//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
1664//
1665// *  Intrinsify notify() and notifyAll() for the common cases where the
1666//    object is locked by the calling thread but the waitlist is empty.
1667//    This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
1668//
1669// *  use jccb and jmpb instead of jcc and jmp to improve code density.
1670//    But beware of excessive branch density on AMD Opterons.
1671//
1672// *  Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success
1673//    or failure of the fast-path.  If the fast-path fails then we pass
1674//    control to the slow-path, typically in C.  In Fast_Lock and
1675//    Fast_Unlock we often branch to DONE_LABEL, just to find that C2
1676//    will emit a conditional branch immediately after the node.
1677//    So we have branches to branches and lots of ICC.ZF games.
1678//    Instead, it might be better to have C2 pass a "FailureLabel"
1679//    into Fast_Lock and Fast_Unlock.  In the case of success, control
1680//    will drop through the node.  ICC.ZF is undefined at exit.
1681//    In the case of failure, the node will branch directly to the
1682//    FailureLabel.
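//
// As a rough illustration only (not the exact code C2 emits), the consumer of
// the ZF protocol described above behaves approximately like:
//
//    fast_lock(obj, box, tmp, scr, ...)   // leaves success/failure in ICC.ZF
//    jne   slow_path_call                 // ZF == 0 -> call the runtime helper
//    ...                                  // ZF == 1 -> lock held, fall through
//
// which is why the branch-to-branch patterns noted above arise.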
1683
1684
1685// obj: object to lock
1686// box: on-stack box address (displaced header location) - KILLED
1687// rax: tmp -- KILLED
1688// scr: tmp -- KILLED
1689void MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
1690                               Register scrReg, Register cx1Reg, Register cx2Reg,
1691                               BiasedLockingCounters* counters,
1692                               RTMLockingCounters* rtm_counters,
1693                               RTMLockingCounters* stack_rtm_counters,
1694                               Metadata* method_data,
1695                               bool use_rtm, bool profile_rtm) {
1696  // Ensure the register assignments are disjoint
1697  assert(tmpReg == rax, "");
1698
1699  if (use_rtm) {
1700    assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
1701  } else {
1702    assert(cx1Reg == noreg, "");
1703    assert(cx2Reg == noreg, "");
1704    assert_different_registers(objReg, boxReg, tmpReg, scrReg);
1705  }
1706
1707  if (counters != NULL) {
1708    atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
1709  }
1710  if (EmitSync & 1) {
1711      // set box->dhw = markOopDesc::unused_mark()
1712      // Force all sync thru slow-path: slow_enter() and slow_exit()
1713      movptr (Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
1714      cmpptr (rsp, (int32_t)NULL_WORD);
1715  } else {
1716    // Possible cases that we'll encounter in fast_lock
1717    // ------------------------------------------------
1718    // * Inflated
1719    //    -- unlocked
1720    //    -- Locked
1721    //       = by self
1722    //       = by other
1723    // * biased
1724    //    -- by Self
1725    //    -- by other
1726    // * neutral
1727    // * stack-locked
1728    //    -- by self
1729    //       = sp-proximity test hits
1730    //       = sp-proximity test generates false-negative
1731    //    -- by other
1732    //
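    // For reference, the markword triage below keys off the low lock bits,
    // assuming the usual markOop encoding (see markOop.hpp):
    //   [ptr              | 00]  stack-locked (ptr = displaced header on the stack)
    //   [header           | 01]  unlocked / neutral
    //   [ptr              | 10]  inflated (ptr + monitor_value = ObjectMonitor*)
    //   [thread|epoch|age|101]  biased toward the encoded thread (the *101b case below)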
1733
1734    Label IsInflated, DONE_LABEL;
1735
1736    // it's stack-locked, biased or neutral
1737    // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
1738    // order to reduce the number of conditional branches in the most common cases.
1739    // Beware -- there's a subtle invariant that fetch of the markword
1740    // at [FETCH], below, will never observe a biased encoding (*101b).
1741    // If this invariant is not held we risk exclusion (safety) failure.
1742    if (UseBiasedLocking && !UseOptoBiasInlining) {
1743      biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, counters);
1744    }
1745
1746#if INCLUDE_RTM_OPT
1747    if (UseRTMForStackLocks && use_rtm) {
1748      rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
1749                        stack_rtm_counters, method_data, profile_rtm,
1750                        DONE_LABEL, IsInflated);
1751    }
1752#endif // INCLUDE_RTM_OPT
1753
1754    movptr(tmpReg, Address(objReg, 0));          // [FETCH]
1755    testptr(tmpReg, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased
1756    jccb(Assembler::notZero, IsInflated);
1757
1758    // Attempt stack-locking ...
1759    orptr (tmpReg, markOopDesc::unlocked_value);
1760    movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
1761    if (os::is_MP()) {
1762      lock();
1763    }
1764    cmpxchgptr(boxReg, Address(objReg, 0));      // Updates tmpReg
1765    if (counters != NULL) {
1766      cond_inc32(Assembler::equal,
1767                 ExternalAddress((address)counters->fast_path_entry_count_addr()));
1768    }
1769    jcc(Assembler::equal, DONE_LABEL);           // Success
1770
1771    // Recursive locking.
1772    // The object is stack-locked: markword contains stack pointer to BasicLock.
1773    // Locked by current thread if difference with current SP is less than one page.
1774    subptr(tmpReg, rsp);
1775    // The next instruction sets ZFlag == 1 (success) if the difference is less than one page.
1776    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
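    // Roughly (a sketch, assuming a 4K page): ZF ends up set iff
    //   0 <= (mark - rsp) < page_size  and the low lock bits of mark are clear,
    // i.e. the displaced-header pointer lies within the current thread's stack
    // page and the object is stack-locked, so this is a recursive lock by this
    // thread.  The resulting 0 is stored into the box below so Fast_Unlock can
    // recognize the recursive (displaced header == 0) case.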
1777    movptr(Address(boxReg, 0), tmpReg);
1778    if (counters != NULL) {
1779      cond_inc32(Assembler::equal,
1780                 ExternalAddress((address)counters->fast_path_entry_count_addr()));
1781    }
1782    jmp(DONE_LABEL);
1783
1784    bind(IsInflated);
1785    // The object is inflated. tmpReg contains the ObjectMonitor* + markOopDesc::monitor_value.
1786
1787#if INCLUDE_RTM_OPT
1788    // Use the same RTM locking code in 32- and 64-bit VM.
1789    if (use_rtm) {
1790      rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
1791                           rtm_counters, method_data, profile_rtm, DONE_LABEL);
1792    } else {
1793#endif // INCLUDE_RTM_OPT
1794
1795#ifndef _LP64
1796    // The object is inflated.
1797
1798    // boxReg refers to the on-stack BasicLock in the current frame.
1799    // We'd like to write:
1800    //   set box->_displaced_header = markOopDesc::unused_mark().  Any non-0 value suffices.
1801    // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
1802    // additional latency as we have another ST in the store buffer that must drain.
1803
1804    if (EmitSync & 8192) {
1805       movptr(Address(boxReg, 0), 3);            // results in ST-before-CAS penalty
1806       get_thread (scrReg);
1807       movptr(boxReg, tmpReg);                    // consider: LEA box, [tmp-2]
1808       movptr(tmpReg, NULL_WORD);                 // consider: xor vs mov
1809       if (os::is_MP()) {
1810         lock();
1811       }
1812       cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1813    } else
1814    if ((EmitSync & 128) == 0) {                      // avoid ST-before-CAS
1815       // register juggle because we need tmpReg for cmpxchgptr below
1816       movptr(scrReg, boxReg);
1817       movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
1818
1819       // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
1820       if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
1821          // prefetchw [eax + Offset(_owner)-2]
1822          prefetchw(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1823       }
1824
1825       if ((EmitSync & 64) == 0) {
1826         // Optimistic form: consider XORL tmpReg,tmpReg
1827         movptr(tmpReg, NULL_WORD);
1828       } else {
1829         // Can suffer RTS->RTO upgrades on shared or cold $ lines
1830         // Test-And-CAS instead of CAS
1831         movptr(tmpReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));   // rax, = m->_owner
1832         testptr(tmpReg, tmpReg);                   // Locked ?
1833         jccb  (Assembler::notZero, DONE_LABEL);
1834       }
1835
1836       // Appears unlocked - try to swing _owner from null to non-null.
1837       // Ideally, I'd manifest "Self" with get_thread and then attempt
1838       // to CAS the register containing Self into m->Owner.
1839       // But we don't have enough registers, so instead we can either try to CAS
1840       // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
1841       // we later store "Self" into m->Owner.  Transiently storing a stack address
1842       // (rsp or the address of the box) into  m->owner is harmless.
1843       // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
1844       if (os::is_MP()) {
1845         lock();
1846       }
1847       cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1848       movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
1849       // If we weren't able to swing _owner from NULL to the BasicLock
1850       // then take the slow path.
1851       jccb  (Assembler::notZero, DONE_LABEL);
1852       // update _owner from BasicLock to thread
1853       get_thread (scrReg);                    // beware: clobbers ICCs
1854       movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
1855       xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
1856
1857       // If the CAS fails we can either retry or pass control to the slow-path.
1858       // We use the latter tactic.
1859       // Pass the CAS result in the icc.ZFlag into DONE_LABEL
1860       // If the CAS was successful ...
1861       //   Self has acquired the lock
1862       //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
1863       // Intentional fall-through into DONE_LABEL ...
1864    } else {
1865       movptr(Address(boxReg, 0), intptr_t(markOopDesc::unused_mark()));  // results in ST-before-CAS penalty
1866       movptr(boxReg, tmpReg);
1867
1868       // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
1869       if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
1870          // prefetchw [eax + Offset(_owner)-2]
1871          prefetchw(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1872       }
1873
1874       if ((EmitSync & 64) == 0) {
1875         // Optimistic form
1876         xorptr  (tmpReg, tmpReg);
1877       } else {
1878         // Can suffer RTS->RTO upgrades on shared or cold $ lines
1879         movptr(tmpReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));   // rax, = m->_owner
1880         testptr(tmpReg, tmpReg);                   // Locked ?
1881         jccb  (Assembler::notZero, DONE_LABEL);
1882       }
1883
1884       // Appears unlocked - try to swing _owner from null to non-null.
1885       // Use either "Self" (in scr) or rsp as thread identity in _owner.
1886       // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
1887       get_thread (scrReg);
1888       if (os::is_MP()) {
1889         lock();
1890       }
1891       cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1892
1893       // If the CAS fails we can either retry or pass control to the slow-path.
1894       // We use the latter tactic.
1895       // Pass the CAS result in the icc.ZFlag into DONE_LABEL
1896       // If the CAS was successful ...
1897       //   Self has acquired the lock
1898       //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
1899       // Intentional fall-through into DONE_LABEL ...
1900    }
1901#else // _LP64
1902    // It's inflated
1903    movq(scrReg, tmpReg);
1904    xorq(tmpReg, tmpReg);
1905
1906    if (os::is_MP()) {
1907      lock();
1908    }
1909    cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1910    // Unconditionally set box->_displaced_header = markOopDesc::unused_mark().
1911    // Without cast to int32_t movptr will destroy r10 which is typically obj.
1912    movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
1913    // Intentional fall-through into DONE_LABEL ...
1914    // Propagate ICC.ZF from CAS above into DONE_LABEL.
1915#endif // _LP64
1916#if INCLUDE_RTM_OPT
1917    } // use_rtm()
1918#endif
1919    // DONE_LABEL is a hot target - we'd really like to place it at the
1920    // start of a cache line by padding with NOPs.
1921    // See the AMD and Intel software optimization manuals for the
1922    // most efficient "long" NOP encodings.
1923    // Unfortunately none of our alignment mechanisms suffice.
1924    bind(DONE_LABEL);
1925
1926    // At DONE_LABEL the icc ZFlag is set as follows ...
1927    // Fast_Unlock uses the same protocol.
1928    // ZFlag == 1 -> Success
1929    // ZFlag == 0 -> Failure - force control through the slow-path
1930  }
1931}
1932
1933// obj: object to unlock
1934// box: box address (displaced header location), killed.  Must be EAX.
1935// tmp: killed, cannot be obj nor box.
1936//
1937// Some commentary on balanced locking:
1938//
1939// Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
1940// Methods that don't have provably balanced locking are forced to run in the
1941// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
1942// The interpreter provides two properties:
1943// I1:  At return-time the interpreter automatically and quietly unlocks any
1944//      objects acquired by the current activation (frame).  Recall that the
1945//      interpreter maintains an on-stack list of locks currently held by
1946//      a frame.
1947// I2:  If a method attempts to unlock an object that is not held by
1948//      the frame, the interpreter throws IMSX.
1949//
1950// Let's say A(), which has provably balanced locking, acquires O and then calls B().
1951// B() doesn't have provably balanced locking so it runs in the interpreter.
1952// Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
1953// is still locked by A().
1954//
1955// The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
1956// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
1957// should not be unlocked by "normal" java-level locking and vice-versa.  The specification
1958// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
1959// Arguably given that the spec legislates the JNI case as undefined our implementation
1960// could reasonably *avoid* checking owner in Fast_Unlock().
1961// In the interest of performance we elide m->Owner==Self check in unlock.
1962// A perfectly viable alternative is to elide the owner check except when
1963// Xcheck:jni is enabled.
1964
1965void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
1966  assert(boxReg == rax, "");
1967  assert_different_registers(objReg, boxReg, tmpReg);
1968
1969  if (EmitSync & 4) {
1970    // Disable - inhibit all inlining.  Force control through the slow-path
1971    cmpptr (rsp, 0);
1972  } else {
1973    Label DONE_LABEL, Stacked, CheckSucc;
1974
1975    // Critically, the biased locking test must have precedence over
1976    // and appear before the (box->dhw == 0) recursive stack-lock test.
1977    if (UseBiasedLocking && !UseOptoBiasInlining) {
1978       biased_locking_exit(objReg, tmpReg, DONE_LABEL);
1979    }
1980
1981#if INCLUDE_RTM_OPT
1982    if (UseRTMForStackLocks && use_rtm) {
1983      assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
1984      Label L_regular_unlock;
1985      movptr(tmpReg, Address(objReg, 0));           // fetch markword
1986      andptr(tmpReg, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
1987      cmpptr(tmpReg, markOopDesc::unlocked_value);            // bits = 001 unlocked
1988      jccb(Assembler::notEqual, L_regular_unlock);  // if !HLE RegularLock
1989      xend();                                       // otherwise end...
1990      jmp(DONE_LABEL);                              // ... and we're done
1991      bind(L_regular_unlock);
1992    }
1993#endif
1994
1995    cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD); // Examine the displaced header
1996    jcc   (Assembler::zero, DONE_LABEL);            // 0 indicates recursive stack-lock
1997    movptr(tmpReg, Address(objReg, 0));             // Examine the object's markword
1998    testptr(tmpReg, markOopDesc::monitor_value);    // Inflated?
1999    jccb  (Assembler::zero, Stacked);
2000
2001    // It's inflated.
2002#if INCLUDE_RTM_OPT
2003    if (use_rtm) {
2004      Label L_regular_inflated_unlock;
2005      int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
2006      movptr(boxReg, Address(tmpReg, owner_offset));
2007      testptr(boxReg, boxReg);
2008      jccb(Assembler::notZero, L_regular_inflated_unlock);
2009      xend();
2010      jmpb(DONE_LABEL);
2011      bind(L_regular_inflated_unlock);
2012    }
2013#endif
2014
2015    // Despite our balanced locking property we still check that m->_owner == Self
2016    // as java routines or native JNI code called by this thread might
2017    // have released the lock.
2018    // Refer to the comments in synchronizer.cpp for how we might encode extra
2019    // state in _succ so we can avoid fetching EntryList|cxq.
2020    //
2021    // I'd like to add more cases in fast_lock() and fast_unlock() --
2022    // such as recursive enter and exit -- but we have to be wary of
2023    // I$ bloat, T$ effects and BP$ effects.
2024    //
2025    // If there's no contention try a 1-0 exit.  That is, exit without
2026    // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
2027    // we detect and recover from the race that the 1-0 exit admits.
2028    //
2029    // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier
2030    // before it STs null into _owner, releasing the lock.  Updates
2031    // to data protected by the critical section must be visible before
2032    // we drop the lock (and thus before any other thread could acquire
2033    // the lock and observe the fields protected by the lock).
2034    // IA32's memory-model is SPO, so STs are ordered with respect to
2035    // each other and there's no need for an explicit barrier (fence).
2036    // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
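    //
    // In pseudo-code, the 1-0 exit with a succession check is roughly
    // (a sketch, not the exact emitted sequence):
    //   if (m->_recursions != 0) goto slow_path;
    //   if ((m->_cxq | m->_EntryList) == 0) { m->_owner = NULL; return success; }
    //   m->_owner = NULL;                 // 1-0 exit: plain store, no CAS
    //   <fence>;                          // ST _owner; MEMBAR; LD _succ (Dekker)
    //   if (m->_succ != NULL) return success;             // successor will make progress
    //   if (CAS(&m->_owner, NULL, Self) != NULL) return success;  // someone else got it
    //   goto slow_path;                   // we re-acquired; hand off to the runtime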
2037#ifndef _LP64
2038    get_thread (boxReg);
2039    if ((EmitSync & 4096) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
2040      // prefetchw [ebx + Offset(_owner)-2]
2041      prefetchw(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2042    }
2043
2044    // Note that we could employ various encoding schemes to reduce
2045    // the number of loads below (currently 4) to just 2 or 3.
2046    // Refer to the comments in synchronizer.cpp.
2047    // In practice the chain of fetches doesn't seem to impact performance, however.
2048    xorptr(boxReg, boxReg);
2049    if ((EmitSync & 65536) == 0 && (EmitSync & 256)) {
2050       // Attempt to reduce branch density - AMD's branch predictor.
2051       orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
2052       orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
2053       orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
2054       jccb  (Assembler::notZero, DONE_LABEL);
2055       movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
2056       jmpb  (DONE_LABEL);
2057    } else {
2058       orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
2059       jccb  (Assembler::notZero, DONE_LABEL);
2060       movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
2061       orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
2062       jccb  (Assembler::notZero, CheckSucc);
2063       movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
2064       jmpb  (DONE_LABEL);
2065    }
2066
2067    // The following code fragment (EmitSync & 65536) improves the performance of
2068    // contended applications and contended synchronization microbenchmarks.
2069    // Unfortunately the emission of the code - even though not executed - causes regressions
2070    // in scimark and jetstream, evidently because of $ effects.  Replacing the code
2071    // with an equal number of never-executed NOPs results in the same regression.
2072    // We leave it off by default.
2073
2074    if ((EmitSync & 65536) != 0) {
2075       Label LSuccess, LGoSlowPath ;
2076
2077       bind  (CheckSucc);
2078
2079       // Optional pre-test ... it's safe to elide this
2080       cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2081       jccb(Assembler::zero, LGoSlowPath);
2082
2083       // We have a classic Dekker-style idiom:
2084       //    ST m->_owner = 0 ; MEMBAR; LD m->_succ
2085       // There are a number of ways to implement the barrier:
2086       // (1) lock:andl &m->_owner, 0
2087       //     is fast, but masm doesn't currently support the "ANDL M,IMM32" form.
2088       //     LOCK: ANDL [ebx+Offset(_Owner)-2], 0
2089       //     Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8
2090       // (2) If supported, an explicit MFENCE is appealing.
2091       //     In older IA32 processors MFENCE is slower than lock:add or xchg
2092       //     particularly if the write-buffer is full, as might be the case
2093       //     if stores closely precede the fence or fence-equivalent instruction.
2094       //     See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
2095       //     as the situation has changed with Nehalem and Shanghai.
2096       // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack
2097       //     The $lines underlying the top-of-stack should be in M-state.
2098       //     The locked add instruction is serializing, of course.
2099       // (4) Use xchg, which is serializing
2100       //     mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works
2101       // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0.
2102       //     The integer condition codes will tell us if succ was 0.
2103       //     Since _succ and _owner should reside in the same $line and
2104       //     we just stored into _owner, it's likely that the $line
2105       //     remains in M-state for the lock:orl.
2106       //
2107       // We currently use (3), although it's likely that switching to (2)
2108       // is correct for the future.
2109
2110       movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
2111       if (os::is_MP()) {
2112         lock(); addptr(Address(rsp, 0), 0);
2113       }
2114       // Ratify _succ remains non-null
2115       cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), 0);
2116       jccb  (Assembler::notZero, LSuccess);
2117
2118       xorptr(boxReg, boxReg);                  // box is really EAX
2119       if (os::is_MP()) { lock(); }
2120       cmpxchgptr(rsp, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2121       // There's no successor so we tried to regrab the lock with the
2122       // placeholder value. If that didn't work, then another thread
2123       // grabbed the lock so we're done (and exit was a success).
2124       jccb  (Assembler::notEqual, LSuccess);
2125       // Since we're low on registers we installed rsp as a placeholder in _owner.
2126       // Now install Self over rsp.  This is safe as we're transitioning from
2127       // non-null to non-null.
2128       get_thread (boxReg);
2129       movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), boxReg);
2130       // Intentional fall-through into LGoSlowPath ...
2131
2132       bind  (LGoSlowPath);
2133       orptr(boxReg, 1);                      // set ICC.ZF=0 to indicate failure
2134       jmpb  (DONE_LABEL);
2135
2136       bind  (LSuccess);
2137       xorptr(boxReg, boxReg);                 // set ICC.ZF=1 to indicate success
2138       jmpb  (DONE_LABEL);
2139    }
2140
2141    bind (Stacked);
2142    // It's not inflated and it's not recursively stack-locked and it's not biased.
2143    // It must be stack-locked.
2144    // Try to reset the header to displaced header.
2145    // The "box" value on the stack is stable, so we can reload
2146    // and be assured we observe the same value as above.
2147    movptr(tmpReg, Address(boxReg, 0));
2148    if (os::is_MP()) {
2149      lock();
2150    }
2151    cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box
2152    // Intentional fall-through into DONE_LABEL
2153
2154    // DONE_LABEL is a hot target - we'd really like to place it at the
2155    // start of a cache line by padding with NOPs.
2156    // See the AMD and Intel software optimization manuals for the
2157    // most efficient "long" NOP encodings.
2158    // Unfortunately none of our alignment mechanisms suffice.
2159    if ((EmitSync & 65536) == 0) {
2160       bind (CheckSucc);
2161    }
2162#else // _LP64
2163    // It's inflated
2164    if (EmitSync & 1024) {
2165      // Emit code to check that _owner == Self
2166      // We could fold the _owner test into subsequent code more efficiently
2167      // than using a stand-alone check, but since _owner checking is off by
2168      // default we don't bother. We also might consider predicating the
2169      // _owner==Self check on Xcheck:jni or running on a debug build.
2170      movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2171      xorptr(boxReg, r15_thread);
2172    } else {
2173      xorptr(boxReg, boxReg);
2174    }
2175    orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
2176    jccb  (Assembler::notZero, DONE_LABEL);
2177    movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
2178    orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
2179    jccb  (Assembler::notZero, CheckSucc);
2180    movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
2181    jmpb  (DONE_LABEL);
2182
2183    if ((EmitSync & 65536) == 0) {
2184      // Try to avoid passing control into the slow_path ...
2185      Label LSuccess, LGoSlowPath ;
2186      bind  (CheckSucc);
2187
2188      // The following optional optimization can be elided if necessary
2189      // Effectively: if (succ == null) goto SlowPath
2190      // The code reduces the window for a race, however,
2191      // and thus benefits performance.
2192      cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2193      jccb  (Assembler::zero, LGoSlowPath);
2194
2195      xorptr(boxReg, boxReg);
2196      if ((EmitSync & 16) && os::is_MP()) {
2197        xchgptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2198      } else {
2199        movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
2200        if (os::is_MP()) {
2201          // Memory barrier/fence
2202          // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
2203          // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
2204          // This is faster on Nehalem and AMD Shanghai/Barcelona.
2205          // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
2206          // We might also restructure (ST Owner=0;barrier;LD _Succ) to
2207          // (mov box,0; xchgq box, &m->Owner; LD _succ) .
2208          lock(); addl(Address(rsp, 0), 0);
2209        }
2210      }
2211      cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2212      jccb  (Assembler::notZero, LSuccess);
2213
2214      // Rare inopportune interleaving - race.
2215      // The successor vanished in the small window above.
2216      // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
2217      // We need to ensure progress and succession.
2218      // Try to reacquire the lock.
2219      // If that fails then the new owner is responsible for succession and this
2220      // thread needs to take no further action and can exit via the fast path (success).
2221      // If the re-acquire succeeds then pass control into the slow path.
2222      // As implemented, this latter mode is horrible because we generate more
2223      // coherence traffic on the lock *and* artificially extend the critical section
2224      // length by virtue of passing control into the slow path.
2225
2226      // box is really RAX -- the following CMPXCHG depends on that binding
2227      // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
2228      if (os::is_MP()) { lock(); }
2229      cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2230      // There's no successor so we tried to regrab the lock.
2231      // If that didn't work, then another thread grabbed the
2232      // lock so we're done (and exit was a success).
2233      jccb  (Assembler::notEqual, LSuccess);
2234      // Intentional fall-through into slow-path
2235
2236      bind  (LGoSlowPath);
2237      orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
2238      jmpb  (DONE_LABEL);
2239
2240      bind  (LSuccess);
2241      testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
2242      jmpb  (DONE_LABEL);
2243    }
2244
2245    bind  (Stacked);
2246    movptr(tmpReg, Address (boxReg, 0));      // re-fetch
2247    if (os::is_MP()) { lock(); }
2248    cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box
2249
2250    if (EmitSync & 65536) {
2251       bind (CheckSucc);
2252    }
2253#endif
2254    bind(DONE_LABEL);
2255  }
2256}
2257#endif // COMPILER2
2258
2259void MacroAssembler::c2bool(Register x) {
2260  // implements x == 0 ? 0 : 1
2261  // note: must only look at least-significant byte of x
2262  //       since C-style booleans are stored in one byte
2263  //       only! (was bug)
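  // Roughly, in C terms (a sketch): x = ((x & 0xFF) != 0) ? 1 : 0;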
2264  andl(x, 0xFF);
2265  setb(Assembler::notZero, x);
2266}
2267
2268// Wouldn't need if AddressLiteral version had new name
2269void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
2270  Assembler::call(L, rtype);
2271}
2272
2273void MacroAssembler::call(Register entry) {
2274  Assembler::call(entry);
2275}
2276
2277void MacroAssembler::call(AddressLiteral entry) {
2278  if (reachable(entry)) {
2279    Assembler::call_literal(entry.target(), entry.rspec());
2280  } else {
2281    lea(rscratch1, entry);
2282    Assembler::call(rscratch1);
2283  }
2284}
2285
2286void MacroAssembler::ic_call(address entry, jint method_index) {
2287  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
2288  movptr(rax, (intptr_t)Universe::non_oop_word());
2289  call(AddressLiteral(entry, rh));
2290}
2291
2292// Implementation of call_VM versions
2293
2294void MacroAssembler::call_VM(Register oop_result,
2295                             address entry_point,
2296                             bool check_exceptions) {
2297  Label C, E;
2298  call(C, relocInfo::none);
2299  jmp(E);
2300
2301  bind(C);
2302  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
2303  ret(0);
2304
2305  bind(E);
2306}
2307
2308void MacroAssembler::call_VM(Register oop_result,
2309                             address entry_point,
2310                             Register arg_1,
2311                             bool check_exceptions) {
2312  Label C, E;
2313  call(C, relocInfo::none);
2314  jmp(E);
2315
2316  bind(C);
2317  pass_arg1(this, arg_1);
2318  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
2319  ret(0);
2320
2321  bind(E);
2322}
2323
2324void MacroAssembler::call_VM(Register oop_result,
2325                             address entry_point,
2326                             Register arg_1,
2327                             Register arg_2,
2328                             bool check_exceptions) {
2329  Label C, E;
2330  call(C, relocInfo::none);
2331  jmp(E);
2332
2333  bind(C);
2334
2335  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2336
2337  pass_arg2(this, arg_2);
2338  pass_arg1(this, arg_1);
2339  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
2340  ret(0);
2341
2342  bind(E);
2343}
2344
2345void MacroAssembler::call_VM(Register oop_result,
2346                             address entry_point,
2347                             Register arg_1,
2348                             Register arg_2,
2349                             Register arg_3,
2350                             bool check_exceptions) {
2351  Label C, E;
2352  call(C, relocInfo::none);
2353  jmp(E);
2354
2355  bind(C);
2356
2357  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2358  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2359  pass_arg3(this, arg_3);
2360
2361  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2362  pass_arg2(this, arg_2);
2363
2364  pass_arg1(this, arg_1);
2365  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
2366  ret(0);
2367
2368  bind(E);
2369}
2370
2371void MacroAssembler::call_VM(Register oop_result,
2372                             Register last_java_sp,
2373                             address entry_point,
2374                             int number_of_arguments,
2375                             bool check_exceptions) {
2376  Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
2377  call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
2378}
2379
2380void MacroAssembler::call_VM(Register oop_result,
2381                             Register last_java_sp,
2382                             address entry_point,
2383                             Register arg_1,
2384                             bool check_exceptions) {
2385  pass_arg1(this, arg_1);
2386  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
2387}
2388
2389void MacroAssembler::call_VM(Register oop_result,
2390                             Register last_java_sp,
2391                             address entry_point,
2392                             Register arg_1,
2393                             Register arg_2,
2394                             bool check_exceptions) {
2395
2396  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2397  pass_arg2(this, arg_2);
2398  pass_arg1(this, arg_1);
2399  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
2400}
2401
2402void MacroAssembler::call_VM(Register oop_result,
2403                             Register last_java_sp,
2404                             address entry_point,
2405                             Register arg_1,
2406                             Register arg_2,
2407                             Register arg_3,
2408                             bool check_exceptions) {
2409  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2410  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2411  pass_arg3(this, arg_3);
2412  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2413  pass_arg2(this, arg_2);
2414  pass_arg1(this, arg_1);
2415  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
2416}
2417
2418void MacroAssembler::super_call_VM(Register oop_result,
2419                                   Register last_java_sp,
2420                                   address entry_point,
2421                                   int number_of_arguments,
2422                                   bool check_exceptions) {
2423  Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
2424  MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
2425}
2426
2427void MacroAssembler::super_call_VM(Register oop_result,
2428                                   Register last_java_sp,
2429                                   address entry_point,
2430                                   Register arg_1,
2431                                   bool check_exceptions) {
2432  pass_arg1(this, arg_1);
2433  super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
2434}
2435
2436void MacroAssembler::super_call_VM(Register oop_result,
2437                                   Register last_java_sp,
2438                                   address entry_point,
2439                                   Register arg_1,
2440                                   Register arg_2,
2441                                   bool check_exceptions) {
2442
2443  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2444  pass_arg2(this, arg_2);
2445  pass_arg1(this, arg_1);
2446  super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
2447}
2448
2449void MacroAssembler::super_call_VM(Register oop_result,
2450                                   Register last_java_sp,
2451                                   address entry_point,
2452                                   Register arg_1,
2453                                   Register arg_2,
2454                                   Register arg_3,
2455                                   bool check_exceptions) {
2456  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2457  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2458  pass_arg3(this, arg_3);
2459  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2460  pass_arg2(this, arg_2);
2461  pass_arg1(this, arg_1);
2462  super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
2463}
2464
2465void MacroAssembler::call_VM_base(Register oop_result,
2466                                  Register java_thread,
2467                                  Register last_java_sp,
2468                                  address  entry_point,
2469                                  int      number_of_arguments,
2470                                  bool     check_exceptions) {
2471  // determine java_thread register
2472  if (!java_thread->is_valid()) {
2473#ifdef _LP64
2474    java_thread = r15_thread;
2475#else
2476    java_thread = rdi;
2477    get_thread(java_thread);
2478#endif // LP64
2479  }
2480  // determine last_java_sp register
2481  if (!last_java_sp->is_valid()) {
2482    last_java_sp = rsp;
2483  }
2484  // debugging support
2485  assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
2486  LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
2487#ifdef ASSERT
2488  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
2489  // r12 is the heapbase.
2490  LP64_ONLY(if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");)
2491#endif // ASSERT
2492
2493  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
2494  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
2495
2496  // push java thread (becomes first argument of C function)
2497
2498  NOT_LP64(push(java_thread); number_of_arguments++);
2499  LP64_ONLY(mov(c_rarg0, r15_thread));
2500
2501  // set last Java frame before call
2502  assert(last_java_sp != rbp, "can't use ebp/rbp");
2503
2504  // Only interpreter should have to set fp
2505  set_last_Java_frame(java_thread, last_java_sp, rbp, NULL);
2506
2507  // do the call, remove parameters
2508  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);
2509
2510  // restore the thread (cannot use the pushed argument since arguments
2511  // may be overwritten by C code generated by an optimizing compiler);
2512  // however, we can use the register value directly if it is callee saved.
2513  if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
2514    // rdi & rsi (also r15) are callee saved -> nothing to do
2515#ifdef ASSERT
2516    guarantee(java_thread != rax, "change this code");
2517    push(rax);
2518    { Label L;
2519      get_thread(rax);
2520      cmpptr(java_thread, rax);
2521      jcc(Assembler::equal, L);
2522      STOP("MacroAssembler::call_VM_base: rdi not callee saved?");
2523      bind(L);
2524    }
2525    pop(rax);
2526#endif
2527  } else {
2528    get_thread(java_thread);
2529  }
2530  // reset last Java frame
2531  // Only interpreter should have to clear fp
2532  reset_last_Java_frame(java_thread, true);
2533
2534   // C++ interp handles this in the interpreter
2535  check_and_handle_popframe(java_thread);
2536  check_and_handle_earlyret(java_thread);
2537
2538  if (check_exceptions) {
2539    // check for pending exceptions (java_thread is set upon return)
2540    cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t) NULL_WORD);
2541#ifndef _LP64
2542    jump_cc(Assembler::notEqual,
2543            RuntimeAddress(StubRoutines::forward_exception_entry()));
2544#else
2545    // This used to conditionally jump to forward_exception however it is
2546    // possible if we relocate that the branch will not reach. So we must jump
2547    // around so we can always reach
2548
2549    Label ok;
2550    jcc(Assembler::equal, ok);
2551    jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2552    bind(ok);
2553#endif // LP64
2554  }
2555
2556  // get oop result if there is one and reset the value in the thread
2557  if (oop_result->is_valid()) {
2558    get_vm_result(oop_result, java_thread);
2559  }
2560}
2561
2562void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
2563
2564  // Calculate the value for last_Java_sp
2565  // somewhat subtle. call_VM does an intermediate call
2566  // which places a return address on the stack just under the
2567  // stack pointer as the user finished with it. This allows
2568  // us to retrieve last_Java_pc from last_Java_sp[-1].
2569  // On 32bit we then have to push additional args on the stack to accomplish
2570  // the actual requested call. On 64bit call_VM can only use register args
2571  // so the only extra space is the return address that call_VM created.
2572  // This hopefully explains the calculations here.
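  //
  // A sketch of the 64-bit case (the 32-bit case just adds the pushed args):
  //
  //     higher addresses:   caller's frame ...
  //     rsp + wordSize  ->  (this is the value we pass as last_Java_sp)
  //     rsp             ->  [ return address pushed by the call() in call_VM ]
  //
  // so last_Java_sp[-1] is exactly that return address, usable as last_Java_pc.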
2573
2574#ifdef _LP64
2575  // We've pushed one address, correct last_Java_sp
2576  lea(rax, Address(rsp, wordSize));
2577#else
2578  lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
2579#endif // LP64
2580
2581  call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);
2582
2583}
2584
2585// Use this method when MacroAssembler version of call_VM_leaf_base() should be called from Interpreter.
2586void MacroAssembler::call_VM_leaf0(address entry_point) {
2587  MacroAssembler::call_VM_leaf_base(entry_point, 0);
2588}
2589
2590void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
2591  call_VM_leaf_base(entry_point, number_of_arguments);
2592}
2593
2594void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
2595  pass_arg0(this, arg_0);
2596  call_VM_leaf(entry_point, 1);
2597}
2598
2599void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
2600
2601  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2602  pass_arg1(this, arg_1);
2603  pass_arg0(this, arg_0);
2604  call_VM_leaf(entry_point, 2);
2605}
2606
2607void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
2608  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2609  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2610  pass_arg2(this, arg_2);
2611  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2612  pass_arg1(this, arg_1);
2613  pass_arg0(this, arg_0);
2614  call_VM_leaf(entry_point, 3);
2615}
2616
2617void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
2618  pass_arg0(this, arg_0);
2619  MacroAssembler::call_VM_leaf_base(entry_point, 1);
2620}
2621
2622void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
2623
2624  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2625  pass_arg1(this, arg_1);
2626  pass_arg0(this, arg_0);
2627  MacroAssembler::call_VM_leaf_base(entry_point, 2);
2628}
2629
2630void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
2631  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2632  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2633  pass_arg2(this, arg_2);
2634  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2635  pass_arg1(this, arg_1);
2636  pass_arg0(this, arg_0);
2637  MacroAssembler::call_VM_leaf_base(entry_point, 3);
2638}
2639
2640void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
2641  LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
2642  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2643  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2644  pass_arg3(this, arg_3);
2645  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2646  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2647  pass_arg2(this, arg_2);
2648  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2649  pass_arg1(this, arg_1);
2650  pass_arg0(this, arg_0);
2651  MacroAssembler::call_VM_leaf_base(entry_point, 4);
2652}
2653
2654void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
2655  movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
2656  movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD);
2657  verify_oop(oop_result, "broken oop in call_VM_base");
2658}
2659
2660void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
2661  movptr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
2662  movptr(Address(java_thread, JavaThread::vm_result_2_offset()), NULL_WORD);
2663}
2664
2665void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
2666}
2667
2668void MacroAssembler::check_and_handle_popframe(Register java_thread) {
2669}
2670
2671void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm) {
2672  if (reachable(src1)) {
2673    cmpl(as_Address(src1), imm);
2674  } else {
2675    lea(rscratch1, src1);
2676    cmpl(Address(rscratch1, 0), imm);
2677  }
2678}
2679
2680void MacroAssembler::cmp32(Register src1, AddressLiteral src2) {
2681  assert(!src2.is_lval(), "use cmpptr");
2682  if (reachable(src2)) {
2683    cmpl(src1, as_Address(src2));
2684  } else {
2685    lea(rscratch1, src2);
2686    cmpl(src1, Address(rscratch1, 0));
2687  }
2688}
2689
2690void MacroAssembler::cmp32(Register src1, int32_t imm) {
2691  Assembler::cmpl(src1, imm);
2692}
2693
2694void MacroAssembler::cmp32(Register src1, Address src2) {
2695  Assembler::cmpl(src1, src2);
2696}
2697
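// The two three-way FP compares below reduce to roughly the following C
// (a sketch; ucomisd/ucomiss set PF on an unordered result, i.e. NaN):
//   dst = (opr1 > opr2) ? 1 : (opr1 == opr2) ? 0 : -1;
//   if (isnan(opr1) || isnan(opr2)) dst = unordered_is_less ? -1 : 1;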
2698void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
2699  ucomisd(opr1, opr2);
2700
2701  Label L;
2702  if (unordered_is_less) {
2703    movl(dst, -1);
2704    jcc(Assembler::parity, L);
2705    jcc(Assembler::below , L);
2706    movl(dst, 0);
2707    jcc(Assembler::equal , L);
2708    increment(dst);
2709  } else { // unordered is greater
2710    movl(dst, 1);
2711    jcc(Assembler::parity, L);
2712    jcc(Assembler::above , L);
2713    movl(dst, 0);
2714    jcc(Assembler::equal , L);
2715    decrementl(dst);
2716  }
2717  bind(L);
2718}
2719
2720void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
2721  ucomiss(opr1, opr2);
2722
2723  Label L;
2724  if (unordered_is_less) {
2725    movl(dst, -1);
2726    jcc(Assembler::parity, L);
2727    jcc(Assembler::below , L);
2728    movl(dst, 0);
2729    jcc(Assembler::equal , L);
2730    increment(dst);
2731  } else { // unordered is greater
2732    movl(dst, 1);
2733    jcc(Assembler::parity, L);
2734    jcc(Assembler::above , L);
2735    movl(dst, 0);
2736    jcc(Assembler::equal , L);
2737    decrementl(dst);
2738  }
2739  bind(L);
2740}
2741
2742
2743void MacroAssembler::cmp8(AddressLiteral src1, int imm) {
2744  if (reachable(src1)) {
2745    cmpb(as_Address(src1), imm);
2746  } else {
2747    lea(rscratch1, src1);
2748    cmpb(Address(rscratch1, 0), imm);
2749  }
2750}
2751
2752void MacroAssembler::cmpptr(Register src1, AddressLiteral src2) {
2753#ifdef _LP64
2754  if (src2.is_lval()) {
2755    movptr(rscratch1, src2);
2756    Assembler::cmpq(src1, rscratch1);
2757  } else if (reachable(src2)) {
2758    cmpq(src1, as_Address(src2));
2759  } else {
2760    lea(rscratch1, src2);
2761    Assembler::cmpq(src1, Address(rscratch1, 0));
2762  }
2763#else
2764  if (src2.is_lval()) {
2765    cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
2766  } else {
2767    cmpl(src1, as_Address(src2));
2768  }
2769#endif // _LP64
2770}
2771
2772void MacroAssembler::cmpptr(Address src1, AddressLiteral src2) {
2773  assert(src2.is_lval(), "not a mem-mem compare");
2774#ifdef _LP64
2775  // moves src2's literal address
2776  movptr(rscratch1, src2);
2777  Assembler::cmpq(src1, rscratch1);
2778#else
2779  cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
2780#endif // _LP64
2781}
2782
2783void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr) {
2784  if (reachable(adr)) {
2785    if (os::is_MP())
2786      lock();
2787    cmpxchgptr(reg, as_Address(adr));
2788  } else {
2789    lea(rscratch1, adr);
2790    if (os::is_MP())
2791      lock();
2792    cmpxchgptr(reg, Address(rscratch1, 0));
2793  }
2794}
2795
2796void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
2797  LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr));
2798}
2799
2800void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) {
2801  if (reachable(src)) {
2802    Assembler::comisd(dst, as_Address(src));
2803  } else {
2804    lea(rscratch1, src);
2805    Assembler::comisd(dst, Address(rscratch1, 0));
2806  }
2807}
2808
2809void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) {
2810  if (reachable(src)) {
2811    Assembler::comiss(dst, as_Address(src));
2812  } else {
2813    lea(rscratch1, src);
2814    Assembler::comiss(dst, Address(rscratch1, 0));
2815  }
2816}
2817
2818
2819void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr) {
2820  Condition negated_cond = negate_condition(cond);
2821  Label L;
2822  jcc(negated_cond, L);
2823  pushf(); // Preserve flags
2824  atomic_incl(counter_addr);
2825  popf();
2826  bind(L);
2827}
2828
2829int MacroAssembler::corrected_idivl(Register reg) {
2830  // Full implementation of Java idiv and irem; checks for
2831  // special case as described in JVM spec., p.243 & p.271.
2832  // The function returns the (pc) offset of the idivl
2833  // instruction - may be needed for implicit exceptions.
2834  //
2835  //         normal case                           special case
2836  //
  // input : rax: dividend                          min_int
  //         reg: divisor   (may not be rax or rdx) -1
  //
  // output: rax: quotient  (= rax idiv reg)        min_int
  //         rdx: remainder (= rax irem reg)        0
  assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
2843  const int min_int = 0x80000000;
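  // Example: a dividend of min_int with a divisor of -1 cannot produce the
  // mathematically correct quotient 2^31 as an int; the JVM spec requires the
  // result to be min_int with remainder 0, while idivl itself would raise #DE
  // for that operand pair, hence the explicit check below.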
2844  Label normal_case, special_case;
2845
2846  // check for special case
2847  cmpl(rax, min_int);
2848  jcc(Assembler::notEqual, normal_case);
2849  xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
2850  cmpl(reg, -1);
2851  jcc(Assembler::equal, special_case);
2852
2853  // handle normal case
2854  bind(normal_case);
2855  cdql();
2856  int idivl_offset = offset();
2857  idivl(reg);
2858
2859  // normal and special case exit
2860  bind(special_case);
2861
2862  return idivl_offset;
2863}
2864
2865
2866
2867void MacroAssembler::decrementl(Register reg, int value) {
2868  if (value == min_jint) {subl(reg, value) ; return; }
2869  if (value <  0) { incrementl(reg, -value); return; }
2870  if (value == 0) {                        ; return; }
2871  if (value == 1 && UseIncDec) { decl(reg) ; return; }
2872  /* else */      { subl(reg, value)       ; return; }
2873}
2874
2875void MacroAssembler::decrementl(Address dst, int value) {
2876  if (value == min_jint) {subl(dst, value) ; return; }
2877  if (value <  0) { incrementl(dst, -value); return; }
2878  if (value == 0) {                        ; return; }
2879  if (value == 1 && UseIncDec) { decl(dst) ; return; }
2880  /* else */      { subl(dst, value)       ; return; }
2881}
2882
2883void MacroAssembler::division_with_shift (Register reg, int shift_value) {
2884  assert (shift_value > 0, "illegal shift value");
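  // For a negative dividend an arithmetic shift alone rounds toward minus
  // infinity; adding (2^shift_value - 1) first makes the result round toward
  // zero as Java division requires,
  // e.g. -7 >> 1 == -4, but (-7 + 1) >> 1 == -3 == -7 / 2.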
2885  Label _is_positive;
2886  testl (reg, reg);
2887  jcc (Assembler::positive, _is_positive);
  int offset = (1 << shift_value) - 1;
2889
2890  if (offset == 1) {
2891    incrementl(reg);
2892  } else {
2893    addl(reg, offset);
2894  }
2895
2896  bind (_is_positive);
2897  sarl(reg, shift_value);
2898}
2899
2900void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src) {
2901  if (reachable(src)) {
2902    Assembler::divsd(dst, as_Address(src));
2903  } else {
2904    lea(rscratch1, src);
2905    Assembler::divsd(dst, Address(rscratch1, 0));
2906  }
2907}
2908
2909void MacroAssembler::divss(XMMRegister dst, AddressLiteral src) {
2910  if (reachable(src)) {
2911    Assembler::divss(dst, as_Address(src));
2912  } else {
2913    lea(rscratch1, src);
2914    Assembler::divss(dst, Address(rscratch1, 0));
2915  }
2916}
2917
2918// !defined(COMPILER2) is because of stupid core builds
2919#if !defined(_LP64) || defined(COMPILER1) || !defined(COMPILER2) || INCLUDE_JVMCI
2920void MacroAssembler::empty_FPU_stack() {
2921  if (VM_Version::supports_mmx()) {
2922    emms();
2923  } else {
2924    for (int i = 8; i-- > 0; ) ffree(i);
2925  }
2926}
2927#endif // !LP64 || C1 || !C2 || INCLUDE_JVMCI
2928
2929
2930// Defines obj, preserves var_size_in_bytes
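// Bump-pointer allocation in eden: load the current top, compute the new top,
// and CAS it in, retrying on contention; crossing the heap end takes slow_case.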
2931void MacroAssembler::eden_allocate(Register obj,
2932                                   Register var_size_in_bytes,
2933                                   int con_size_in_bytes,
2934                                   Register t1,
2935                                   Label& slow_case) {
2936  assert(obj == rax, "obj must be in rax, for cmpxchg");
2937  assert_different_registers(obj, var_size_in_bytes, t1);
2938  if (!Universe::heap()->supports_inline_contig_alloc()) {
2939    jmp(slow_case);
2940  } else {
2941    Register end = t1;
2942    Label retry;
2943    bind(retry);
2944    ExternalAddress heap_top((address) Universe::heap()->top_addr());
2945    movptr(obj, heap_top);
2946    if (var_size_in_bytes == noreg) {
2947      lea(end, Address(obj, con_size_in_bytes));
2948    } else {
2949      lea(end, Address(obj, var_size_in_bytes, Address::times_1));
2950    }
2951    // if end < obj then we wrapped around => object too long => slow case
2952    cmpptr(end, obj);
2953    jcc(Assembler::below, slow_case);
2954    cmpptr(end, ExternalAddress((address) Universe::heap()->end_addr()));
2955    jcc(Assembler::above, slow_case);
    // Compare obj with the current top addr; if they are still equal, store end
    // (the new top addr) at the location of the top addr pointer. Sets ZF if it
    // was equal, and clears it otherwise. Use lock prefix for atomicity on MPs.
2959    locked_cmpxchgptr(end, heap_top);
2960    jcc(Assembler::notEqual, retry);
2961  }
2962}
2963
2964void MacroAssembler::enter() {
2965  push(rbp);
2966  mov(rbp, rsp);
2967}
2968
2969// A 5 byte nop that is safe for patching (see patch_verified_entry)
2970void MacroAssembler::fat_nop() {
2971  if (UseAddressNop) {
2972    addr_nop_5();
2973  } else {
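    // Four ignored segment-override prefixes followed by a one-byte nop: a
    // single 5-byte instruction, matching the size of addr_nop_5 above.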
2974    emit_int8(0x26); // es:
2975    emit_int8(0x2e); // cs:
2976    emit_int8(0x64); // fs:
2977    emit_int8(0x65); // gs:
2978    emit_int8((unsigned char)0x90);
2979  }
2980}
2981
2982void MacroAssembler::fcmp(Register tmp) {
2983  fcmp(tmp, 1, true, true);
2984}
2985
2986void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) {
2987  assert(!pop_right || pop_left, "usage error");
2988  if (VM_Version::supports_cmov()) {
2989    assert(tmp == noreg, "unneeded temp");
2990    if (pop_left) {
2991      fucomip(index);
2992    } else {
2993      fucomi(index);
2994    }
2995    if (pop_right) {
2996      fpop();
2997    }
2998  } else {
2999    assert(tmp != noreg, "need temp");
3000    if (pop_left) {
3001      if (pop_right) {
3002        fcompp();
3003      } else {
3004        fcomp(index);
3005      }
3006    } else {
3007      fcom(index);
3008    }
    // convert FPU condition into eflags condition via rax
3010    save_rax(tmp);
3011    fwait(); fnstsw_ax();
3012    sahf();
3013    restore_rax(tmp);
3014  }
3015  // condition codes set as follows:
3016  //
3017  // CF (corresponds to C0) if x < y
3018  // PF (corresponds to C2) if unordered
3019  // ZF (corresponds to C3) if x = y
3020}
3021
3022void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) {
3023  fcmp2int(dst, unordered_is_less, 1, true, true);
3024}
3025
3026void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) {
3027  fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right);
3028  Label L;
3029  if (unordered_is_less) {
3030    movl(dst, -1);
3031    jcc(Assembler::parity, L);
3032    jcc(Assembler::below , L);
3033    movl(dst, 0);
3034    jcc(Assembler::equal , L);
3035    increment(dst);
3036  } else { // unordered is greater
3037    movl(dst, 1);
3038    jcc(Assembler::parity, L);
3039    jcc(Assembler::above , L);
3040    movl(dst, 0);
3041    jcc(Assembler::equal , L);
3042    decrementl(dst);
3043  }
3044  bind(L);
3045}
3046
3047void MacroAssembler::fld_d(AddressLiteral src) {
3048  fld_d(as_Address(src));
3049}
3050
3051void MacroAssembler::fld_s(AddressLiteral src) {
3052  fld_s(as_Address(src));
3053}
3054
3055void MacroAssembler::fld_x(AddressLiteral src) {
3056  Assembler::fld_x(as_Address(src));
3057}
3058
3059void MacroAssembler::fldcw(AddressLiteral src) {
3060  Assembler::fldcw(as_Address(src));
3061}
3062
3063void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src) {
3064  if (reachable(src)) {
3065    Assembler::mulpd(dst, as_Address(src));
3066  } else {
3067    lea(rscratch1, src);
3068    Assembler::mulpd(dst, Address(rscratch1, 0));
3069  }
3070}
3071
3072void MacroAssembler::increase_precision() {
3073  subptr(rsp, BytesPerWord);
3074  fnstcw(Address(rsp, 0));
3075  movl(rax, Address(rsp, 0));
3076  orl(rax, 0x300);
3077  push(rax);
3078  fldcw(Address(rsp, 0));
3079  pop(rax);
3080}
3081
3082void MacroAssembler::restore_precision() {
3083  fldcw(Address(rsp, 0));
3084  addptr(rsp, BytesPerWord);
3085}
3086
3087void MacroAssembler::fpop() {
3088  ffree();
3089  fincstp();
3090}
3091
3092void MacroAssembler::load_float(Address src) {
3093  if (UseSSE >= 1) {
3094    movflt(xmm0, src);
3095  } else {
3096    LP64_ONLY(ShouldNotReachHere());
3097    NOT_LP64(fld_s(src));
3098  }
3099}
3100
3101void MacroAssembler::store_float(Address dst) {
3102  if (UseSSE >= 1) {
3103    movflt(dst, xmm0);
3104  } else {
3105    LP64_ONLY(ShouldNotReachHere());
3106    NOT_LP64(fstp_s(dst));
3107  }
3108}
3109
3110void MacroAssembler::load_double(Address src) {
3111  if (UseSSE >= 2) {
3112    movdbl(xmm0, src);
3113  } else {
3114    LP64_ONLY(ShouldNotReachHere());
3115    NOT_LP64(fld_d(src));
3116  }
3117}
3118
3119void MacroAssembler::store_double(Address dst) {
3120  if (UseSSE >= 2) {
3121    movdbl(dst, xmm0);
3122  } else {
3123    LP64_ONLY(ShouldNotReachHere());
3124    NOT_LP64(fstp_d(dst));
3125  }
3126}
3127
3128void MacroAssembler::fremr(Register tmp) {
3129  save_rax(tmp);
3130  { Label L;
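    // fprem computes a partial remainder and sets the C2 status flag while
    // further reduction is needed; loop until C2 (bit 0x400 of the FPU status
    // word) is clear.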
3131    bind(L);
3132    fprem();
3133    fwait(); fnstsw_ax();
3134#ifdef _LP64
3135    testl(rax, 0x400);
3136    jcc(Assembler::notEqual, L);
3137#else
3138    sahf();
3139    jcc(Assembler::parity, L);
3140#endif // _LP64
3141  }
3142  restore_rax(tmp);
3143  // Result is in ST0.
3144  // Note: fxch & fpop to get rid of ST1
3145  // (otherwise FPU stack could overflow eventually)
3146  fxch(1);
3147  fpop();
3148}
3149
3150// dst = c = a * b + c
3151void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
3152  Assembler::vfmadd231sd(c, a, b);
3153  if (dst != c) {
3154    movdbl(dst, c);
3155  }
3156}
3157
3158// dst = c = a * b + c
3159void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
3160  Assembler::vfmadd231ss(c, a, b);
3161  if (dst != c) {
3162    movflt(dst, c);
3163  }
3164}
3165
3166
3167
3168
3169void MacroAssembler::incrementl(AddressLiteral dst) {
3170  if (reachable(dst)) {
3171    incrementl(as_Address(dst));
3172  } else {
3173    lea(rscratch1, dst);
3174    incrementl(Address(rscratch1, 0));
3175  }
3176}
3177
3178void MacroAssembler::incrementl(ArrayAddress dst) {
3179  incrementl(as_Address(dst));
3180}
3181
3182void MacroAssembler::incrementl(Register reg, int value) {
3183  if (value == min_jint) {addl(reg, value) ; return; }
3184  if (value <  0) { decrementl(reg, -value); return; }
3185  if (value == 0) {                        ; return; }
3186  if (value == 1 && UseIncDec) { incl(reg) ; return; }
3187  /* else */      { addl(reg, value)       ; return; }
3188}
3189
3190void MacroAssembler::incrementl(Address dst, int value) {
3191  if (value == min_jint) {addl(dst, value) ; return; }
3192  if (value <  0) { decrementl(dst, -value); return; }
3193  if (value == 0) {                        ; return; }
3194  if (value == 1 && UseIncDec) { incl(dst) ; return; }
3195  /* else */      { addl(dst, value)       ; return; }
3196}
3197
3198void MacroAssembler::jump(AddressLiteral dst) {
3199  if (reachable(dst)) {
3200    jmp_literal(dst.target(), dst.rspec());
3201  } else {
3202    lea(rscratch1, dst);
3203    jmp(rscratch1);
3204  }
3205}
3206
3207void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst) {
3208  if (reachable(dst)) {
3209    InstructionMark im(this);
3210    relocate(dst.reloc());
3211    const int short_size = 2;
3212    const int long_size = 6;
3213    int offs = (intptr_t)dst.target() - ((intptr_t)pc());
3214    if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
3215      // 0111 tttn #8-bit disp
3216      emit_int8(0x70 | cc);
3217      emit_int8((offs - short_size) & 0xFF);
3218    } else {
3219      // 0000 1111 1000 tttn #32-bit disp
3220      emit_int8(0x0F);
3221      emit_int8((unsigned char)(0x80 | cc));
3222      emit_int32(offs - long_size);
3223    }
3224  } else {
3225#ifdef ASSERT
3226    warning("reversing conditional branch");
3227#endif /* ASSERT */
3228    Label skip;
3229    jccb(reverse[cc], skip);
3230    lea(rscratch1, dst);
3231    Assembler::jmp(rscratch1);
3232    bind(skip);
3233  }
3234}
3235
3236void MacroAssembler::ldmxcsr(AddressLiteral src) {
3237  if (reachable(src)) {
3238    Assembler::ldmxcsr(as_Address(src));
3239  } else {
3240    lea(rscratch1, src);
3241    Assembler::ldmxcsr(Address(rscratch1, 0));
3242  }
3243}
3244
3245int MacroAssembler::load_signed_byte(Register dst, Address src) {
3246  int off;
3247  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3248    off = offset();
3249    movsbl(dst, src); // movsxb
3250  } else {
3251    off = load_unsigned_byte(dst, src);
3252    shll(dst, 24);
3253    sarl(dst, 24);
3254  }
3255  return off;
3256}
3257
3258// Note: load_signed_short used to be called load_signed_word.
3259// Although the 'w' in x86 opcodes refers to the term "word" in the assembler
3260// manual, which means 16 bits, that usage is found nowhere in HotSpot code.
3261// The term "word" in HotSpot means a 32- or 64-bit machine word.
3262int MacroAssembler::load_signed_short(Register dst, Address src) {
3263  int off;
3264  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
    // This is dubious to me since it seems safe to do a signed 16 => 64 bit
    // version, but this is what 64-bit has always done. It seems to imply
    // that callers only use the low 32 bits of the result.
3268    off = offset();
3269    movswl(dst, src); // movsxw
3270  } else {
3271    off = load_unsigned_short(dst, src);
3272    shll(dst, 16);
3273    sarl(dst, 16);
3274  }
3275  return off;
3276}
3277
3278int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
3279  // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
3280  // and "3.9 Partial Register Penalties", p. 22).
3281  int off;
3282  if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) {
3283    off = offset();
3284    movzbl(dst, src); // movzxb
3285  } else {
3286    xorl(dst, dst);
3287    off = offset();
3288    movb(dst, src);
3289  }
3290  return off;
3291}
3292
3293// Note: load_unsigned_short used to be called load_unsigned_word.
3294int MacroAssembler::load_unsigned_short(Register dst, Address src) {
3295  // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
3296  // and "3.9 Partial Register Penalties", p. 22).
3297  int off;
3298  if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
3299    off = offset();
3300    movzwl(dst, src); // movzxw
3301  } else {
3302    xorl(dst, dst);
3303    off = offset();
3304    movw(dst, src);
3305  }
3306  return off;
3307}
3308
3309void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
3310  switch (size_in_bytes) {
3311#ifndef _LP64
3312  case  8:
3313    assert(dst2 != noreg, "second dest register required");
3314    movl(dst,  src);
3315    movl(dst2, src.plus_disp(BytesPerInt));
3316    break;
3317#else
3318  case  8:  movq(dst, src); break;
3319#endif
3320  case  4:  movl(dst, src); break;
3321  case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
3322  case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
3323  default:  ShouldNotReachHere();
3324  }
3325}
3326
3327void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
3328  switch (size_in_bytes) {
3329#ifndef _LP64
3330  case  8:
3331    assert(src2 != noreg, "second source register required");
3332    movl(dst,                        src);
3333    movl(dst.plus_disp(BytesPerInt), src2);
3334    break;
3335#else
3336  case  8:  movq(dst, src); break;
3337#endif
3338  case  4:  movl(dst, src); break;
3339  case  2:  movw(dst, src); break;
3340  case  1:  movb(dst, src); break;
3341  default:  ShouldNotReachHere();
3342  }
3343}
3344
3345void MacroAssembler::mov32(AddressLiteral dst, Register src) {
3346  if (reachable(dst)) {
3347    movl(as_Address(dst), src);
3348  } else {
3349    lea(rscratch1, dst);
3350    movl(Address(rscratch1, 0), src);
3351  }
3352}
3353
3354void MacroAssembler::mov32(Register dst, AddressLiteral src) {
3355  if (reachable(src)) {
3356    movl(dst, as_Address(src));
3357  } else {
3358    lea(rscratch1, src);
3359    movl(dst, Address(rscratch1, 0));
3360  }
3361}
3362
3363// C++ bool manipulation
3364
3365void MacroAssembler::movbool(Register dst, Address src) {
3366  if(sizeof(bool) == 1)
3367    movb(dst, src);
3368  else if(sizeof(bool) == 2)
3369    movw(dst, src);
3370  else if(sizeof(bool) == 4)
3371    movl(dst, src);
3372  else
3373    // unsupported
3374    ShouldNotReachHere();
3375}
3376
3377void MacroAssembler::movbool(Address dst, bool boolconst) {
3378  if(sizeof(bool) == 1)
3379    movb(dst, (int) boolconst);
3380  else if(sizeof(bool) == 2)
3381    movw(dst, (int) boolconst);
3382  else if(sizeof(bool) == 4)
3383    movl(dst, (int) boolconst);
3384  else
3385    // unsupported
3386    ShouldNotReachHere();
3387}
3388
3389void MacroAssembler::movbool(Address dst, Register src) {
3390  if(sizeof(bool) == 1)
3391    movb(dst, src);
3392  else if(sizeof(bool) == 2)
3393    movw(dst, src);
3394  else if(sizeof(bool) == 4)
3395    movl(dst, src);
3396  else
3397    // unsupported
3398    ShouldNotReachHere();
3399}
3400
3401void MacroAssembler::movbyte(ArrayAddress dst, int src) {
3402  movb(as_Address(dst), src);
3403}
3404
3405void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src) {
3406  if (reachable(src)) {
3407    movdl(dst, as_Address(src));
3408  } else {
3409    lea(rscratch1, src);
3410    movdl(dst, Address(rscratch1, 0));
3411  }
3412}
3413
3414void MacroAssembler::movq(XMMRegister dst, AddressLiteral src) {
3415  if (reachable(src)) {
3416    movq(dst, as_Address(src));
3417  } else {
3418    lea(rscratch1, src);
3419    movq(dst, Address(rscratch1, 0));
3420  }
3421}
3422
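// Loads the mask (1 << src) - 1 into the opmask register k1 and leaves src in dst.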
3423void MacroAssembler::setvectmask(Register dst, Register src) {
3424  Assembler::movl(dst, 1);
3425  Assembler::shlxl(dst, dst, src);
3426  Assembler::decl(dst);
3427  Assembler::kmovdl(k1, dst);
3428  Assembler::movl(dst, src);
3429}
3430
3431void MacroAssembler::restorevectmask() {
3432  Assembler::knotwl(k1, k0);
3433}
3434
3435void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) {
3436  if (reachable(src)) {
3437    if (UseXmmLoadAndClearUpper) {
3438      movsd (dst, as_Address(src));
3439    } else {
3440      movlpd(dst, as_Address(src));
3441    }
3442  } else {
3443    lea(rscratch1, src);
3444    if (UseXmmLoadAndClearUpper) {
3445      movsd (dst, Address(rscratch1, 0));
3446    } else {
3447      movlpd(dst, Address(rscratch1, 0));
3448    }
3449  }
3450}
3451
3452void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src) {
3453  if (reachable(src)) {
3454    movss(dst, as_Address(src));
3455  } else {
3456    lea(rscratch1, src);
3457    movss(dst, Address(rscratch1, 0));
3458  }
3459}
3460
3461void MacroAssembler::movptr(Register dst, Register src) {
3462  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3463}
3464
3465void MacroAssembler::movptr(Register dst, Address src) {
3466  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3467}
3468
3469// src should NEVER be a real pointer. Use AddressLiteral for true pointers
3470void MacroAssembler::movptr(Register dst, intptr_t src) {
3471  LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
3472}
3473
3474void MacroAssembler::movptr(Address dst, Register src) {
3475  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3476}
3477
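// Without AVX512VL the plain (legacy-encoded) movdqu cannot address xmm16-xmm31,
// so the 128-bit payload is moved via vextractf32x4/vinsertf32x4, and
// register-to-register copies fall back to a full-width evmovdqul.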
3478void MacroAssembler::movdqu(Address dst, XMMRegister src) {
3479  if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (src->encoding() > 15)) {
3480    Assembler::vextractf32x4(dst, src, 0);
3481  } else {
3482    Assembler::movdqu(dst, src);
3483  }
3484}
3485
3486void MacroAssembler::movdqu(XMMRegister dst, Address src) {
3487  if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (dst->encoding() > 15)) {
3488    Assembler::vinsertf32x4(dst, dst, src, 0);
3489  } else {
3490    Assembler::movdqu(dst, src);
3491  }
3492}
3493
3494void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
3495  if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3496    Assembler::evmovdqul(dst, src, Assembler::AVX_512bit);
3497  } else {
3498    Assembler::movdqu(dst, src);
3499  }
3500}
3501
3502void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register scratchReg) {
3503  if (reachable(src)) {
3504    movdqu(dst, as_Address(src));
3505  } else {
3506    lea(scratchReg, src);
3507    movdqu(dst, Address(scratchReg, 0));
3508  }
3509}
3510
3511void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
3512  if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (src->encoding() > 15)) {
3513    vextractf64x4_low(dst, src);
3514  } else {
3515    Assembler::vmovdqu(dst, src);
3516  }
3517}
3518
3519void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
3520  if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (dst->encoding() > 15)) {
3521    vinsertf64x4_low(dst, src);
3522  } else {
3523    Assembler::vmovdqu(dst, src);
3524  }
3525}
3526
3527void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
3528  if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3529    Assembler::evmovdqul(dst, src, Assembler::AVX_512bit);
  } else {
3532    Assembler::vmovdqu(dst, src);
3533  }
3534}
3535
3536void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src) {
3537  if (reachable(src)) {
3538    vmovdqu(dst, as_Address(src));
  } else {
3541    lea(rscratch1, src);
3542    vmovdqu(dst, Address(rscratch1, 0));
3543  }
3544}
3545
3546void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) {
3547  if (reachable(src)) {
3548    Assembler::movdqa(dst, as_Address(src));
3549  } else {
3550    lea(rscratch1, src);
3551    Assembler::movdqa(dst, Address(rscratch1, 0));
3552  }
3553}
3554
3555void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
3556  if (reachable(src)) {
3557    Assembler::movsd(dst, as_Address(src));
3558  } else {
3559    lea(rscratch1, src);
3560    Assembler::movsd(dst, Address(rscratch1, 0));
3561  }
3562}
3563
3564void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
3565  if (reachable(src)) {
3566    Assembler::movss(dst, as_Address(src));
3567  } else {
3568    lea(rscratch1, src);
3569    Assembler::movss(dst, Address(rscratch1, 0));
3570  }
3571}
3572
3573void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) {
3574  if (reachable(src)) {
3575    Assembler::mulsd(dst, as_Address(src));
3576  } else {
3577    lea(rscratch1, src);
3578    Assembler::mulsd(dst, Address(rscratch1, 0));
3579  }
3580}
3581
3582void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src) {
3583  if (reachable(src)) {
3584    Assembler::mulss(dst, as_Address(src));
3585  } else {
3586    lea(rscratch1, src);
3587    Assembler::mulss(dst, Address(rscratch1, 0));
3588  }
3589}
3590
3591void MacroAssembler::null_check(Register reg, int offset) {
3592  if (needs_explicit_null_check(offset)) {
3593    // provoke OS NULL exception if reg = NULL by
3594    // accessing M[reg] w/o changing any (non-CC) registers
3595    // NOTE: cmpl is plenty here to provoke a segv
3596    cmpptr(rax, Address(reg, 0));
3597    // Note: should probably use testl(rax, Address(reg, 0));
3598    //       may be shorter code (however, this version of
3599    //       testl needs to be implemented first)
3600  } else {
3601    // nothing to do, (later) access of M[reg + offset]
3602    // will provoke OS NULL exception if reg = NULL
3603  }
3604}
3605
3606void MacroAssembler::os_breakpoint() {
  // instead of directly emitting a breakpoint, call os::breakpoint for better debuggability
3608  // (e.g., MSVC can't call ps() otherwise)
3609  call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
3610}
3611
3612#ifdef _LP64
3613#define XSTATE_BV 0x200
3614#endif
3615
3616void MacroAssembler::pop_CPU_state() {
3617  pop_FPU_state();
3618  pop_IU_state();
3619}
3620
3621void MacroAssembler::pop_FPU_state() {
3622#ifndef _LP64
3623  frstor(Address(rsp, 0));
3624#else
3625  fxrstor(Address(rsp, 0));
3626#endif
3627  addptr(rsp, FPUStateSizeInWords * wordSize);
3628}
3629
3630void MacroAssembler::pop_IU_state() {
3631  popa();
3632  LP64_ONLY(addq(rsp, 8));
3633  popf();
3634}
3635
3636// Save Integer and Float state
3637// Warning: Stack must be 16 byte aligned (64bit)
3638void MacroAssembler::push_CPU_state() {
3639  push_IU_state();
3640  push_FPU_state();
3641}
3642
3643void MacroAssembler::push_FPU_state() {
3644  subptr(rsp, FPUStateSizeInWords * wordSize);
3645#ifndef _LP64
3646  fnsave(Address(rsp, 0));
3647  fwait();
3648#else
3649  fxsave(Address(rsp, 0));
3650#endif // LP64
3651}
3652
3653void MacroAssembler::push_IU_state() {
3654  // Push flags first because pusha kills them
3655  pushf();
3656  // Make sure rsp stays 16-byte aligned
3657  LP64_ONLY(subq(rsp, 8));
3658  pusha();
3659}
3660
void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp) {
  // determine java_thread register
3662  if (!java_thread->is_valid()) {
3663    java_thread = rdi;
3664    get_thread(java_thread);
3665  }
3666  // we must set sp to zero to clear frame
3667  movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
3668  if (clear_fp) {
3669    movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
3670  }
3671
3672  // Always clear the pc because it could have been set by make_walkable()
3673  movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
3674
3675}
3676
3677void MacroAssembler::restore_rax(Register tmp) {
3678  if (tmp == noreg) pop(rax);
3679  else if (tmp != rax) mov(rax, tmp);
3680}
3681
3682void MacroAssembler::round_to(Register reg, int modulus) {
3683  addptr(reg, modulus - 1);
3684  andptr(reg, -modulus);
3685}
3686
3687void MacroAssembler::save_rax(Register tmp) {
3688  if (tmp == noreg) push(rax);
3689  else if (tmp != rax) mov(tmp, rax);
3690}
3691
// Write the serialization page so the VM thread can do a pseudo remote membar.
// We use the current thread pointer to calculate a thread-specific
// offset to write to within the page. This minimizes bus traffic
3695// due to cache line collision.
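// For example, assuming a 4K serialization page, the mask (vm_page_size - sizeof(int))
// clears the low two bits, so different threads tend to hit distinct int-aligned
// slots within the page.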
3696void MacroAssembler::serialize_memory(Register thread, Register tmp) {
3697  movl(tmp, thread);
3698  shrl(tmp, os::get_serialize_page_shift_count());
3699  andl(tmp, (os::vm_page_size() - sizeof(int)));
3700
3701  Address index(noreg, tmp, Address::times_1);
3702  ExternalAddress page(os::get_memory_serialize_page());
3703
3704  // Size of store must match masking code above
3705  movl(as_Address(ArrayAddress(page, index)), tmp);
3706}
3707
3708// Calls to C land
3709//
// When entering C land, the rbp and rsp of the last Java frame have to be recorded
3711// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
3712// has to be reset to 0. This is required to allow proper stack traversal.
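// Typical usage (sketch): set_last_Java_frame(...) right before the transition
// into C land, reset_last_Java_frame(...) on the way back, so stack walkers
// always see a consistent anchor.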
3713void MacroAssembler::set_last_Java_frame(Register java_thread,
3714                                         Register last_java_sp,
3715                                         Register last_java_fp,
3716                                         address  last_java_pc) {
3717  // determine java_thread register
3718  if (!java_thread->is_valid()) {
3719    java_thread = rdi;
3720    get_thread(java_thread);
3721  }
3722  // determine last_java_sp register
3723  if (!last_java_sp->is_valid()) {
3724    last_java_sp = rsp;
3725  }
3726
3727  // last_java_fp is optional
3728
3729  if (last_java_fp->is_valid()) {
3730    movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
3731  }
3732
3733  // last_java_pc is optional
3734
3735  if (last_java_pc != NULL) {
3736    lea(Address(java_thread,
3737                 JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()),
3738        InternalAddress(last_java_pc));
3739
3740  }
3741  movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
3742}
3743
3744void MacroAssembler::shlptr(Register dst, int imm8) {
3745  LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8));
3746}
3747
3748void MacroAssembler::shrptr(Register dst, int imm8) {
3749  LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8));
3750}
3751
3752void MacroAssembler::sign_extend_byte(Register reg) {
3753  if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) {
3754    movsbl(reg, reg); // movsxb
3755  } else {
3756    shll(reg, 24);
3757    sarl(reg, 24);
3758  }
3759}
3760
3761void MacroAssembler::sign_extend_short(Register reg) {
3762  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3763    movswl(reg, reg); // movsxw
3764  } else {
3765    shll(reg, 16);
3766    sarl(reg, 16);
3767  }
3768}
3769
3770void MacroAssembler::testl(Register dst, AddressLiteral src) {
3771  assert(reachable(src), "Address should be reachable");
3772  testl(dst, as_Address(src));
3773}
3774
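// The helpers below work around missing AVX512VL/BW support: the legacy-encoded
// SSE forms of these instructions cannot address xmm16-xmm31, so operands living
// in the upper register bank are routed through xmm0/xmm1, which are spilled to
// and restored from the stack around the operation.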
3775void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
3776  int dst_enc = dst->encoding();
3777  int src_enc = src->encoding();
3778  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
3779    Assembler::pcmpeqb(dst, src);
3780  } else if ((dst_enc < 16) && (src_enc < 16)) {
3781    Assembler::pcmpeqb(dst, src);
3782  } else if (src_enc < 16) {
3783    subptr(rsp, 64);
3784    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
3785    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
3786    Assembler::pcmpeqb(xmm0, src);
3787    movdqu(dst, xmm0);
3788    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
3789    addptr(rsp, 64);
3790  } else if (dst_enc < 16) {
3791    subptr(rsp, 64);
3792    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
3793    evmovdqul(xmm0, src, Assembler::AVX_512bit);
3794    Assembler::pcmpeqb(dst, xmm0);
3795    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
3796    addptr(rsp, 64);
3797  } else {
3798    subptr(rsp, 64);
3799    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
3800    subptr(rsp, 64);
3801    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
3802    movdqu(xmm0, src);
3803    movdqu(xmm1, dst);
3804    Assembler::pcmpeqb(xmm1, xmm0);
3805    movdqu(dst, xmm1);
3806    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
3807    addptr(rsp, 64);
3808    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
3809    addptr(rsp, 64);
3810  }
3811}
3812
3813void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
3814  int dst_enc = dst->encoding();
3815  int src_enc = src->encoding();
3816  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
3817    Assembler::pcmpeqw(dst, src);
3818  } else if ((dst_enc < 16) && (src_enc < 16)) {
3819    Assembler::pcmpeqw(dst, src);
3820  } else if (src_enc < 16) {
3821    subptr(rsp, 64);
3822    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
3823    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
3824    Assembler::pcmpeqw(xmm0, src);
3825    movdqu(dst, xmm0);
3826    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
3827    addptr(rsp, 64);
3828  } else if (dst_enc < 16) {
3829    subptr(rsp, 64);
3830    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
3831    evmovdqul(xmm0, src, Assembler::AVX_512bit);
3832    Assembler::pcmpeqw(dst, xmm0);
3833    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
3834    addptr(rsp, 64);
3835  } else {
3836    subptr(rsp, 64);
3837    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
3838    subptr(rsp, 64);
3839    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
3840    movdqu(xmm0, src);
3841    movdqu(xmm1, dst);
3842    Assembler::pcmpeqw(xmm1, xmm0);
3843    movdqu(dst, xmm1);
3844    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
3845    addptr(rsp, 64);
3846    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
3847    addptr(rsp, 64);
3848  }
3849}
3850
3851void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
3852  int dst_enc = dst->encoding();
3853  if (dst_enc < 16) {
3854    Assembler::pcmpestri(dst, src, imm8);
3855  } else {
3856    subptr(rsp, 64);
3857    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
3858    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
3859    Assembler::pcmpestri(xmm0, src, imm8);
3860    movdqu(dst, xmm0);
3861    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
3862    addptr(rsp, 64);
3863  }
3864}
3865
3866void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
3867  int dst_enc = dst->encoding();
3868  int src_enc = src->encoding();
3869  if ((dst_enc < 16) && (src_enc < 16)) {
3870    Assembler::pcmpestri(dst, src, imm8);
3871  } else if (src_enc < 16) {
3872    subptr(rsp, 64);
3873    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
3874    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
3875    Assembler::pcmpestri(xmm0, src, imm8);
3876    movdqu(dst, xmm0);
3877    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
3878    addptr(rsp, 64);
3879  } else if (dst_enc < 16) {
3880    subptr(rsp, 64);
3881    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
3882    evmovdqul(xmm0, src, Assembler::AVX_512bit);
3883    Assembler::pcmpestri(dst, xmm0, imm8);
3884    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
3885    addptr(rsp, 64);
3886  } else {
3887    subptr(rsp, 64);
3888    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
3889    subptr(rsp, 64);
3890    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
3891    movdqu(xmm0, src);
3892    movdqu(xmm1, dst);
3893    Assembler::pcmpestri(xmm1, xmm0, imm8);
3894    movdqu(dst, xmm1);
3895    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
3896    addptr(rsp, 64);
3897    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
3898    addptr(rsp, 64);
3899  }
3900}
3901
3902void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
3903  int dst_enc = dst->encoding();
3904  int src_enc = src->encoding();
3905  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
3906    Assembler::pmovzxbw(dst, src);
3907  } else if ((dst_enc < 16) && (src_enc < 16)) {
3908    Assembler::pmovzxbw(dst, src);
3909  } else if (src_enc < 16) {
3910    subptr(rsp, 64);
3911    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
3912    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
3913    Assembler::pmovzxbw(xmm0, src);
3914    movdqu(dst, xmm0);
3915    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
3916    addptr(rsp, 64);
3917  } else if (dst_enc < 16) {
3918    subptr(rsp, 64);
3919    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
3920    evmovdqul(xmm0, src, Assembler::AVX_512bit);
3921    Assembler::pmovzxbw(dst, xmm0);
3922    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
3923    addptr(rsp, 64);
3924  } else {
3925    subptr(rsp, 64);
3926    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
3927    subptr(rsp, 64);
3928    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
3929    movdqu(xmm0, src);
3930    movdqu(xmm1, dst);
3931    Assembler::pmovzxbw(xmm1, xmm0);
3932    movdqu(dst, xmm1);
3933    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
3934    addptr(rsp, 64);
3935    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
3936    addptr(rsp, 64);
3937  }
3938}
3939
3940void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) {
3941  int dst_enc = dst->encoding();
3942  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
3943    Assembler::pmovzxbw(dst, src);
3944  } else if (dst_enc < 16) {
3945    Assembler::pmovzxbw(dst, src);
3946  } else {
3947    subptr(rsp, 64);
3948    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
3949    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
3950    Assembler::pmovzxbw(xmm0, src);
3951    movdqu(dst, xmm0);
3952    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
3953    addptr(rsp, 64);
3954  }
3955}
3956
3957void MacroAssembler::pmovmskb(Register dst, XMMRegister src) {
3958  int src_enc = src->encoding();
3959  if (src_enc < 16) {
3960    Assembler::pmovmskb(dst, src);
3961  } else {
3962    subptr(rsp, 64);
3963    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
3964    evmovdqul(xmm0, src, Assembler::AVX_512bit);
3965    Assembler::pmovmskb(dst, xmm0);
3966    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
3967    addptr(rsp, 64);
3968  }
3969}
3970
3971void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) {
3972  int dst_enc = dst->encoding();
3973  int src_enc = src->encoding();
3974  if ((dst_enc < 16) && (src_enc < 16)) {
3975    Assembler::ptest(dst, src);
3976  } else if (src_enc < 16) {
3977    subptr(rsp, 64);
3978    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
3979    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
3980    Assembler::ptest(xmm0, src);
3981    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
3982    addptr(rsp, 64);
3983  } else if (dst_enc < 16) {
3984    subptr(rsp, 64);
3985    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
3986    evmovdqul(xmm0, src, Assembler::AVX_512bit);
3987    Assembler::ptest(dst, xmm0);
3988    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
3989    addptr(rsp, 64);
3990  } else {
3991    subptr(rsp, 64);
3992    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
3993    subptr(rsp, 64);
3994    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
3995    movdqu(xmm0, src);
3996    movdqu(xmm1, dst);
3997    Assembler::ptest(xmm1, xmm0);
3998    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
3999    addptr(rsp, 64);
4000    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4001    addptr(rsp, 64);
4002  }
4003}
4004
4005void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
4006  if (reachable(src)) {
4007    Assembler::sqrtsd(dst, as_Address(src));
4008  } else {
4009    lea(rscratch1, src);
4010    Assembler::sqrtsd(dst, Address(rscratch1, 0));
4011  }
4012}
4013
4014void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
4015  if (reachable(src)) {
4016    Assembler::sqrtss(dst, as_Address(src));
4017  } else {
4018    lea(rscratch1, src);
4019    Assembler::sqrtss(dst, Address(rscratch1, 0));
4020  }
4021}
4022
4023void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src) {
4024  if (reachable(src)) {
4025    Assembler::subsd(dst, as_Address(src));
4026  } else {
4027    lea(rscratch1, src);
4028    Assembler::subsd(dst, Address(rscratch1, 0));
4029  }
4030}
4031
4032void MacroAssembler::subss(XMMRegister dst, AddressLiteral src) {
4033  if (reachable(src)) {
4034    Assembler::subss(dst, as_Address(src));
4035  } else {
4036    lea(rscratch1, src);
4037    Assembler::subss(dst, Address(rscratch1, 0));
4038  }
4039}
4040
4041void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
4042  if (reachable(src)) {
4043    Assembler::ucomisd(dst, as_Address(src));
4044  } else {
4045    lea(rscratch1, src);
4046    Assembler::ucomisd(dst, Address(rscratch1, 0));
4047  }
4048}
4049
4050void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
4051  if (reachable(src)) {
4052    Assembler::ucomiss(dst, as_Address(src));
4053  } else {
4054    lea(rscratch1, src);
4055    Assembler::ucomiss(dst, Address(rscratch1, 0));
4056  }
4057}
4058
4059void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src) {
4060  // Used in sign-bit flipping with aligned address.
4061  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
4062  if (reachable(src)) {
4063    Assembler::xorpd(dst, as_Address(src));
4064  } else {
4065    lea(rscratch1, src);
4066    Assembler::xorpd(dst, Address(rscratch1, 0));
4067  }
4068}
4069
4070void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) {
4071  if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
4072    Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
  } else {
4075    Assembler::xorpd(dst, src);
4076  }
4077}
4078
4079void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) {
4080  if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
4081    Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
4082  } else {
4083    Assembler::xorps(dst, src);
4084  }
4085}
4086
4087void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src) {
4088  // Used in sign-bit flipping with aligned address.
4089  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
4090  if (reachable(src)) {
4091    Assembler::xorps(dst, as_Address(src));
4092  } else {
4093    lea(rscratch1, src);
4094    Assembler::xorps(dst, Address(rscratch1, 0));
4095  }
4096}
4097
4098void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) {
4099  // Used in sign-bit flipping with aligned address.
4100  bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
4101  assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
4102  if (reachable(src)) {
4103    Assembler::pshufb(dst, as_Address(src));
4104  } else {
4105    lea(rscratch1, src);
4106    Assembler::pshufb(dst, Address(rscratch1, 0));
4107  }
4108}
4109
// AVX 3-operand instructions
4111
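// As elsewhere in this file, AddressLiteral operands that are not RIP-reachable
// are first materialized into rscratch1 and then addressed indirectly.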
4112void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4113  if (reachable(src)) {
4114    vaddsd(dst, nds, as_Address(src));
4115  } else {
4116    lea(rscratch1, src);
4117    vaddsd(dst, nds, Address(rscratch1, 0));
4118  }
4119}
4120
4121void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4122  if (reachable(src)) {
4123    vaddss(dst, nds, as_Address(src));
4124  } else {
4125    lea(rscratch1, src);
4126    vaddss(dst, nds, Address(rscratch1, 0));
4127  }
4128}
4129
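// Absolute value of a single-precision operand: AND nds with the constant at
// negate_field (typically a mask that clears the sign bit); the branches below
// only exist to route upper-bank (xmm16+) registers through encodable ones.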
4130void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
4131  int dst_enc = dst->encoding();
4132  int nds_enc = nds->encoding();
4133  int src_enc = src->encoding();
4134  if ((dst_enc < 16) && (nds_enc < 16)) {
4135    vandps(dst, nds, negate_field, vector_len);
4136  } else if ((src_enc < 16) && (dst_enc < 16)) {
4137    evmovdqul(src, nds, Assembler::AVX_512bit);
4138    vandps(dst, src, negate_field, vector_len);
4139  } else if (src_enc < 16) {
4140    evmovdqul(src, nds, Assembler::AVX_512bit);
4141    vandps(src, src, negate_field, vector_len);
4142    evmovdqul(dst, src, Assembler::AVX_512bit);
4143  } else if (dst_enc < 16) {
4144    evmovdqul(src, xmm0, Assembler::AVX_512bit);
4145    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4146    vandps(dst, xmm0, negate_field, vector_len);
4147    evmovdqul(xmm0, src, Assembler::AVX_512bit);
4148  } else {
4149    if (src_enc != dst_enc) {
4150      evmovdqul(src, xmm0, Assembler::AVX_512bit);
4151      evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4152      vandps(xmm0, xmm0, negate_field, vector_len);
4153      evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4154      evmovdqul(xmm0, src, Assembler::AVX_512bit);
4155    } else {
4156      subptr(rsp, 64);
4157      evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4158      evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4159      vandps(xmm0, xmm0, negate_field, vector_len);
4160      evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4161      evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4162      addptr(rsp, 64);
4163    }
4164  }
4165}
4166
4167void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
4168  int dst_enc = dst->encoding();
4169  int nds_enc = nds->encoding();
4170  int src_enc = src->encoding();
4171  if ((dst_enc < 16) && (nds_enc < 16)) {
4172    vandpd(dst, nds, negate_field, vector_len);
4173  } else if ((src_enc < 16) && (dst_enc < 16)) {
4174    evmovdqul(src, nds, Assembler::AVX_512bit);
4175    vandpd(dst, src, negate_field, vector_len);
4176  } else if (src_enc < 16) {
4177    evmovdqul(src, nds, Assembler::AVX_512bit);
4178    vandpd(src, src, negate_field, vector_len);
4179    evmovdqul(dst, src, Assembler::AVX_512bit);
4180  } else if (dst_enc < 16) {
4181    evmovdqul(src, xmm0, Assembler::AVX_512bit);
4182    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4183    vandpd(dst, xmm0, negate_field, vector_len);
4184    evmovdqul(xmm0, src, Assembler::AVX_512bit);
4185  } else {
4186    if (src_enc != dst_enc) {
4187      evmovdqul(src, xmm0, Assembler::AVX_512bit);
4188      evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4189      vandpd(xmm0, xmm0, negate_field, vector_len);
4190      evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4191      evmovdqul(xmm0, src, Assembler::AVX_512bit);
4192    } else {
4193      subptr(rsp, 64);
4194      evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4195      evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4196      vandpd(xmm0, xmm0, negate_field, vector_len);
4197      evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4198      evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4199      addptr(rsp, 64);
4200    }
4201  }
4202}
4203
4204void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4205  int dst_enc = dst->encoding();
4206  int nds_enc = nds->encoding();
4207  int src_enc = src->encoding();
4208  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4209    Assembler::vpaddb(dst, nds, src, vector_len);
4210  } else if ((dst_enc < 16) && (src_enc < 16)) {
4211    Assembler::vpaddb(dst, dst, src, vector_len);
4212  } else if ((dst_enc < 16) && (nds_enc < 16)) {
4213    // use nds as scratch for src
4214    evmovdqul(nds, src, Assembler::AVX_512bit);
4215    Assembler::vpaddb(dst, dst, nds, vector_len);
4216  } else if ((src_enc < 16) && (nds_enc < 16)) {
4217    // use nds as scratch for dst
4218    evmovdqul(nds, dst, Assembler::AVX_512bit);
4219    Assembler::vpaddb(nds, nds, src, vector_len);
4220    evmovdqul(dst, nds, Assembler::AVX_512bit);
4221  } else if (dst_enc < 16) {
    // use nds to save xmm0 so xmm0 can hold src
4223    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4224    evmovdqul(xmm0, src, Assembler::AVX_512bit);
4225    Assembler::vpaddb(dst, dst, xmm0, vector_len);
4226    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4227  } else {
    // worst-case scenario: all regs are in the upper bank
4229    subptr(rsp, 64);
4230    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4231    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4232    evmovdqul(xmm1, src, Assembler::AVX_512bit);
4233    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4234    Assembler::vpaddb(xmm0, xmm0, xmm1, vector_len);
4235    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4236    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4237    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4238    addptr(rsp, 64);
4239  }
4240}
4241
4242void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4243  int dst_enc = dst->encoding();
4244  int nds_enc = nds->encoding();
4245  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4246    Assembler::vpaddb(dst, nds, src, vector_len);
4247  } else if (dst_enc < 16) {
4248    Assembler::vpaddb(dst, dst, src, vector_len);
4249  } else if (nds_enc < 16) {
    // dst is in the upper bank; use nds as scratch for dst
4251    evmovdqul(nds, dst, Assembler::AVX_512bit);
4252    Assembler::vpaddb(nds, nds, src, vector_len);
4253    evmovdqul(dst, nds, Assembler::AVX_512bit);
4254  } else {
    // worst-case scenario: all regs in upper bank
4256    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4257    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4258    Assembler::vpaddb(xmm0, xmm0, src, vector_len);
4259    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4260  }
4261}
4262
4263void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4264  int dst_enc = dst->encoding();
4265  int nds_enc = nds->encoding();
4266  int src_enc = src->encoding();
4267  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4268    Assembler::vpaddw(dst, nds, src, vector_len);
4269  } else if ((dst_enc < 16) && (src_enc < 16)) {
4270    Assembler::vpaddw(dst, dst, src, vector_len);
4271  } else if ((dst_enc < 16) && (nds_enc < 16)) {
4272    // use nds as scratch for src
4273    evmovdqul(nds, src, Assembler::AVX_512bit);
4274    Assembler::vpaddw(dst, dst, nds, vector_len);
4275  } else if ((src_enc < 16) && (nds_enc < 16)) {
4276    // use nds as scratch for dst
4277    evmovdqul(nds, dst, Assembler::AVX_512bit);
4278    Assembler::vpaddw(nds, nds, src, vector_len);
4279    evmovdqul(dst, nds, Assembler::AVX_512bit);
4280  } else if (dst_enc < 16) {
    // use nds to save xmm0 so xmm0 can hold src
4282    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4283    evmovdqul(xmm0, src, Assembler::AVX_512bit);
4284    Assembler::vpaddw(dst, dst, xmm0, vector_len);
4285    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4286  } else {
    // worst-case scenario: all regs are in the upper bank
4288    subptr(rsp, 64);
4289    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4290    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4291    evmovdqul(xmm1, src, Assembler::AVX_512bit);
4292    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4293    Assembler::vpaddw(xmm0, xmm0, xmm1, vector_len);
4294    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4295    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4296    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4297    addptr(rsp, 64);
4298  }
4299}
4300
4301void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4302  int dst_enc = dst->encoding();
4303  int nds_enc = nds->encoding();
4304  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4305    Assembler::vpaddw(dst, nds, src, vector_len);
4306  } else if (dst_enc < 16) {
4307    Assembler::vpaddw(dst, dst, src, vector_len);
4308  } else if (nds_enc < 16) {
    // dst is in the upper bank; use nds as scratch for dst
4310    evmovdqul(nds, dst, Assembler::AVX_512bit);
4311    Assembler::vpaddw(nds, nds, src, vector_len);
4312    evmovdqul(dst, nds, Assembler::AVX_512bit);
4313  } else {
    // worst-case scenario: all regs in upper bank
4315    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4316    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4317    Assembler::vpaddw(xmm0, xmm0, src, vector_len);
4318    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4319  }
4320}
4321
4322void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4323  if (reachable(src)) {
4324    Assembler::vpand(dst, nds, as_Address(src), vector_len);
4325  } else {
4326    lea(rscratch1, src);
4327    Assembler::vpand(dst, nds, Address(rscratch1, 0), vector_len);
4328  }
4329}
4330
4331void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src) {
4332  int dst_enc = dst->encoding();
4333  int src_enc = src->encoding();
4334  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4335    Assembler::vpbroadcastw(dst, src);
4336  } else if ((dst_enc < 16) && (src_enc < 16)) {
4337    Assembler::vpbroadcastw(dst, src);
4338  } else if (src_enc < 16) {
4339    subptr(rsp, 64);
4340    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4341    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4342    Assembler::vpbroadcastw(xmm0, src);
4343    movdqu(dst, xmm0);
4344    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4345    addptr(rsp, 64);
4346  } else if (dst_enc < 16) {
4347    subptr(rsp, 64);
4348    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4349    evmovdqul(xmm0, src, Assembler::AVX_512bit);
4350    Assembler::vpbroadcastw(dst, xmm0);
4351    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4352    addptr(rsp, 64);
4353  } else {
4354    subptr(rsp, 64);
4355    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4356    subptr(rsp, 64);
4357    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4358    movdqu(xmm0, src);
4359    movdqu(xmm1, dst);
4360    Assembler::vpbroadcastw(xmm1, xmm0);
4361    movdqu(dst, xmm1);
4362    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4363    addptr(rsp, 64);
4364    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4365    addptr(rsp, 64);
4366  }
4367}
4368
4369void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4370  int dst_enc = dst->encoding();
4371  int nds_enc = nds->encoding();
4372  int src_enc = src->encoding();
  assert(dst_enc == nds_enc, "dst and nds must be the same register");
4374  if ((dst_enc < 16) && (src_enc < 16)) {
4375    Assembler::vpcmpeqb(dst, nds, src, vector_len);
4376  } else if (src_enc < 16) {
4377    subptr(rsp, 64);
4378    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4379    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4380    Assembler::vpcmpeqb(xmm0, xmm0, src, vector_len);
4381    movdqu(dst, xmm0);
4382    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4383    addptr(rsp, 64);
4384  } else if (dst_enc < 16) {
4385    subptr(rsp, 64);
4386    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4387    evmovdqul(xmm0, src, Assembler::AVX_512bit);
4388    Assembler::vpcmpeqb(dst, dst, xmm0, vector_len);
4389    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4390    addptr(rsp, 64);
4391  } else {
4392    subptr(rsp, 64);
4393    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4394    subptr(rsp, 64);
4395    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4396    movdqu(xmm0, src);
4397    movdqu(xmm1, dst);
4398    Assembler::vpcmpeqb(xmm1, xmm1, xmm0, vector_len);
4399    movdqu(dst, xmm1);
4400    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4401    addptr(rsp, 64);
4402    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4403    addptr(rsp, 64);
4404  }
4405}
4406
4407void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4408  int dst_enc = dst->encoding();
4409  int nds_enc = nds->encoding();
4410  int src_enc = src->encoding();
  assert(dst_enc == nds_enc, "dst and nds must be the same register");
4412  if ((dst_enc < 16) && (src_enc < 16)) {
4413    Assembler::vpcmpeqw(dst, nds, src, vector_len);
4414  } else if (src_enc < 16) {
4415    subptr(rsp, 64);
4416    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4417    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4418    Assembler::vpcmpeqw(xmm0, xmm0, src, vector_len);
4419    movdqu(dst, xmm0);
4420    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4421    addptr(rsp, 64);
4422  } else if (dst_enc < 16) {
4423    subptr(rsp, 64);
4424    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4425    evmovdqul(xmm0, src, Assembler::AVX_512bit);
4426    Assembler::vpcmpeqw(dst, dst, xmm0, vector_len);
4427    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4428    addptr(rsp, 64);
4429  } else {
4430    subptr(rsp, 64);
4431    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4432    subptr(rsp, 64);
4433    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4434    movdqu(xmm0, src);
4435    movdqu(xmm1, dst);
4436    Assembler::vpcmpeqw(xmm1, xmm1, xmm0, vector_len);
4437    movdqu(dst, xmm1);
4438    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4439    addptr(rsp, 64);
4440    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4441    addptr(rsp, 64);
4442  }
4443}
4444
4445void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
4446  int dst_enc = dst->encoding();
4447  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4448    Assembler::vpmovzxbw(dst, src, vector_len);
4449  } else if (dst_enc < 16) {
4450    Assembler::vpmovzxbw(dst, src, vector_len);
4451  } else {
4452    subptr(rsp, 64);
4453    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4454    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4455    Assembler::vpmovzxbw(xmm0, src, vector_len);
4456    movdqu(dst, xmm0);
4457    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4458    addptr(rsp, 64);
4459  }
4460}
4461
4462void MacroAssembler::vpmovmskb(Register dst, XMMRegister src) {
4463  int src_enc = src->encoding();
4464  if (src_enc < 16) {
4465    Assembler::vpmovmskb(dst, src);
4466  } else {
4467    subptr(rsp, 64);
4468    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4469    evmovdqul(xmm0, src, Assembler::AVX_512bit);
4470    Assembler::vpmovmskb(dst, xmm0);
4471    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4472    addptr(rsp, 64);
4473  }
4474}
4475
4476void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4477  int dst_enc = dst->encoding();
4478  int nds_enc = nds->encoding();
4479  int src_enc = src->encoding();
4480  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4481    Assembler::vpmullw(dst, nds, src, vector_len);
4482  } else if ((dst_enc < 16) && (src_enc < 16)) {
4483    Assembler::vpmullw(dst, dst, src, vector_len);
4484  } else if ((dst_enc < 16) && (nds_enc < 16)) {
4485    // use nds as scratch for src
4486    evmovdqul(nds, src, Assembler::AVX_512bit);
4487    Assembler::vpmullw(dst, dst, nds, vector_len);
4488  } else if ((src_enc < 16) && (nds_enc < 16)) {
4489    // use nds as scratch for dst
4490    evmovdqul(nds, dst, Assembler::AVX_512bit);
4491    Assembler::vpmullw(nds, nds, src, vector_len);
4492    evmovdqul(dst, nds, Assembler::AVX_512bit);
4493  } else if (dst_enc < 16) {
4494    // use nds as scratch for xmm0 to hold src
4495    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4496    evmovdqul(xmm0, src, Assembler::AVX_512bit);
4497    Assembler::vpmullw(dst, dst, xmm0, vector_len);
4498    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4499  } else {
4500    // worst case scenario, all regs are in the upper bank
4501    subptr(rsp, 64);
4502    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4503    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4504    evmovdqul(xmm1, src, Assembler::AVX_512bit);
4505    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4506    Assembler::vpmullw(xmm0, xmm0, xmm1, vector_len);
4507    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4508    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4509    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4510    addptr(rsp, 64);
4511  }
4512}
4513
4514void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4515  int dst_enc = dst->encoding();
4516  int nds_enc = nds->encoding();
4517  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4518    Assembler::vpmullw(dst, nds, src, vector_len);
4519  } else if (dst_enc < 16) {
4520    Assembler::vpmullw(dst, dst, src, vector_len);
4521  } else if (nds_enc < 16) {
4522    // implies dst_enc in upper bank; use nds as scratch for dst
4523    evmovdqul(nds, dst, Assembler::AVX_512bit);
4524    Assembler::vpmullw(nds, nds, src, vector_len);
4525    evmovdqul(dst, nds, Assembler::AVX_512bit);
4526  } else {
4527    // worst case scenario, all regs in upper bank
4528    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4529    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4530    Assembler::vpmullw(xmm0, xmm0, src, vector_len);
4531    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4532  }
4533}
4534
4535void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4536  int dst_enc = dst->encoding();
4537  int nds_enc = nds->encoding();
4538  int src_enc = src->encoding();
4539  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4540    Assembler::vpsubb(dst, nds, src, vector_len);
4541  } else if ((dst_enc < 16) && (src_enc < 16)) {
4542    Assembler::vpsubb(dst, dst, src, vector_len);
4543  } else if ((dst_enc < 16) && (nds_enc < 16)) {
4544    // use nds as scratch for src
4545    evmovdqul(nds, src, Assembler::AVX_512bit);
4546    Assembler::vpsubb(dst, dst, nds, vector_len);
4547  } else if ((src_enc < 16) && (nds_enc < 16)) {
4548    // use nds as scratch for dst
4549    evmovdqul(nds, dst, Assembler::AVX_512bit);
4550    Assembler::vpsubb(nds, nds, src, vector_len);
4551    evmovdqul(dst, nds, Assembler::AVX_512bit);
4552  } else if (dst_enc < 16) {
4553    // use nds as scratch for xmm0 to hold src
4554    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4555    evmovdqul(xmm0, src, Assembler::AVX_512bit);
4556    Assembler::vpsubb(dst, dst, xmm0, vector_len);
4557    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4558  } else {
4559    // worst case scenario, all regs are in the upper bank
4560    subptr(rsp, 64);
4561    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4562    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4563    evmovdqul(xmm1, src, Assembler::AVX_512bit);
4564    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4565    Assembler::vpsubb(xmm0, xmm0, xmm1, vector_len);
4566    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4567    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4568    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4569    addptr(rsp, 64);
4570  }
4571}
4572
4573void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4574  int dst_enc = dst->encoding();
4575  int nds_enc = nds->encoding();
4576  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4577    Assembler::vpsubb(dst, nds, src, vector_len);
4578  } else if (dst_enc < 16) {
4579    Assembler::vpsubb(dst, dst, src, vector_len);
4580  } else if (nds_enc < 16) {
4581    // implies dst_enc in upper bank; use nds as scratch for dst
4582    evmovdqul(nds, dst, Assembler::AVX_512bit);
4583    Assembler::vpsubb(nds, nds, src, vector_len);
4584    evmovdqul(dst, nds, Assembler::AVX_512bit);
4585  } else {
4586    // worst case scenario, all regs in upper bank
4587    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4588    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4589    Assembler::vpsubb(xmm0, xmm0, src, vector_len);
4590    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4591  }
4592}
4593
4594void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4595  int dst_enc = dst->encoding();
4596  int nds_enc = nds->encoding();
4597  int src_enc = src->encoding();
4598  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4599    Assembler::vpsubw(dst, nds, src, vector_len);
4600  } else if ((dst_enc < 16) && (src_enc < 16)) {
4601    Assembler::vpsubw(dst, dst, src, vector_len);
4602  } else if ((dst_enc < 16) && (nds_enc < 16)) {
4603    // use nds as scratch for src
4604    evmovdqul(nds, src, Assembler::AVX_512bit);
4605    Assembler::vpsubw(dst, dst, nds, vector_len);
4606  } else if ((src_enc < 16) && (nds_enc < 16)) {
4607    // use nds as scratch for dst
4608    evmovdqul(nds, dst, Assembler::AVX_512bit);
4609    Assembler::vpsubw(nds, nds, src, vector_len);
4610    evmovdqul(dst, nds, Assembler::AVX_512bit);
4611  } else if (dst_enc < 16) {
4612    // use nds as scratch for xmm0 to hold src
4613    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4614    evmovdqul(xmm0, src, Assembler::AVX_512bit);
4615    Assembler::vpsubw(dst, dst, xmm0, vector_len);
4616    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4617  } else {
4618    // worst case scenario, all regs are in the upper bank
4619    subptr(rsp, 64);
4620    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4621    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4622    evmovdqul(xmm1, src, Assembler::AVX_512bit);
4623    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4624    Assembler::vpsubw(xmm0, xmm0, xmm1, vector_len);
4625    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4626    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4627    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4628    addptr(rsp, 64);
4629  }
4630}
4631
4632void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4633  int dst_enc = dst->encoding();
4634  int nds_enc = nds->encoding();
4635  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4636    Assembler::vpsubw(dst, nds, src, vector_len);
4637  } else if (dst_enc < 16) {
4638    Assembler::vpsubw(dst, dst, src, vector_len);
4639  } else if (nds_enc < 16) {
4640    // implies dst_enc in upper bank; use nds as scratch for dst
4641    evmovdqul(nds, dst, Assembler::AVX_512bit);
4642    Assembler::vpsubw(nds, nds, src, vector_len);
4643    evmovdqul(dst, nds, Assembler::AVX_512bit);
4644  } else {
4645    // worst case scenario, all regs in upper bank
4646    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4647    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4648    Assembler::vpsubw(xmm0, xmm0, src, vector_len);
4649    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4650  }
4651}
4652
4653void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
4654  int dst_enc = dst->encoding();
4655  int nds_enc = nds->encoding();
4656  int shift_enc = shift->encoding();
4657  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4658    Assembler::vpsraw(dst, nds, shift, vector_len);
4659  } else if ((dst_enc < 16) && (shift_enc < 16)) {
4660    Assembler::vpsraw(dst, dst, shift, vector_len);
4661  } else if ((dst_enc < 16) && (nds_enc < 16)) {
4662    // use nds_enc as scratch with shift
4663    evmovdqul(nds, shift, Assembler::AVX_512bit);
4664    Assembler::vpsraw(dst, dst, nds, vector_len);
4665  } else if ((shift_enc < 16) && (nds_enc < 16)) {
4666    // use nds as scratch with dst
4667    evmovdqul(nds, dst, Assembler::AVX_512bit);
4668    Assembler::vpsraw(nds, nds, shift, vector_len);
4669    evmovdqul(dst, nds, Assembler::AVX_512bit);
4670  } else if (dst_enc < 16) {
4671    // use nds to save a copy of xmm0 and hold shift
4672    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4673    evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4674    Assembler::vpsraw(dst, dst, xmm0, vector_len);
4675    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4676  } else if (nds_enc < 16) {
4677    // use nds and dst as temps
4678    evmovdqul(nds, dst, Assembler::AVX_512bit);
4679    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4680    evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4681    Assembler::vpsraw(nds, nds, xmm0, vector_len);
4682    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4683    evmovdqul(dst, nds, Assembler::AVX_512bit);
4684  } else {
4685    // worst case scenario, all regs are in the upper bank
4686    subptr(rsp, 64);
4687    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4688    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4689    evmovdqul(xmm1, shift, Assembler::AVX_512bit);
4690    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4691    Assembler::vpsraw(xmm0, xmm0, xmm1, vector_len);
4692    evmovdqul(xmm1, dst, Assembler::AVX_512bit);
4693    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4694    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4695    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4696    addptr(rsp, 64);
4697  }
4698}
4699
4700void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
4701  int dst_enc = dst->encoding();
4702  int nds_enc = nds->encoding();
4703  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4704    Assembler::vpsraw(dst, nds, shift, vector_len);
4705  } else if (dst_enc < 16) {
4706    Assembler::vpsraw(dst, dst, shift, vector_len);
4707  } else if (nds_enc < 16) {
4708    // use nds as scratch
4709    evmovdqul(nds, dst, Assembler::AVX_512bit);
4710    Assembler::vpsraw(nds, nds, shift, vector_len);
4711    evmovdqul(dst, nds, Assembler::AVX_512bit);
4712  } else {
4713    // use nds as scratch for xmm0
4714    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4715    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4716    Assembler::vpsraw(xmm0, xmm0, shift, vector_len);
4717    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4718  }
4719}
4720
4721void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
4722  int dst_enc = dst->encoding();
4723  int nds_enc = nds->encoding();
4724  int shift_enc = shift->encoding();
4725  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4726    Assembler::vpsrlw(dst, nds, shift, vector_len);
4727  } else if ((dst_enc < 16) && (shift_enc < 16)) {
4728    Assembler::vpsrlw(dst, dst, shift, vector_len);
4729  } else if ((dst_enc < 16) && (nds_enc < 16)) {
4730    // use nds_enc as scratch with shift
4731    evmovdqul(nds, shift, Assembler::AVX_512bit);
4732    Assembler::vpsrlw(dst, dst, nds, vector_len);
4733  } else if ((shift_enc < 16) && (nds_enc < 16)) {
4734    // use nds as scratch with dst
4735    evmovdqul(nds, dst, Assembler::AVX_512bit);
4736    Assembler::vpsrlw(nds, nds, shift, vector_len);
4737    evmovdqul(dst, nds, Assembler::AVX_512bit);
4738  } else if (dst_enc < 16) {
4739    // use nds to save a copy of xmm0 and hold shift
4740    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4741    evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4742    Assembler::vpsrlw(dst, dst, xmm0, vector_len);
4743    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4744  } else if (nds_enc < 16) {
4745    // use nds and dst as temps
4746    evmovdqul(nds, dst, Assembler::AVX_512bit);
4747    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4748    evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4749    Assembler::vpsrlw(nds, nds, xmm0, vector_len);
4750    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4751    evmovdqul(dst, nds, Assembler::AVX_512bit);
4752  } else {
4753    // worst case scenario, all regs are in the upper bank
4754    subptr(rsp, 64);
4755    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4756    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4757    evmovdqul(xmm1, shift, Assembler::AVX_512bit);
4758    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4759    Assembler::vpsrlw(xmm0, xmm0, xmm1, vector_len);
4760    evmovdqul(xmm1, dst, Assembler::AVX_512bit);
4761    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4762    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4763    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4764    addptr(rsp, 64);
4765  }
4766}
4767
4768void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
4769  int dst_enc = dst->encoding();
4770  int nds_enc = nds->encoding();
4771  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4772    Assembler::vpsrlw(dst, nds, shift, vector_len);
4773  } else if (dst_enc < 16) {
4774    Assembler::vpsrlw(dst, dst, shift, vector_len);
4775  } else if (nds_enc < 16) {
4776    // use nds as scratch
4777    evmovdqul(nds, dst, Assembler::AVX_512bit);
4778    Assembler::vpsrlw(nds, nds, shift, vector_len);
4779    evmovdqul(dst, nds, Assembler::AVX_512bit);
4780  } else {
4781    // use nds as scratch for xmm0
4782    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4783    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4784    Assembler::vpsrlw(xmm0, xmm0, shift, vector_len);
4785    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4786  }
4787}
4788
4789void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
4790  int dst_enc = dst->encoding();
4791  int nds_enc = nds->encoding();
4792  int shift_enc = shift->encoding();
4793  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4794    Assembler::vpsllw(dst, nds, shift, vector_len);
4795  } else if ((dst_enc < 16) && (shift_enc < 16)) {
4796    Assembler::vpsllw(dst, dst, shift, vector_len);
4797  } else if ((dst_enc < 16) && (nds_enc < 16)) {
4798    // use nds_enc as scratch with shift
4799    evmovdqul(nds, shift, Assembler::AVX_512bit);
4800    Assembler::vpsllw(dst, dst, nds, vector_len);
4801  } else if ((shift_enc < 16) && (nds_enc < 16)) {
4802    // use nds as scratch with dst
4803    evmovdqul(nds, dst, Assembler::AVX_512bit);
4804    Assembler::vpsllw(nds, nds, shift, vector_len);
4805    evmovdqul(dst, nds, Assembler::AVX_512bit);
4806  } else if (dst_enc < 16) {
4807    // use nds to save a copy of xmm0 and hold shift
4808    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4809    evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4810    Assembler::vpsllw(dst, dst, xmm0, vector_len);
4811    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4812  } else if (nds_enc < 16) {
4813    // use nds and dst as temps
4814    evmovdqul(nds, dst, Assembler::AVX_512bit);
4815    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4816    evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4817    Assembler::vpsllw(nds, nds, xmm0, vector_len);
4818    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4819    evmovdqul(dst, nds, Assembler::AVX_512bit);
4820  } else {
4821    // worst case scenario, all regs are in the upper bank
4822    subptr(rsp, 64);
4823    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4824    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4825    evmovdqul(xmm1, shift, Assembler::AVX_512bit);
4826    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4827    Assembler::vpsllw(xmm0, xmm0, xmm1, vector_len);
4828    evmovdqul(xmm1, dst, Assembler::AVX_512bit);
4829    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4830    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4831    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4832    addptr(rsp, 64);
4833  }
4834}
4835
4836void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
4837  int dst_enc = dst->encoding();
4838  int nds_enc = nds->encoding();
4839  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4840    Assembler::vpsllw(dst, nds, shift, vector_len);
4841  } else if (dst_enc < 16) {
4842    Assembler::vpsllw(dst, dst, shift, vector_len);
4843  } else if (nds_enc < 16) {
4844    // use nds as scratch
4845    evmovdqul(nds, dst, Assembler::AVX_512bit);
4846    Assembler::vpsllw(nds, nds, shift, vector_len);
4847    evmovdqul(dst, nds, Assembler::AVX_512bit);
4848  } else {
4849    // use nds as scratch for xmm0
4850    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4851    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4852    Assembler::vpsllw(xmm0, xmm0, shift, vector_len);
4853    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4854  }
4855}
4856
4857void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) {
4858  int dst_enc = dst->encoding();
4859  int src_enc = src->encoding();
4860  if ((dst_enc < 16) && (src_enc < 16)) {
4861    Assembler::vptest(dst, src);
4862  } else if (src_enc < 16) {
4863    subptr(rsp, 64);
4864    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4865    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4866    Assembler::vptest(xmm0, src);
4867    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4868    addptr(rsp, 64);
4869  } else if (dst_enc < 16) {
4870    subptr(rsp, 64);
4871    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4872    evmovdqul(xmm0, src, Assembler::AVX_512bit);
4873    Assembler::vptest(dst, xmm0);
4874    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4875    addptr(rsp, 64);
4876  } else {
4877    subptr(rsp, 64);
4878    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4879    subptr(rsp, 64);
4880    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4881    movdqu(xmm0, src);
4882    movdqu(xmm1, dst);
4883    Assembler::vptest(xmm1, xmm0);
4884    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4885    addptr(rsp, 64);
4886    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4887    addptr(rsp, 64);
4888  }
4889}
4890
4891// This instruction exists within macros, ergo we cannot control its input
4892// when emitted through those patterns.
4893void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
4894  if (VM_Version::supports_avx512nobw()) {
4895    int dst_enc = dst->encoding();
4896    int src_enc = src->encoding();
4897    if (dst_enc == src_enc) {
4898      if (dst_enc < 16) {
4899        Assembler::punpcklbw(dst, src);
4900      } else {
4901        subptr(rsp, 64);
4902        evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4903        evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4904        Assembler::punpcklbw(xmm0, xmm0);
4905        evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4906        evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4907        addptr(rsp, 64);
4908      }
4909    } else {
4910      if ((src_enc < 16) && (dst_enc < 16)) {
4911        Assembler::punpcklbw(dst, src);
4912      } else if (src_enc < 16) {
4913        subptr(rsp, 64);
4914        evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4915        evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4916        Assembler::punpcklbw(xmm0, src);
4917        evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4918        evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4919        addptr(rsp, 64);
4920      } else if (dst_enc < 16) {
4921        subptr(rsp, 64);
4922        evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4923        evmovdqul(xmm0, src, Assembler::AVX_512bit);
4924        Assembler::punpcklbw(dst, xmm0);
4925        evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4926        addptr(rsp, 64);
4927      } else {
4928        subptr(rsp, 64);
4929        evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4930        subptr(rsp, 64);
4931        evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4932        evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4933        evmovdqul(xmm1, src, Assembler::AVX_512bit);
4934        Assembler::punpcklbw(xmm0, xmm1);
4935        evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4936        evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4937        addptr(rsp, 64);
4938        evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4939        addptr(rsp, 64);
4940      }
4941    }
4942  } else {
4943    Assembler::punpcklbw(dst, src);
4944  }
4945}
4946
4947void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) {
4948  if (VM_Version::supports_avx512vl()) {
4949    Assembler::pshufd(dst, src, mode);
4950  } else {
4951    int dst_enc = dst->encoding();
4952    if (dst_enc < 16) {
4953      Assembler::pshufd(dst, src, mode);
4954    } else {
4955      subptr(rsp, 64);
4956      evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4957      Assembler::pshufd(xmm0, src, mode);
4958      evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4959      evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4960      addptr(rsp, 64);
4961    }
4962  }
4963}
4964
4965// This instruction exists within macros, ergo we cannot control its input
4966// when emitted through those patterns.
4967void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
4968  if (VM_Version::supports_avx512nobw()) {
4969    int dst_enc = dst->encoding();
4970    int src_enc = src->encoding();
4971    if (dst_enc == src_enc) {
4972      if (dst_enc < 16) {
4973        Assembler::pshuflw(dst, src, mode);
4974      } else {
4975        subptr(rsp, 64);
4976        evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4977        evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4978        Assembler::pshuflw(xmm0, xmm0, mode);
4979        evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4980        evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4981        addptr(rsp, 64);
4982      }
4983    } else {
4984      if ((src_enc < 16) && (dst_enc < 16)) {
4985        Assembler::pshuflw(dst, src, mode);
4986      } else if (src_enc < 16) {
4987        subptr(rsp, 64);
4988        evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4989        evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4990        Assembler::pshuflw(xmm0, src, mode);
4991        evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4992        evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4993        addptr(rsp, 64);
4994      } else if (dst_enc < 16) {
4995        subptr(rsp, 64);
4996        evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4997        evmovdqul(xmm0, src, Assembler::AVX_512bit);
4998        Assembler::pshuflw(dst, xmm0, mode);
4999        evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
5000        addptr(rsp, 64);
5001      } else {
5002        subptr(rsp, 64);
5003        evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
5004        subptr(rsp, 64);
5005        evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
5006        evmovdqul(xmm0, dst, Assembler::AVX_512bit);
5007        evmovdqul(xmm1, src, Assembler::AVX_512bit);
5008        Assembler::pshuflw(xmm0, xmm1, mode);
5009        evmovdqul(dst, xmm0, Assembler::AVX_512bit);
5010        evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
5011        addptr(rsp, 64);
5012        evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
5013        addptr(rsp, 64);
5014      }
5015    }
5016  } else {
5017    Assembler::pshuflw(dst, src, mode);
5018  }
5019}
5020
5021void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
5022  if (reachable(src)) {
5023    vandpd(dst, nds, as_Address(src), vector_len);
5024  } else {
5025    lea(rscratch1, src);
5026    vandpd(dst, nds, Address(rscratch1, 0), vector_len);
5027  }
5028}
5029
5030void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
5031  if (reachable(src)) {
5032    vandps(dst, nds, as_Address(src), vector_len);
5033  } else {
5034    lea(rscratch1, src);
5035    vandps(dst, nds, Address(rscratch1, 0), vector_len);
5036  }
5037}
5038
5039void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
5040  if (reachable(src)) {
5041    vdivsd(dst, nds, as_Address(src));
5042  } else {
5043    lea(rscratch1, src);
5044    vdivsd(dst, nds, Address(rscratch1, 0));
5045  }
5046}
5047
5048void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
5049  if (reachable(src)) {
5050    vdivss(dst, nds, as_Address(src));
5051  } else {
5052    lea(rscratch1, src);
5053    vdivss(dst, nds, Address(rscratch1, 0));
5054  }
5055}
5056
5057void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
5058  if (reachable(src)) {
5059    vmulsd(dst, nds, as_Address(src));
5060  } else {
5061    lea(rscratch1, src);
5062    vmulsd(dst, nds, Address(rscratch1, 0));
5063  }
5064}
5065
5066void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
5067  if (reachable(src)) {
5068    vmulss(dst, nds, as_Address(src));
5069  } else {
5070    lea(rscratch1, src);
5071    vmulss(dst, nds, Address(rscratch1, 0));
5072  }
5073}
5074
5075void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
5076  if (reachable(src)) {
5077    vsubsd(dst, nds, as_Address(src));
5078  } else {
5079    lea(rscratch1, src);
5080    vsubsd(dst, nds, Address(rscratch1, 0));
5081  }
5082}
5083
5084void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
5085  if (reachable(src)) {
5086    vsubss(dst, nds, as_Address(src));
5087  } else {
5088    lea(rscratch1, src);
5089    vsubss(dst, nds, Address(rscratch1, 0));
5090  }
5091}
5092
5093void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
5094  int nds_enc = nds->encoding();
5095  int dst_enc = dst->encoding();
5096  bool dst_upper_bank = (dst_enc > 15);
5097  bool nds_upper_bank = (nds_enc > 15);
5098  if (VM_Version::supports_avx512novl() &&
5099      (nds_upper_bank || dst_upper_bank)) {
5100    if (dst_upper_bank) {
5101      subptr(rsp, 64);
5102      evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
5103      movflt(xmm0, nds);
5104      vxorps(xmm0, xmm0, src, Assembler::AVX_128bit);
5105      movflt(dst, xmm0);
5106      evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
5107      addptr(rsp, 64);
5108    } else {
5109      movflt(dst, nds);
5110      vxorps(dst, dst, src, Assembler::AVX_128bit);
5111    }
5112  } else {
5113    vxorps(dst, nds, src, Assembler::AVX_128bit);
5114  }
5115}
5116
5117void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
5118  int nds_enc = nds->encoding();
5119  int dst_enc = dst->encoding();
5120  bool dst_upper_bank = (dst_enc > 15);
5121  bool nds_upper_bank = (nds_enc > 15);
5122  if (VM_Version::supports_avx512novl() &&
5123      (nds_upper_bank || dst_upper_bank)) {
5124    if (dst_upper_bank) {
5125      subptr(rsp, 64);
5126      evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
5127      movdbl(xmm0, nds);
5128      vxorpd(xmm0, xmm0, src, Assembler::AVX_128bit);
5129      movdbl(dst, xmm0);
5130      evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
5131      addptr(rsp, 64);
5132    } else {
5133      movdbl(dst, nds);
5134      vxorpd(dst, dst, src, Assembler::AVX_128bit);
5135    }
5136  } else {
5137    vxorpd(dst, nds, src, Assembler::AVX_128bit);
5138  }
5139}
5140
5141void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
5142  if (reachable(src)) {
5143    vxorpd(dst, nds, as_Address(src), vector_len);
5144  } else {
5145    lea(rscratch1, src);
5146    vxorpd(dst, nds, Address(rscratch1, 0), vector_len);
5147  }
5148}
5149
5150void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
5151  if (reachable(src)) {
5152    vxorps(dst, nds, as_Address(src), vector_len);
5153  } else {
5154    lea(rscratch1, src);
5155    vxorps(dst, nds, Address(rscratch1, 0), vector_len);
5156  }
5157}
5158
5159
5160void MacroAssembler::resolve_jobject(Register value,
5161                                     Register thread,
5162                                     Register tmp) {
5163  assert_different_registers(value, thread, tmp);
5164  Label done, not_weak;
5165  testptr(value, value);
5166  jcc(Assembler::zero, done);                // Use NULL as-is.
5167  testptr(value, JNIHandles::weak_tag_mask); // Test for jweak tag.
5168  jcc(Assembler::zero, not_weak);
5169  // Resolve jweak.
5170  movptr(value, Address(value, -JNIHandles::weak_tag_value));
5171  verify_oop(value);
5172#if INCLUDE_ALL_GCS
5173  if (UseG1GC) {
5174    g1_write_barrier_pre(noreg /* obj */,
5175                         value /* pre_val */,
5176                         thread /* thread */,
5177                         tmp /* tmp */,
5178                         true /* tosca_live */,
5179                         true /* expand_call */);
5180  }
5181#endif // INCLUDE_ALL_GCS
5182  jmp(done);
5183  bind(not_weak);
5184  // Resolve (untagged) jobject.
5185  movptr(value, Address(value, 0));
5186  verify_oop(value);
5187  bind(done);
5188}
5189
5190void MacroAssembler::clear_jweak_tag(Register possibly_jweak) {
5191  const int32_t inverted_jweak_mask = ~static_cast<int32_t>(JNIHandles::weak_tag_mask);
5192  STATIC_ASSERT(inverted_jweak_mask == -2); // otherwise check this code
5193  // The inverted mask is sign-extended
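  // For example (informal): with weak_tag_mask == 1 the inverted mask is ~1 == -2,
  // so a tagged jweak such as 0x7f00abc1 becomes 0x7f00abc0 -- only the low tag
  // bit is cleared and the handle address is preserved.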
5194  andptr(possibly_jweak, inverted_jweak_mask);
5195}
5196
5197//////////////////////////////////////////////////////////////////////////////////
5198#if INCLUDE_ALL_GCS
5199
5200void MacroAssembler::g1_write_barrier_pre(Register obj,
5201                                          Register pre_val,
5202                                          Register thread,
5203                                          Register tmp,
5204                                          bool tosca_live,
5205                                          bool expand_call) {
5206
5207  // If expand_call is true then we expand the call_VM_leaf macro
5208  // directly to skip generating the check by
5209  // InterpreterMacroAssembler::call_VM_leaf_base that checks _last_sp.
5210
5211#ifdef _LP64
5212  assert(thread == r15_thread, "must be");
5213#endif // _LP64
5214
5215  Label done;
5216  Label runtime;
5217
5218  assert(pre_val != noreg, "check this code");
5219
5220  if (obj != noreg) {
5221    assert_different_registers(obj, pre_val, tmp);
5222    assert(pre_val != rax, "check this code");
5223  }
5224
5225  Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
5226                                       SATBMarkQueue::byte_offset_of_active()));
5227  Address index(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
5228                                       SATBMarkQueue::byte_offset_of_index()));
5229  Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
5230                                       SATBMarkQueue::byte_offset_of_buf()));
5231
5232
5233  // Is marking active?
5234  if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) {
5235    cmpl(in_progress, 0);
5236  } else {
5237    assert(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption");
5238    cmpb(in_progress, 0);
5239  }
5240  jcc(Assembler::equal, done);
5241
5242  // Do we need to load the previous value?
5243  if (obj != noreg) {
5244    load_heap_oop(pre_val, Address(obj, 0));
5245  }
5246
5247  // Is the previous value null?
5248  cmpptr(pre_val, (int32_t) NULL_WORD);
5249  jcc(Assembler::equal, done);
5250
5251  // Can we store original value in the thread's buffer?
5252  // Is index == 0?
5253  // (The index field is typed as size_t.)
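  // Informal sketch of the SATB buffer bookkeeping assumed below: 'index' is a
  // byte offset that counts down from the buffer end toward 0, so a new entry
  // goes to buffer + (index - wordSize), and index == 0 means the local buffer
  // is full and must be flushed by the runtime.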
5254
5255  movptr(tmp, index);                   // tmp := *index_adr
5256  cmpptr(tmp, 0);                       // tmp == 0?
5257  jcc(Assembler::equal, runtime);       // If yes, goto runtime
5258
5259  subptr(tmp, wordSize);                // tmp := tmp - wordSize
5260  movptr(index, tmp);                   // *index_adr := tmp
5261  addptr(tmp, buffer);                  // tmp := tmp + *buffer_adr
5262
5263  // Record the previous value
5264  movptr(Address(tmp, 0), pre_val);
5265  jmp(done);
5266
5267  bind(runtime);
5268  // save the live input values
5269  if(tosca_live) push(rax);
5270
5271  if (obj != noreg && obj != rax)
5272    push(obj);
5273
5274  if (pre_val != rax)
5275    push(pre_val);
5276
5277  // Calling the runtime using the regular call_VM_leaf mechanism generates
5278  // code (generated by InterpreterMacroAssembler::call_VM_leaf_base)
5279  // that checks that the *(ebp+frame::interpreter_frame_last_sp) == NULL.
5280  //
5281  // If we are generating the pre-barrier without a frame (e.g. in the
5282  // intrinsified Reference.get() routine) then ebp might be pointing to
5283  // the caller frame and so this check will most likely fail at runtime.
5284  //
5285  // Expanding the call directly bypasses the generation of the check.
5286  // So when we do not have a full interpreter frame on the stack,
5287  // expand_call should be passed true.
5288
5289  NOT_LP64( push(thread); )
5290
5291  if (expand_call) {
5292    LP64_ONLY( assert(pre_val != c_rarg1, "smashed arg"); )
5293    pass_arg1(this, thread);
5294    pass_arg0(this, pre_val);
5295    MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), 2);
5296  } else {
5297    call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), pre_val, thread);
5298  }
5299
5300  NOT_LP64( pop(thread); )
5301
5302  // restore the live input values
5303  if (pre_val != rax)
5304    pop(pre_val);
5305
5306  if (obj != noreg && obj != rax)
5307    pop(obj);
5308
5309  if(tosca_live) pop(rax);
5310
5311  bind(done);
5312}
5313
5314void MacroAssembler::g1_write_barrier_post(Register store_addr,
5315                                           Register new_val,
5316                                           Register thread,
5317                                           Register tmp,
5318                                           Register tmp2) {
5319#ifdef _LP64
5320  assert(thread == r15_thread, "must be");
5321#endif // _LP64
5322
5323  Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
5324                                       DirtyCardQueue::byte_offset_of_index()));
5325  Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
5326                                       DirtyCardQueue::byte_offset_of_buf()));
5327
5328  CardTableModRefBS* ct =
5329    barrier_set_cast<CardTableModRefBS>(Universe::heap()->barrier_set());
5330  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
5331
5332  Label done;
5333  Label runtime;
5334
5335  // Does store cross heap regions?
5336
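  // The xor/shift below is zero exactly when both addresses agree in every bit
  // above the region grain.  For example, assuming 1 MB regions
  // (LogOfHRGrainBytes == 20): 0x10012345 ^ 0x100fffff == 0x000edcba, and
  // 0x000edcba >> 20 == 0, so the store does not cross a region boundary.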
5337  movptr(tmp, store_addr);
5338  xorptr(tmp, new_val);
5339  shrptr(tmp, HeapRegion::LogOfHRGrainBytes);
5340  jcc(Assembler::equal, done);
5341
5342  // crosses regions, storing NULL?
5343
5344  cmpptr(new_val, (int32_t) NULL_WORD);
5345  jcc(Assembler::equal, done);
5346
5347  // storing region crossing non-NULL, is card already dirty?
5348
5349  const Register card_addr = tmp;
5350  const Register cardtable = tmp2;
5351
5352  movptr(card_addr, store_addr);
5353  shrptr(card_addr, CardTableModRefBS::card_shift);
5354  // Do not use ExternalAddress to load 'byte_map_base', since 'byte_map_base' is NOT
5355  // a valid address and therefore is not properly handled by the relocation code.
5356  movptr(cardtable, (intptr_t)ct->byte_map_base);
5357  addptr(card_addr, cardtable);
5358
5359  cmpb(Address(card_addr, 0), (int)G1SATBCardTableModRefBS::g1_young_card_val());
5360  jcc(Assembler::equal, done);
5361
5362  membar(Assembler::Membar_mask_bits(Assembler::StoreLoad));
5363  cmpb(Address(card_addr, 0), (int)CardTableModRefBS::dirty_card_val());
5364  jcc(Assembler::equal, done);
5365
5366
5367  // Storing a region-crossing, non-NULL oop and the card is clean:
5368  // dirty the card and log it.
5369
5370  movb(Address(card_addr, 0), (int)CardTableModRefBS::dirty_card_val());
5371
5372  cmpl(queue_index, 0);
5373  jcc(Assembler::equal, runtime);
5374  subl(queue_index, wordSize);
5375  movptr(tmp2, buffer);
5376#ifdef _LP64
5377  movslq(rscratch1, queue_index);
5378  addq(tmp2, rscratch1);
5379  movq(Address(tmp2, 0), card_addr);
5380#else
5381  addl(tmp2, queue_index);
5382  movl(Address(tmp2, 0), card_addr);
5383#endif
5384  jmp(done);
5385
5386  bind(runtime);
5387  // save the live input values
5388  push(store_addr);
5389  push(new_val);
5390#ifdef _LP64
5391  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, r15_thread);
5392#else
5393  push(thread);
5394  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread);
5395  pop(thread);
5396#endif
5397  pop(new_val);
5398  pop(store_addr);
5399
5400  bind(done);
5401}
5402
5403#endif // INCLUDE_ALL_GCS
5404//////////////////////////////////////////////////////////////////////////////////
5405
5406
5407void MacroAssembler::store_check(Register obj, Address dst) {
5408  store_check(obj);
5409}
5410
5411void MacroAssembler::store_check(Register obj) {
5412  // Does a store check for the oop in register obj. The content of
5413  // register obj is destroyed afterwards.
5414  BarrierSet* bs = Universe::heap()->barrier_set();
5415  assert(bs->kind() == BarrierSet::CardTableForRS ||
5416         bs->kind() == BarrierSet::CardTableExtension,
5417         "Wrong barrier set kind");
5418
5419  CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs);
5420  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
5421
5422  shrptr(obj, CardTableModRefBS::card_shift);
5423
5424  Address card_addr;
5425
5426  // The calculation for byte_map_base is as follows:
5427  // byte_map_base = _byte_map - (uintptr_t(low_bound) >> card_shift);
5428  // So this essentially converts an address to a displacement and it will
5429  // never need to be relocated. On 64bit however the value may be too
5430  // large for a 32bit displacement.
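  // Worked example (informal, assuming 512-byte cards, i.e. card_shift == 9):
  // for obj == 0x10000000 the shift above leaves 0x80000 in obj, so the byte
  // written below lands at byte_map_base + 0x80000; byte_map_base is already
  // biased by the heap's low bound, which is why no subtraction is needed here.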
5431  intptr_t disp = (intptr_t) ct->byte_map_base;
5432  if (is_simm32(disp)) {
5433    card_addr = Address(noreg, obj, Address::times_1, disp);
5434  } else {
5435    // By doing it as an ExternalAddress 'disp' could be converted to a rip-relative
5436    // displacement and done in a single instruction given favorable mapping and a
5437    // smarter version of as_Address. However, 'ExternalAddress' generates a relocation
5438    // entry and that entry is not properly handled by the relocation code.
5439    AddressLiteral cardtable((address)ct->byte_map_base, relocInfo::none);
5440    Address index(noreg, obj, Address::times_1);
5441    card_addr = as_Address(ArrayAddress(cardtable, index));
5442  }
5443
5444  int dirty = CardTableModRefBS::dirty_card_val();
5445  if (UseCondCardMark) {
5446    Label L_already_dirty;
5447    if (UseConcMarkSweepGC) {
5448      membar(Assembler::StoreLoad);
5449    }
5450    cmpb(card_addr, dirty);
5451    jcc(Assembler::equal, L_already_dirty);
5452    movb(card_addr, dirty);
5453    bind(L_already_dirty);
5454  } else {
5455    movb(card_addr, dirty);
5456  }
5457}
5458
5459void MacroAssembler::subptr(Register dst, int32_t imm32) {
5460  LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
5461}
5462
5463// Force generation of a 4 byte immediate value even if it fits into 8bit
5464void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
5465  LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32));
5466}
5467
5468void MacroAssembler::subptr(Register dst, Register src) {
5469  LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
5470}
5471
5472// C++ bool manipulation
5473void MacroAssembler::testbool(Register dst) {
5474  if(sizeof(bool) == 1)
5475    testb(dst, 0xff);
5476  else if(sizeof(bool) == 2) {
5477    // testw implementation needed for two byte bools
5478    ShouldNotReachHere();
5479  } else if(sizeof(bool) == 4)
5480    testl(dst, dst);
5481  else
5482    // unsupported
5483    ShouldNotReachHere();
5484}
5485
5486void MacroAssembler::testptr(Register dst, Register src) {
5487  LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
5488}
5489
5490// Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
5491void MacroAssembler::tlab_allocate(Register obj,
5492                                   Register var_size_in_bytes,
5493                                   int con_size_in_bytes,
5494                                   Register t1,
5495                                   Register t2,
5496                                   Label& slow_case) {
5497  assert_different_registers(obj, t1, t2);
5498  assert_different_registers(obj, var_size_in_bytes, t1);
5499  Register end = t2;
5500  Register thread = NOT_LP64(t1) LP64_ONLY(r15_thread);
5501
5502  verify_tlab();
5503
5504  NOT_LP64(get_thread(thread));
5505
5506  movptr(obj, Address(thread, JavaThread::tlab_top_offset()));
5507  if (var_size_in_bytes == noreg) {
5508    lea(end, Address(obj, con_size_in_bytes));
5509  } else {
5510    lea(end, Address(obj, var_size_in_bytes, Address::times_1));
5511  }
5512  cmpptr(end, Address(thread, JavaThread::tlab_end_offset()));
5513  jcc(Assembler::above, slow_case);
5514
5515  // update the tlab top pointer
5516  movptr(Address(thread, JavaThread::tlab_top_offset()), end);
5517
5518  // recover var_size_in_bytes if necessary
5519  if (var_size_in_bytes == end) {
5520    subptr(var_size_in_bytes, obj);
5521  }
5522  verify_tlab();
5523}
5524
5525// Preserves rbx, and rdx.
5526Register MacroAssembler::tlab_refill(Label& retry,
5527                                     Label& try_eden,
5528                                     Label& slow_case) {
5529  Register top = rax;
5530  Register t1  = rcx; // object size
5531  Register t2  = rsi;
5532  Register thread_reg = NOT_LP64(rdi) LP64_ONLY(r15_thread);
5533  assert_different_registers(top, thread_reg, t1, t2, /* preserve: */ rbx, rdx);
5534  Label do_refill, discard_tlab;
5535
5536  if (!Universe::heap()->supports_inline_contig_alloc()) {
5537    // No allocation in the shared eden.
5538    jmp(slow_case);
5539  }
5540
5541  NOT_LP64(get_thread(thread_reg));
5542
5543  movptr(top, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
5544  movptr(t1,  Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
5545
5546  // calculate amount of free space
5547  subptr(t1, top);
5548  shrptr(t1, LogHeapWordSize);
5549
5550  // Retain tlab and allocate object in shared space if
5551  // the amount free in the tlab is too large to discard.
5552  cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
5553  jcc(Assembler::lessEqual, discard_tlab);
5554
5555  // Retain
5556  // %%% yuck as movptr...
5557  movptr(t2, (int32_t) ThreadLocalAllocBuffer::refill_waste_limit_increment());
5558  addptr(Address(thread_reg, in_bytes(JavaThread::tlab_refill_waste_limit_offset())), t2);
5559  if (TLABStats) {
5560    // increment number of slow_allocations
5561    addl(Address(thread_reg, in_bytes(JavaThread::tlab_slow_allocations_offset())), 1);
5562  }
5563  jmp(try_eden);
5564
5565  bind(discard_tlab);
5566  if (TLABStats) {
5567    // increment number of refills
5568    addl(Address(thread_reg, in_bytes(JavaThread::tlab_number_of_refills_offset())), 1);
5569    // accumulate wastage -- t1 is amount free in tlab
5570    addl(Address(thread_reg, in_bytes(JavaThread::tlab_fast_refill_waste_offset())), t1);
5571  }
5572
5573  // if tlab is currently allocated (top or end != null) then
5574  // fill [top, end + alignment_reserve) with array object
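  // The leftover space is formatted as a dummy int[] so the discarded TLAB
  // remains parseable by the GC and heap walkers; only the mark word, length
  // and klass are written below, the element payload is stale but harmless.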
5575  testptr(top, top);
5576  jcc(Assembler::zero, do_refill);
5577
5578  // set up the mark word
5579  movptr(Address(top, oopDesc::mark_offset_in_bytes()), (intptr_t)markOopDesc::prototype()->copy_set_hash(0x2));
5580  // set the length to the remaining space
5581  subptr(t1, typeArrayOopDesc::header_size(T_INT));
5582  addptr(t1, (int32_t)ThreadLocalAllocBuffer::alignment_reserve());
5583  shlptr(t1, log2_intptr(HeapWordSize/sizeof(jint)));
5584  movl(Address(top, arrayOopDesc::length_offset_in_bytes()), t1);
5585  // set klass to intArrayKlass
5586  // dubious reloc why not an oop reloc?
5587  movptr(t1, ExternalAddress((address)Universe::intArrayKlassObj_addr()));
5588  // store klass last.  concurrent gcs assume the klass length is valid if
5589  // the klass field is not null.
5590  store_klass(top, t1);
5591
5592  movptr(t1, top);
5593  subptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
5594  incr_allocated_bytes(thread_reg, t1, 0);
5595
5596  // refill the tlab with an eden allocation
5597  bind(do_refill);
5598  movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_size_offset())));
5599  shlptr(t1, LogHeapWordSize);
5600  // allocate new tlab, address returned in top
5601  eden_allocate(top, t1, 0, t2, slow_case);
5602
5603  // Check that t1 was preserved in eden_allocate.
5604#ifdef ASSERT
5605  if (UseTLAB) {
5606    Label ok;
5607    Register tsize = rsi;
5608    assert_different_registers(tsize, thread_reg, t1);
5609    push(tsize);
5610    movptr(tsize, Address(thread_reg, in_bytes(JavaThread::tlab_size_offset())));
5611    shlptr(tsize, LogHeapWordSize);
5612    cmpptr(t1, tsize);
5613    jcc(Assembler::equal, ok);
5614    STOP("assert(t1 != tlab size)");
5615    should_not_reach_here();
5616
5617    bind(ok);
5618    pop(tsize);
5619  }
5620#endif
5621  movptr(Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())), top);
5622  movptr(Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())), top);
5623  addptr(top, t1);
5624  subptr(top, (int32_t)ThreadLocalAllocBuffer::alignment_reserve_in_bytes());
5625  movptr(Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())), top);
5626
5627  if (ZeroTLAB) {
5628    // This is a fast TLAB refill, therefore the GC is not notified of it.
5629    // So compiled code must fill the new TLAB with zeroes.
5630    movptr(top, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
5631    zero_memory(top, t1, 0, t2);
5632  }
5633
5634  verify_tlab();
5635  jmp(retry);
5636
5637  return thread_reg; // for use by caller
5638}
5639
5640// Preserves the contents of address, destroys the contents of length_in_bytes and temp.
5641void MacroAssembler::zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp) {
5642  assert(address != length_in_bytes && address != temp && temp != length_in_bytes, "registers must be different");
5643  assert((offset_in_bytes & (BytesPerWord - 1)) == 0, "offset must be a multiple of BytesPerWord");
5644  Label done;
5645
5646  testptr(length_in_bytes, length_in_bytes);
5647  jcc(Assembler::zero, done);
5648
5649  // initialize topmost word, divide index by 2, check if odd and test if zero
5650  // note: for the remaining code to work, index must be a multiple of BytesPerWord
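  // Informal example on 64-bit with UseIncDec: length_in_bytes == 24 becomes
  // index == 3 after the shift, and the loop further down stores the zeroed
  // 'temp' at offsets 16, 8 and 0 past offset_in_bytes, one word per iteration.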
5651#ifdef ASSERT
5652  {
5653    Label L;
5654    testptr(length_in_bytes, BytesPerWord - 1);
5655    jcc(Assembler::zero, L);
5656    stop("length must be a multiple of BytesPerWord");
5657    bind(L);
5658  }
5659#endif
5660  Register index = length_in_bytes;
5661  xorptr(temp, temp);    // use _zero reg to clear memory (shorter code)
5662  if (UseIncDec) {
5663    shrptr(index, 3);  // divide by 8/16 and set carry flag if bit 2 was set
5664  } else {
5665    shrptr(index, 2);  // use 2 instructions to avoid partial flag stall
5666    shrptr(index, 1);
5667  }
5668#ifndef _LP64
5669  // index might not have been a multiple of 8 (i.e., bit 2 was set)
5670  {
5671    Label even;
5672    // note: if index was a multiple of 8, then it cannot
5673    //       be 0 now otherwise it must have been 0 before
5674    //       => if it is even, we don't need to check for 0 again
5675    jcc(Assembler::carryClear, even);
5676    // clear topmost word (no jump would be needed if conditional assignment worked here)
5677    movptr(Address(address, index, Address::times_8, offset_in_bytes - 0*BytesPerWord), temp);
5678    // index could be 0 now, must check again
5679    jcc(Assembler::zero, done);
5680    bind(even);
5681  }
5682#endif // !_LP64
5683  // initialize remaining object fields: index is a multiple of 2 now
5684  {
5685    Label loop;
5686    bind(loop);
5687    movptr(Address(address, index, Address::times_8, offset_in_bytes - 1*BytesPerWord), temp);
5688    NOT_LP64(movptr(Address(address, index, Address::times_8, offset_in_bytes - 2*BytesPerWord), temp);)
5689    decrement(index);
5690    jcc(Assembler::notZero, loop);
5691  }
5692
5693  bind(done);
5694}
5695
5696void MacroAssembler::incr_allocated_bytes(Register thread,
5697                                          Register var_size_in_bytes,
5698                                          int con_size_in_bytes,
5699                                          Register t1) {
5700  if (!thread->is_valid()) {
5701#ifdef _LP64
5702    thread = r15_thread;
5703#else
5704    assert(t1->is_valid(), "need temp reg");
5705    thread = t1;
5706    get_thread(thread);
5707#endif
5708  }
5709
5710#ifdef _LP64
5711  if (var_size_in_bytes->is_valid()) {
5712    addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
5713  } else {
5714    addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
5715  }
5716#else
5717  if (var_size_in_bytes->is_valid()) {
5718    addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
5719  } else {
5720    addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
5721  }
5722  adcl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())+4), 0);
5723#endif
5724}
5725
5726// Look up the method for a megamorphic invokeinterface call.
5727// The target method is determined by <intf_klass, itable_index>.
5728// The receiver klass is in recv_klass.
5729// On success, the result will be in method_result, and execution falls through.
5730// On failure, execution transfers to the given label.
5731void MacroAssembler::lookup_interface_method(Register recv_klass,
5732                                             Register intf_klass,
5733                                             RegisterOrConstant itable_index,
5734                                             Register method_result,
5735                                             Register scan_temp,
5736                                             Label& L_no_such_interface) {
5737  assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
5738  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
5739         "caller must use same register for non-constant itable index as for method");
5740
5741  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
5742  int vtable_base = in_bytes(Klass::vtable_start_offset());
5743  int itentry_off = itableMethodEntry::method_offset_in_bytes();
5744  int scan_step   = itableOffsetEntry::size() * wordSize;
5745  int vte_size    = vtableEntry::size_in_bytes();
5746  Address::ScaleFactor times_vte_scale = Address::times_ptr;
5747  assert(vte_size == wordSize, "else adjust times_vte_scale");
5748
5749  movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
5750
5751  // %%% Could store the aligned, prescaled offset in the klassoop.
5752  lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
5753
5754  // Adjust recv_klass by scaled itable_index, so we can free itable_index.
5755  assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
5756  lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
5757
5758  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
5759  //   if (scan->interface() == intf) {
5760  //     result = (klass + scan->offset() + itable_index);
5761  //   }
5762  // }
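  // The scan below is emitted twice (peel == 1, then peel == 0): the peeled copy
  // checks the very first itableOffsetEntry with one short branch to found_method,
  // so an early hit skips the null-check-and-advance path used for later entries.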
5763  Label search, found_method;
5764
5765  for (int peel = 1; peel >= 0; peel--) {
5766    movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
5767    cmpptr(intf_klass, method_result);
5768
5769    if (peel) {
5770      jccb(Assembler::equal, found_method);
5771    } else {
5772      jccb(Assembler::notEqual, search);
5773      // (invert the test to fall through to found_method...)
5774    }
5775
5776    if (!peel)  break;
5777
5778    bind(search);
5779
5780    // Check that the previous entry is non-null.  A null entry means that
5781    // the receiver class doesn't implement the interface, and wasn't the
5782    // same as when the caller was compiled.
5783    testptr(method_result, method_result);
5784    jcc(Assembler::zero, L_no_such_interface);
5785    addptr(scan_temp, scan_step);
5786  }
5787
5788  bind(found_method);
5789
5790  // Got a hit.
5791  movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
5792  movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
5793}
5794
5795
5796// virtual method calling
5797void MacroAssembler::lookup_virtual_method(Register recv_klass,
5798                                           RegisterOrConstant vtable_index,
5799                                           Register method_result) {
5800  const int base = in_bytes(Klass::vtable_start_offset());
5801  assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
5802  Address vtable_entry_addr(recv_klass,
5803                            vtable_index, Address::times_ptr,
5804                            base + vtableEntry::method_offset_in_bytes());
5805  movptr(method_result, vtable_entry_addr);
5806}
5807
5808
5809void MacroAssembler::check_klass_subtype(Register sub_klass,
5810                           Register super_klass,
5811                           Register temp_reg,
5812                           Label& L_success) {
5813  Label L_failure;
5814  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
5815  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
5816  bind(L_failure);
5817}
5818
5819
5820void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
5821                                                   Register super_klass,
5822                                                   Register temp_reg,
5823                                                   Label* L_success,
5824                                                   Label* L_failure,
5825                                                   Label* L_slow_path,
5826                                        RegisterOrConstant super_check_offset) {
5827  assert_different_registers(sub_klass, super_klass, temp_reg);
5828  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
5829  if (super_check_offset.is_register()) {
5830    assert_different_registers(sub_klass, super_klass,
5831                               super_check_offset.as_register());
5832  } else if (must_load_sco) {
5833    assert(temp_reg != noreg, "supply either a temp or a register offset");
5834  }
5835
5836  Label L_fallthrough;
5837  int label_nulls = 0;
5838  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
5839  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
5840  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
5841  assert(label_nulls <= 1, "at most one NULL in the batch");
5842
5843  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
5844  int sco_offset = in_bytes(Klass::super_check_offset_offset());
5845  Address super_check_offset_addr(super_klass, sco_offset);
5846
5847  // Hacked jcc, which "knows" that L_fallthrough, at least, is in
5848  // range of a jccb.  If this routine grows larger, reconsider at
5849  // least some of these.
5850#define local_jcc(assembler_cond, label)                                \
5851  if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);         \
5852  else                             jcc( assembler_cond, label) /*omit semi*/
5853
5854  // Hacked jmp, which may only be used just before L_fallthrough.
5855#define final_jmp(label)                                                \
5856  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
5857  else                            jmp(label)                /*omit semi*/
5858
5859  // If the pointers are equal, we are done (e.g., String[] elements).
5860  // This self-check enables sharing of secondary supertype arrays among
5861  // non-primary types such as array-of-interface.  Otherwise, each such
5862  // type would need its own customized SSA.
5863  // We move this check to the front of the fast path because many
5864  // type checks are in fact trivially successful in this manner,
5865  // so we get a nicely predicted branch right at the start of the check.
5866  cmpptr(sub_klass, super_klass);
5867  local_jcc(Assembler::equal, *L_success);
5868
5869  // Check the supertype display:
5870  if (must_load_sco) {
5871    // A positive movl does the right thing on LP64.
5872    movl(temp_reg, super_check_offset_addr);
5873    super_check_offset = RegisterOrConstant(temp_reg);
5874  }
5875  Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
5876  cmpptr(super_klass, super_check_addr); // load displayed supertype
5877
5878  // This check has worked decisively for primary supers.
5879  // Secondary supers are sought in the super_cache ('super_cache_addr').
5880  // (Secondary supers are interfaces and very deeply nested subtypes.)
5881  // This works in the same check above because of a tricky aliasing
5882  // between the super_cache and the primary super display elements.
5883  // (The 'super_check_addr' can address either, as the case requires.)
5884  // Note that the cache is updated below if it does not help us find
5885  // what we need immediately.
5886  // So if it was a primary super, we can just fail immediately.
5887  // Otherwise, it's the slow path for us (no success at this point).
5888
5889  if (super_check_offset.is_register()) {
5890    local_jcc(Assembler::equal, *L_success);
5891    cmpl(super_check_offset.as_register(), sc_offset);
5892    if (L_failure == &L_fallthrough) {
5893      local_jcc(Assembler::equal, *L_slow_path);
5894    } else {
5895      local_jcc(Assembler::notEqual, *L_failure);
5896      final_jmp(*L_slow_path);
5897    }
5898  } else if (super_check_offset.as_constant() == sc_offset) {
5899    // Need a slow path; fast failure is impossible.
5900    if (L_slow_path == &L_fallthrough) {
5901      local_jcc(Assembler::equal, *L_success);
5902    } else {
5903      local_jcc(Assembler::notEqual, *L_slow_path);
5904      final_jmp(*L_success);
5905    }
5906  } else {
5907    // No slow path; it's a fast decision.
5908    if (L_failure == &L_fallthrough) {
5909      local_jcc(Assembler::equal, *L_success);
5910    } else {
5911      local_jcc(Assembler::notEqual, *L_failure);
5912      final_jmp(*L_success);
5913    }
5914  }
5915
5916  bind(L_fallthrough);
5917
5918#undef local_jcc
5919#undef final_jmp
5920}
5921
5922
5923void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
5924                                                   Register super_klass,
5925                                                   Register temp_reg,
5926                                                   Register temp2_reg,
5927                                                   Label* L_success,
5928                                                   Label* L_failure,
5929                                                   bool set_cond_codes) {
5930  assert_different_registers(sub_klass, super_klass, temp_reg);
5931  if (temp2_reg != noreg)
5932    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
5933#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
5934
5935  Label L_fallthrough;
5936  int label_nulls = 0;
5937  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
5938  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
5939  assert(label_nulls <= 1, "at most one NULL in the batch");
5940
5941  // a couple of useful fields in sub_klass:
5942  int ss_offset = in_bytes(Klass::secondary_supers_offset());
5943  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
5944  Address secondary_supers_addr(sub_klass, ss_offset);
5945  Address super_cache_addr(     sub_klass, sc_offset);
5946
5947  // Do a linear scan of the secondary super-klass chain.
5948  // This code is rarely used, so simplicity is a virtue here.
5949  // The repne_scan instruction uses fixed registers, which we must spill.
5950  // Don't worry too much about pre-existing connections with the input regs.
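  //
  // Roughly:
  //   Array<Klass*>* ss = sub_klass->secondary_supers();
  //   for (int i = 0; i < ss->length(); i++) {
  //     if (ss->at(i) == super_klass) {
  //       // cache the hit and succeed
  //       sub_klass->_secondary_super_cache = super_klass;
  //       goto L_success;
  //     }
  //   }
  //   goto L_failure;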
5951
5952  assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
5953  assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)
5954
5955  // Get super_klass value into rax (even if it was in rdi or rcx).
5956  bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
5957  if (super_klass != rax || UseCompressedOops) {
5958    if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
5959    mov(rax, super_klass);
5960  }
5961  if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
5962  if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }
5963
5964#ifndef PRODUCT
5965  int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
5966  ExternalAddress pst_counter_addr((address) pst_counter);
5967  NOT_LP64(  incrementl(pst_counter_addr) );
5968  LP64_ONLY( lea(rcx, pst_counter_addr) );
5969  LP64_ONLY( incrementl(Address(rcx, 0)) );
5970#endif //PRODUCT
5971
5972  // We will consult the secondary-super array.
5973  movptr(rdi, secondary_supers_addr);
5974  // Load the array length.  (A positive movl does the right thing on LP64.)
5975  movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
5976  // Skip to start of data.
5977  addptr(rdi, Array<Klass*>::base_offset_in_bytes());
5978
5979  // Scan RCX words at [RDI] for an occurrence of RAX.
5980  // Set NZ/Z based on last compare.
5981  // Z flag value will not be set by 'repne' if RCX == 0 since 'repne' does
5982  // not change flags (only scas instruction which is repeated sets flags).
5983  // Set Z = 0 (not equal) before 'repne' to indicate that class was not found.
5984
5985  testptr(rax, rax); // Set Z = 0
5986  repne_scan();
5987
5988  // Unspill the temp. registers:
5989  if (pushed_rdi)  pop(rdi);
5990  if (pushed_rcx)  pop(rcx);
5991  if (pushed_rax)  pop(rax);
5992
5993  if (set_cond_codes) {
5994    // Special hack for the AD files:  rdi is guaranteed non-zero.
5995    assert(!pushed_rdi, "rdi must be left non-NULL");
5996    // Also, the condition codes are properly set Z/NZ on success/failure.
5997  }
5998
5999  if (L_failure == &L_fallthrough)
6000        jccb(Assembler::notEqual, *L_failure);
6001  else  jcc(Assembler::notEqual, *L_failure);
6002
6003  // Success.  Cache the super we found and proceed in triumph.
6004  movptr(super_cache_addr, super_klass);
6005
6006  if (L_success != &L_fallthrough) {
6007    jmp(*L_success);
6008  }
6009
6010#undef IS_A_TEMP
6011
6012  bind(L_fallthrough);
6013}
6014
6015
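// Conditional 32-bit move; on CPUs without CMOV support the same effect is
// obtained by branching around an unconditional move.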
6016void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
6017  if (VM_Version::supports_cmov()) {
6018    cmovl(cc, dst, src);
6019  } else {
6020    Label L;
6021    jccb(negate_condition(cc), L);
6022    movl(dst, src);
6023    bind(L);
6024  }
6025}
6026
6027void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
6028  if (VM_Version::supports_cmov()) {
6029    cmovl(cc, dst, src);
6030  } else {
6031    Label L;
6032    jccb(negate_condition(cc), L);
6033    movl(dst, src);
6034    bind(L);
6035  }
6036}
6037
6038void MacroAssembler::verify_oop(Register reg, const char* s) {
6039  if (!VerifyOops) return;
6040
6041  // Pass register number to verify_oop_subroutine
6042  const char* b = NULL;
6043  {
6044    ResourceMark rm;
6045    stringStream ss;
6046    ss.print("verify_oop: %s: %s", reg->name(), s);
6047    b = code_string(ss.as_string());
6048  }
6049  BLOCK_COMMENT("verify_oop {");
6050#ifdef _LP64
6051  push(rscratch1);                    // save r10, trashed by movptr()
6052#endif
6053  push(rax);                          // save rax,
6054  push(reg);                          // pass register argument
6055  ExternalAddress buffer((address) b);
6056  // avoid using pushptr, as it modifies scratch registers
6057  // and our contract is not to modify anything
6058  movptr(rax, buffer.addr());
6059  push(rax);
6060  // call indirectly to solve generation ordering problem
6061  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
6062  call(rax);
6063  // Caller pops the arguments (oop, message) and restores rax, r10
6064  BLOCK_COMMENT("} verify_oop");
6065}
6066
6067
6068RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
6069                                                      Register tmp,
6070                                                      int offset) {
6071  intptr_t value = *delayed_value_addr;
6072  if (value != 0)
6073    return RegisterOrConstant(value + offset);
6074
6075  // load indirectly to solve generation ordering problem
6076  movptr(tmp, ExternalAddress((address) delayed_value_addr));
6077
6078#ifdef ASSERT
6079  { Label L;
6080    testptr(tmp, tmp);
6081    if (WizardMode) {
6082      const char* buf = NULL;
6083      {
6084        ResourceMark rm;
6085        stringStream ss;
6086        ss.print("DelayedValue=" INTPTR_FORMAT, delayed_value_addr[1]);
6087        buf = code_string(ss.as_string());
6088      }
6089      jcc(Assembler::notZero, L);
6090      STOP(buf);
6091    } else {
6092      jccb(Assembler::notZero, L);
6093      hlt();
6094    }
6095    bind(L);
6096  }
6097#endif
6098
6099  if (offset != 0)
6100    addptr(tmp, offset);
6101
6102  return RegisterOrConstant(tmp);
6103}
6104
6105
6106Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
6107                                         int extra_slot_offset) {
6108  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
6109  int stackElementSize = Interpreter::stackElementSize;
6110  int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
6111#ifdef ASSERT
6112  int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
6113  assert(offset1 - offset == stackElementSize, "correct arithmetic");
6114#endif
6115  Register             scale_reg    = noreg;
6116  Address::ScaleFactor scale_factor = Address::no_scale;
6117  if (arg_slot.is_constant()) {
6118    offset += arg_slot.as_constant() * stackElementSize;
6119  } else {
6120    scale_reg    = arg_slot.as_register();
6121    scale_factor = Address::times(stackElementSize);
6122  }
6123  offset += wordSize;           // return PC is on stack
6124  return Address(rsp, scale_reg, scale_factor, offset);
6125}
6126
6127
6128void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
6129  if (!VerifyOops) return;
6130
6131  // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord);
6132  // Pass register number to verify_oop_subroutine
6133  const char* b = NULL;
6134  {
6135    ResourceMark rm;
6136    stringStream ss;
6137    ss.print("verify_oop_addr: %s", s);
6138    b = code_string(ss.as_string());
6139  }
6140#ifdef _LP64
6141  push(rscratch1);                    // save r10, trashed by movptr()
6142#endif
6143  push(rax);                          // save rax,
6144  // addr may contain rsp so we will have to adjust it based on the push
6145  // we just did (and on 64 bit we do two pushes)
6146  // NOTE: the 64-bit code appears to have had a bug here: it did movq(addr, rax),
6147  // which stores rax into addr -- the reverse of what was intended.
6148  if (addr.uses(rsp)) {
6149    lea(rax, addr);
6150    pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
6151  } else {
6152    pushptr(addr);
6153  }
6154
6155  ExternalAddress buffer((address) b);
6156  // pass msg argument
6157  // avoid using pushptr, as it modifies scratch registers
6158  // and our contract is not to modify anything
6159  movptr(rax, buffer.addr());
6160  push(rax);
6161
6162  // call indirectly to solve generation ordering problem
6163  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
6164  call(rax);
6165  // Caller pops the arguments (addr, message) and restores rax, r10.
6166}
6167
6168void MacroAssembler::verify_tlab() {
6169#ifdef ASSERT
6170  if (UseTLAB && VerifyOops) {
6171    Label next, ok;
6172    Register t1 = rsi;
6173    Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);
6174
6175    push(t1);
6176    NOT_LP64(push(thread_reg));
6177    NOT_LP64(get_thread(thread_reg));
6178
6179    movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
6180    cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
6181    jcc(Assembler::aboveEqual, next);
6182    STOP("assert(top >= start)");
6183    should_not_reach_here();
6184
6185    bind(next);
6186    movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
6187    cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
6188    jcc(Assembler::aboveEqual, ok);
6189    STOP("assert(top <= end)");
6190    should_not_reach_here();
6191
6192    bind(ok);
6193    NOT_LP64(pop(thread_reg));
6194    pop(t1);
6195  }
6196#endif
6197}
6198
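// The helper classes below decode a saved FPU/integer register state image for
// the debugging printouts used by print_CPU_state() and _verify_FPU().  Bit
// positions follow the x87 control/status word layout.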
6199class ControlWord {
6200 public:
6201  int32_t _value;
6202
6203  int  rounding_control() const        { return  (_value >> 10) & 3      ; }
6204  int  precision_control() const       { return  (_value >>  8) & 3      ; }
6205  bool precision() const               { return ((_value >>  5) & 1) != 0; }
6206  bool underflow() const               { return ((_value >>  4) & 1) != 0; }
6207  bool overflow() const                { return ((_value >>  3) & 1) != 0; }
6208  bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
6209  bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
6210  bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
6211
6212  void print() const {
6213    // rounding control
6214    const char* rc;
6215    switch (rounding_control()) {
6216      case 0: rc = "round near"; break;
6217      case 1: rc = "round down"; break;
6218      case 2: rc = "round up  "; break;
6219      case 3: rc = "chop      "; break;
6220    };
6221    // precision control
6222    const char* pc;
6223    switch (precision_control()) {
6224      case 0: pc = "24 bits "; break;
6225      case 1: pc = "reserved"; break;
6226      case 2: pc = "53 bits "; break;
6227      case 3: pc = "64 bits "; break;
6228    };
6229    // flags
6230    char f[9];
6231    f[0] = ' ';
6232    f[1] = ' ';
6233    f[2] = (precision   ()) ? 'P' : 'p';
6234    f[3] = (underflow   ()) ? 'U' : 'u';
6235    f[4] = (overflow    ()) ? 'O' : 'o';
6236    f[5] = (zero_divide ()) ? 'Z' : 'z';
6237    f[6] = (denormalized()) ? 'D' : 'd';
6238    f[7] = (invalid     ()) ? 'I' : 'i';
6239    f[8] = '\x0';
6240    // output
6241    printf("%04x  masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
6242  }
6243
6244};
6245
6246class StatusWord {
6247 public:
6248  int32_t _value;
6249
6250  bool busy() const                    { return ((_value >> 15) & 1) != 0; }
6251  bool C3() const                      { return ((_value >> 14) & 1) != 0; }
6252  bool C2() const                      { return ((_value >> 10) & 1) != 0; }
6253  bool C1() const                      { return ((_value >>  9) & 1) != 0; }
6254  bool C0() const                      { return ((_value >>  8) & 1) != 0; }
6255  int  top() const                     { return  (_value >> 11) & 7      ; }
6256  bool error_status() const            { return ((_value >>  7) & 1) != 0; }
6257  bool stack_fault() const             { return ((_value >>  6) & 1) != 0; }
6258  bool precision() const               { return ((_value >>  5) & 1) != 0; }
6259  bool underflow() const               { return ((_value >>  4) & 1) != 0; }
6260  bool overflow() const                { return ((_value >>  3) & 1) != 0; }
6261  bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
6262  bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
6263  bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
6264
6265  void print() const {
6266    // condition codes
6267    char c[5];
6268    c[0] = (C3()) ? '3' : '-';
6269    c[1] = (C2()) ? '2' : '-';
6270    c[2] = (C1()) ? '1' : '-';
6271    c[3] = (C0()) ? '0' : '-';
6272    c[4] = '\x0';
6273    // flags
6274    char f[9];
6275    f[0] = (error_status()) ? 'E' : '-';
6276    f[1] = (stack_fault ()) ? 'S' : '-';
6277    f[2] = (precision   ()) ? 'P' : '-';
6278    f[3] = (underflow   ()) ? 'U' : '-';
6279    f[4] = (overflow    ()) ? 'O' : '-';
6280    f[5] = (zero_divide ()) ? 'Z' : '-';
6281    f[6] = (denormalized()) ? 'D' : '-';
6282    f[7] = (invalid     ()) ? 'I' : '-';
6283    f[8] = '\x0';
6284    // output
6285    printf("%04x  flags = %s, cc =  %s, top = %d", _value & 0xFFFF, f, c, top());
6286  }
6287
6288};
6289
6290class TagWord {
6291 public:
6292  int32_t _value;
6293
6294  int tag_at(int i) const              { return (_value >> (i*2)) & 3; }
6295
6296  void print() const {
6297    printf("%04x", _value & 0xFFFF);
6298  }
6299
6300};
6301
6302class FPU_Register {
6303 public:
6304  int32_t _m0;
6305  int32_t _m1;
6306  int16_t _ex;
6307
6308  bool is_indefinite() const           {
6309    return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0;
6310  }
6311
6312  void print() const {
6313    char  sign = (_ex < 0) ? '-' : '+';
6314    const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : "   ";
6315    printf("%c%04hx.%08x%08x  %s", sign, _ex, _m1, _m0, kind);
6316  };
6317
6318};
6319
6320class FPU_State {
6321 public:
6322  enum {
6323    register_size       = 10,
6324    number_of_registers =  8,
6325    register_mask       =  7
6326  };
6327
6328  ControlWord  _control_word;
6329  StatusWord   _status_word;
6330  TagWord      _tag_word;
6331  int32_t      _error_offset;
6332  int32_t      _error_selector;
6333  int32_t      _data_offset;
6334  int32_t      _data_selector;
6335  int8_t       _register[register_size * number_of_registers];
6336
6337  int tag_for_st(int i) const          { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
6338  FPU_Register* st(int i) const        { return (FPU_Register*)&_register[register_size * i]; }
6339
6340  const char* tag_as_string(int tag) const {
6341    switch (tag) {
6342      case 0: return "valid";
6343      case 1: return "zero";
6344      case 2: return "special";
6345      case 3: return "empty";
6346    }
6347    ShouldNotReachHere();
6348    return NULL;
6349  }
6350
6351  void print() const {
6352    // print computation registers
6353    { int t = _status_word.top();
6354      for (int i = 0; i < number_of_registers; i++) {
6355        int j = (i - t) & register_mask;
6356        printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
6357        st(j)->print();
6358        printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
6359      }
6360    }
6361    printf("\n");
6362    // print control registers
6363    printf("ctrl = "); _control_word.print(); printf("\n");
6364    printf("stat = "); _status_word .print(); printf("\n");
6365    printf("tags = "); _tag_word    .print(); printf("\n");
6366  }
6367
6368};
6369
6370class Flag_Register {
6371 public:
6372  int32_t _value;
6373
6374  bool overflow() const                { return ((_value >> 11) & 1) != 0; }
6375  bool direction() const               { return ((_value >> 10) & 1) != 0; }
6376  bool sign() const                    { return ((_value >>  7) & 1) != 0; }
6377  bool zero() const                    { return ((_value >>  6) & 1) != 0; }
6378  bool auxiliary_carry() const         { return ((_value >>  4) & 1) != 0; }
6379  bool parity() const                  { return ((_value >>  2) & 1) != 0; }
6380  bool carry() const                   { return ((_value >>  0) & 1) != 0; }
6381
6382  void print() const {
6383    // flags
6384    char f[8];
6385    f[0] = (overflow       ()) ? 'O' : '-';
6386    f[1] = (direction      ()) ? 'D' : '-';
6387    f[2] = (sign           ()) ? 'S' : '-';
6388    f[3] = (zero           ()) ? 'Z' : '-';
6389    f[4] = (auxiliary_carry()) ? 'A' : '-';
6390    f[5] = (parity         ()) ? 'P' : '-';
6391    f[6] = (carry          ()) ? 'C' : '-';
6392    f[7] = '\x0';
6393    // output
6394    printf("%08x  flags = %s", _value, f);
6395  }
6396
6397};
6398
6399class IU_Register {
6400 public:
6401  int32_t _value;
6402
6403  void print() const {
6404    printf("%08x  %11d", _value, _value);
6405  }
6406
6407};
6408
6409class IU_State {
6410 public:
6411  Flag_Register _eflags;
6412  IU_Register   _rdi;
6413  IU_Register   _rsi;
6414  IU_Register   _rbp;
6415  IU_Register   _rsp;
6416  IU_Register   _rbx;
6417  IU_Register   _rdx;
6418  IU_Register   _rcx;
6419  IU_Register   _rax;
6420
6421  void print() const {
6422    // computation registers
6423    printf("rax,  = "); _rax.print(); printf("\n");
6424    printf("rbx,  = "); _rbx.print(); printf("\n");
6425    printf("rcx  = "); _rcx.print(); printf("\n");
6426    printf("rdx  = "); _rdx.print(); printf("\n");
6427    printf("rdi  = "); _rdi.print(); printf("\n");
6428    printf("rsi  = "); _rsi.print(); printf("\n");
6429    printf("rbp,  = "); _rbp.print(); printf("\n");
6430    printf("rsp  = "); _rsp.print(); printf("\n");
6431    printf("\n");
6432    // control registers
6433    printf("flgs = "); _eflags.print(); printf("\n");
6434  }
6435};
6436
6437
6438class CPU_State {
6439 public:
6440  FPU_State _fpu_state;
6441  IU_State  _iu_state;
6442
6443  void print() const {
6444    printf("--------------------------------------------------\n");
6445    _iu_state .print();
6446    printf("\n");
6447    _fpu_state.print();
6448    printf("--------------------------------------------------\n");
6449  }
6450
6451};
6452
6453
6454static void _print_CPU_state(CPU_State* state) {
6455  state->print();
6456};
6457
6458
6459void MacroAssembler::print_CPU_state() {
6460  push_CPU_state();
6461  push(rsp);                // pass CPU state
6462  call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
6463  addptr(rsp, wordSize);       // discard argument
6464  pop_CPU_state();
6465}
6466
6467
6468static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
6469  static int counter = 0;
6470  FPU_State* fs = &state->_fpu_state;
6471  counter++;
6472  // For leaf calls, only verify that the top few elements remain empty.
6473  // We only need 1 empty at the top for C2 code.
6474  if( stack_depth < 0 ) {
6475    if( fs->tag_for_st(7) != 3 ) {
6476      printf("FPR7 not empty\n");
6477      state->print();
6478      assert(false, "error");
6479      return false;
6480    }
6481    return true;                // All other stack states do not matter
6482  }
6483
6484  assert((fs->_control_word._value & 0xffff) == StubRoutines::_fpu_cntrl_wrd_std,
6485         "bad FPU control word");
6486
6487  // compute stack depth
6488  int i = 0;
6489  while (i < FPU_State::number_of_registers && fs->tag_for_st(i)  < 3) i++;
6490  int d = i;
6491  while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
6492  // verify findings
6493  if (i != FPU_State::number_of_registers) {
6494    // stack not contiguous
6495    printf("%s: stack not contiguous at ST%d\n", s, i);
6496    state->print();
6497    assert(false, "error");
6498    return false;
6499  }
6500  // check if computed stack depth corresponds to expected stack depth
6501  if (stack_depth < 0) {
6502    // expected stack depth is -stack_depth or less
6503    if (d > -stack_depth) {
6504      // too many elements on the stack
6505      printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
6506      state->print();
6507      assert(false, "error");
6508      return false;
6509    }
6510  } else {
6511    // expected stack depth is stack_depth
6512    if (d != stack_depth) {
6513      // wrong stack depth
6514      printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
6515      state->print();
6516      assert(false, "error");
6517      return false;
6518    }
6519  }
6520  // everything is cool
6521  return true;
6522}
6523
6524
6525void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
6526  if (!VerifyFPU) return;
6527  push_CPU_state();
6528  push(rsp);                // pass CPU state
6529  ExternalAddress msg((address) s);
6530  // pass message string s
6531  pushptr(msg.addr());
6532  push(stack_depth);        // pass stack depth
6533  call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
6534  addptr(rsp, 3 * wordSize);   // discard arguments
6535  // check for error
6536  { Label L;
6537    testl(rax, rax);
6538    jcc(Assembler::notZero, L);
6539    int3();                  // break if error condition
6540    bind(L);
6541  }
6542  pop_CPU_state();
6543}
6544
6545void MacroAssembler::restore_cpu_control_state_after_jni() {
6546  // Either restore the MXCSR register after returning from the JNI Call
6547  // or verify that it wasn't changed (with -Xcheck:jni flag).
6548  if (VM_Version::supports_sse()) {
6549    if (RestoreMXCSROnJNICalls) {
6550      ldmxcsr(ExternalAddress(StubRoutines::addr_mxcsr_std()));
6551    } else if (CheckJNICalls) {
6552      call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
6553    }
6554  }
6555  if (VM_Version::supports_avx()) {
6556    // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
6557    vzeroupper();
6558  }
6559
6560#ifndef _LP64
6561  // Either restore the x87 floating pointer control word after returning
6562  // from the JNI call or verify that it wasn't changed.
6563  if (CheckJNICalls) {
6564    call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
6565  }
6566#endif // _LP64
6567}
6568
6569void MacroAssembler::load_mirror(Register mirror, Register method) {
6570  // get mirror
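  // In effect: mirror = method->constMethod()->constants()->pool_holder()->java_mirror()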
6571  const int mirror_offset = in_bytes(Klass::java_mirror_offset());
6572  movptr(mirror, Address(method, Method::const_offset()));
6573  movptr(mirror, Address(mirror, ConstMethod::constants_offset()));
6574  movptr(mirror, Address(mirror, ConstantPool::pool_holder_offset_in_bytes()));
6575  movptr(mirror, Address(mirror, mirror_offset));
6576}
6577
6578void MacroAssembler::load_klass(Register dst, Register src) {
6579#ifdef _LP64
6580  if (UseCompressedClassPointers) {
6581    movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
6582    decode_klass_not_null(dst);
6583  } else
6584#endif
6585    movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
6586}
6587
6588void MacroAssembler::load_prototype_header(Register dst, Register src) {
6589  load_klass(dst, src);
6590  movptr(dst, Address(dst, Klass::prototype_header_offset()));
6591}
6592
6593void MacroAssembler::store_klass(Register dst, Register src) {
6594#ifdef _LP64
6595  if (UseCompressedClassPointers) {
6596    encode_klass_not_null(src);
6597    movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
6598  } else
6599#endif
6600    movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
6601}
6602
6603void MacroAssembler::load_heap_oop(Register dst, Address src) {
6604#ifdef _LP64
6605  // FIXME: Must change all places where we try to load the klass.
6606  if (UseCompressedOops) {
6607    movl(dst, src);
6608    decode_heap_oop(dst);
6609  } else
6610#endif
6611    movptr(dst, src);
6612}
6613
6614// Does not do verification; generates fixed-size code
6615void MacroAssembler::load_heap_oop_not_null(Register dst, Address src) {
6616#ifdef _LP64
6617  if (UseCompressedOops) {
6618    movl(dst, src);
6619    decode_heap_oop_not_null(dst);
6620  } else
6621#endif
6622    movptr(dst, src);
6623}
6624
6625void MacroAssembler::store_heap_oop(Address dst, Register src) {
6626#ifdef _LP64
6627  if (UseCompressedOops) {
6628    assert(!dst.uses(src), "not enough registers");
6629    encode_heap_oop(src);
6630    movl(dst, src);
6631  } else
6632#endif
6633    movptr(dst, src);
6634}
6635
6636void MacroAssembler::cmp_heap_oop(Register src1, Address src2, Register tmp) {
6637  assert_different_registers(src1, tmp);
6638#ifdef _LP64
6639  if (UseCompressedOops) {
6640    bool did_push = false;
6641    if (tmp == noreg) {
6642      tmp = rax;
6643      push(tmp);
6644      did_push = true;
6645      assert(!src2.uses(rsp), "can't push");
6646    }
6647    load_heap_oop(tmp, src2);
6648    cmpptr(src1, tmp);
6649    if (did_push)  pop(tmp);
6650  } else
6651#endif
6652    cmpptr(src1, src2);
6653}
6654
6655// Used for storing NULLs.
6656void MacroAssembler::store_heap_oop_null(Address dst) {
6657#ifdef _LP64
6658  if (UseCompressedOops) {
6659    movl(dst, (int32_t)NULL_WORD);
6660  } else {
6661    movslq(dst, (int32_t)NULL_WORD);
6662  }
6663#else
6664  movl(dst, (int32_t)NULL_WORD);
6665#endif
6666}
6667
6668#ifdef _LP64
6669void MacroAssembler::store_klass_gap(Register dst, Register src) {
6670  if (UseCompressedClassPointers) {
6671    // Store to klass gap in destination
6672    movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
6673  }
6674}
6675
6676#ifdef ASSERT
6677void MacroAssembler::verify_heapbase(const char* msg) {
6678  assert (UseCompressedOops, "should be compressed");
6679  assert (Universe::heap() != NULL, "java heap should be initialized");
6680  if (CheckCompressedOops) {
6681    Label ok;
6682    push(rscratch1); // cmpptr trashes rscratch1
6683    cmpptr(r12_heapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
6684    jcc(Assembler::equal, ok);
6685    STOP(msg);
6686    bind(ok);
6687    pop(rscratch1);
6688  }
6689}
6690#endif
6691
6692// Algorithm must match oop.inline.hpp encode_heap_oop.
6693void MacroAssembler::encode_heap_oop(Register r) {
6694#ifdef ASSERT
6695  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
6696#endif
6697  verify_oop(r, "broken oop in encode_heap_oop");
6698  if (Universe::narrow_oop_base() == NULL) {
6699    if (Universe::narrow_oop_shift() != 0) {
6700      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
6701      shrq(r, LogMinObjAlignmentInBytes);
6702    }
6703    return;
6704  }
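  // General case: r = (r == NULL) ? 0 : (r - heap_base) >> shift.
  // The cmov below substitutes the heap base for a NULL oop so that the
  // subtraction yields zero.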
6705  testq(r, r);
6706  cmovq(Assembler::equal, r, r12_heapbase);
6707  subq(r, r12_heapbase);
6708  shrq(r, LogMinObjAlignmentInBytes);
6709}
6710
6711void MacroAssembler::encode_heap_oop_not_null(Register r) {
6712#ifdef ASSERT
6713  verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
6714  if (CheckCompressedOops) {
6715    Label ok;
6716    testq(r, r);
6717    jcc(Assembler::notEqual, ok);
6718    STOP("null oop passed to encode_heap_oop_not_null");
6719    bind(ok);
6720  }
6721#endif
6722  verify_oop(r, "broken oop in encode_heap_oop_not_null");
6723  if (Universe::narrow_oop_base() != NULL) {
6724    subq(r, r12_heapbase);
6725  }
6726  if (Universe::narrow_oop_shift() != 0) {
6727    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
6728    shrq(r, LogMinObjAlignmentInBytes);
6729  }
6730}
6731
6732void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
6733#ifdef ASSERT
6734  verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
6735  if (CheckCompressedOops) {
6736    Label ok;
6737    testq(src, src);
6738    jcc(Assembler::notEqual, ok);
6739    STOP("null oop passed to encode_heap_oop_not_null2");
6740    bind(ok);
6741  }
6742#endif
6743  verify_oop(src, "broken oop in encode_heap_oop_not_null2");
6744  if (dst != src) {
6745    movq(dst, src);
6746  }
6747  if (Universe::narrow_oop_base() != NULL) {
6748    subq(dst, r12_heapbase);
6749  }
6750  if (Universe::narrow_oop_shift() != 0) {
6751    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
6752    shrq(dst, LogMinObjAlignmentInBytes);
6753  }
6754}
6755
6756void  MacroAssembler::decode_heap_oop(Register r) {
6757#ifdef ASSERT
6758  verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
6759#endif
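  // Inverse of encode_heap_oop: r = (narrow == 0) ? NULL : heap_base + (narrow << shift).
  // (In the based case the shlq sets ZF, so the jccb(equal, done) skips the
  //  base addition for a NULL narrow oop.)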
6760  if (Universe::narrow_oop_base() == NULL) {
6761    if (Universe::narrow_oop_shift() != 0) {
6762      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
6763      shlq(r, LogMinObjAlignmentInBytes);
6764    }
6765  } else {
6766    Label done;
6767    shlq(r, LogMinObjAlignmentInBytes);
6768    jccb(Assembler::equal, done);
6769    addq(r, r12_heapbase);
6770    bind(done);
6771  }
6772  verify_oop(r, "broken oop in decode_heap_oop");
6773}
6774
6775void  MacroAssembler::decode_heap_oop_not_null(Register r) {
6776  // Note: it will change flags
6777  assert (UseCompressedOops, "should only be used for compressed headers");
6778  assert (Universe::heap() != NULL, "java heap should be initialized");
6779  // Cannot assert, unverified entry point counts instructions (see .ad file)
6780  // vtableStubs also counts instructions in pd_code_size_limit.
6781  // Also do not verify_oop as this is called by verify_oop.
6782  if (Universe::narrow_oop_shift() != 0) {
6783    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
6784    shlq(r, LogMinObjAlignmentInBytes);
6785    if (Universe::narrow_oop_base() != NULL) {
6786      addq(r, r12_heapbase);
6787    }
6788  } else {
6789    assert (Universe::narrow_oop_base() == NULL, "sanity");
6790  }
6791}
6792
6793void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
6794  // Note: it will change flags
6795  assert (UseCompressedOops, "should only be used for compressed headers");
6796  assert (Universe::heap() != NULL, "java heap should be initialized");
6797  // Cannot assert, unverified entry point counts instructions (see .ad file)
6798  // vtableStubs also counts instructions in pd_code_size_limit.
6799  // Also do not verify_oop as this is called by verify_oop.
6800  if (Universe::narrow_oop_shift() != 0) {
6801    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
6802    if (LogMinObjAlignmentInBytes == Address::times_8) {
6803      leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
6804    } else {
6805      if (dst != src) {
6806        movq(dst, src);
6807      }
6808      shlq(dst, LogMinObjAlignmentInBytes);
6809      if (Universe::narrow_oop_base() != NULL) {
6810        addq(dst, r12_heapbase);
6811      }
6812    }
6813  } else {
6814    assert (Universe::narrow_oop_base() == NULL, "sanity");
6815    if (dst != src) {
6816      movq(dst, src);
6817    }
6818  }
6819}
6820
6821void MacroAssembler::encode_klass_not_null(Register r) {
6822  if (Universe::narrow_klass_base() != NULL) {
6823    // Use r12 as a scratch register in which to temporarily load the narrow_klass_base.
6824    assert(r != r12_heapbase, "Encoding a klass in r12");
6825    mov64(r12_heapbase, (int64_t)Universe::narrow_klass_base());
6826    subq(r, r12_heapbase);
6827  }
6828  if (Universe::narrow_klass_shift() != 0) {
6829    assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
6830    shrq(r, LogKlassAlignmentInBytes);
6831  }
6832  if (Universe::narrow_klass_base() != NULL) {
6833    reinit_heapbase();
6834  }
6835}
6836
6837void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
6838  if (dst == src) {
6839    encode_klass_not_null(src);
6840  } else {
6841    if (Universe::narrow_klass_base() != NULL) {
6842      mov64(dst, (int64_t)Universe::narrow_klass_base());
6843      negq(dst);
6844      addq(dst, src);
6845    } else {
6846      movptr(dst, src);
6847    }
6848    if (Universe::narrow_klass_shift() != 0) {
6849      assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
6850      shrq(dst, LogKlassAlignmentInBytes);
6851    }
6852  }
6853}
6854
6855// Function instr_size_for_decode_klass_not_null() counts the instructions
6856// generated by decode_klass_not_null(register r) and reinit_heapbase(),
6857// when (Universe::heap() != NULL).  Hence, if the instructions they
6858// generate change, then this method needs to be updated.
6859int MacroAssembler::instr_size_for_decode_klass_not_null() {
6860  assert (UseCompressedClassPointers, "only for compressed klass ptrs");
6861  if (Universe::narrow_klass_base() != NULL) {
6862    // mov64 + addq + shlq? + mov64  (for reinit_heapbase()).
6863    return (Universe::narrow_klass_shift() == 0 ? 20 : 24);
6864  } else {
6865    // longest load decode klass function, mov64, leaq
6866    return 16;
6867  }
6868}
6869
6870// !!! If the instructions that get generated here change then function
6871// instr_size_for_decode_klass_not_null() needs to get updated.
6872void  MacroAssembler::decode_klass_not_null(Register r) {
6873  // Note: it will change flags
6874  assert (UseCompressedClassPointers, "should only be used for compressed headers");
6875  assert(r != r12_heapbase, "Decoding a klass in r12");
6876  // Cannot assert, unverified entry point counts instructions (see .ad file)
6877  // vtableStubs also counts instructions in pd_code_size_limit.
6878  // Also do not verify_oop as this is called by verify_oop.
6879  if (Universe::narrow_klass_shift() != 0) {
6880    assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
6881    shlq(r, LogKlassAlignmentInBytes);
6882  }
6883  // Use r12 as a scratch register in which to temporarily load the narrow_klass_base.
6884  if (Universe::narrow_klass_base() != NULL) {
6885    mov64(r12_heapbase, (int64_t)Universe::narrow_klass_base());
6886    addq(r, r12_heapbase);
6887    reinit_heapbase();
6888  }
6889}
6890
6891void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
6892  // Note: it will change flags
6893  assert (UseCompressedClassPointers, "should only be used for compressed headers");
6894  if (dst == src) {
6895    decode_klass_not_null(dst);
6896  } else {
6897    // Cannot assert, unverified entry point counts instructions (see .ad file)
6898    // vtableStubs also counts instructions in pd_code_size_limit.
6899    // Also do not verify_oop as this is called by verify_oop.
6900    mov64(dst, (int64_t)Universe::narrow_klass_base());
6901    if (Universe::narrow_klass_shift() != 0) {
6902      assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
6903      assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?");
6904      leaq(dst, Address(dst, src, Address::times_8, 0));
6905    } else {
6906      addq(dst, src);
6907    }
6908  }
6909}
6910
6911void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
6912  assert (UseCompressedOops, "should only be used for compressed headers");
6913  assert (Universe::heap() != NULL, "java heap should be initialized");
6914  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
6915  int oop_index = oop_recorder()->find_index(obj);
6916  RelocationHolder rspec = oop_Relocation::spec(oop_index);
6917  mov_narrow_oop(dst, oop_index, rspec);
6918}
6919
6920void  MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
6921  assert (UseCompressedOops, "should only be used for compressed headers");
6922  assert (Universe::heap() != NULL, "java heap should be initialized");
6923  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
6924  int oop_index = oop_recorder()->find_index(obj);
6925  RelocationHolder rspec = oop_Relocation::spec(oop_index);
6926  mov_narrow_oop(dst, oop_index, rspec);
6927}
6928
6929void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
6930  assert (UseCompressedClassPointers, "should only be used for compressed headers");
6931  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
6932  int klass_index = oop_recorder()->find_index(k);
6933  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
6934  mov_narrow_oop(dst, Klass::encode_klass(k), rspec);
6935}
6936
6937void  MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
6938  assert (UseCompressedClassPointers, "should only be used for compressed headers");
6939  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
6940  int klass_index = oop_recorder()->find_index(k);
6941  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
6942  mov_narrow_oop(dst, Klass::encode_klass(k), rspec);
6943}
6944
6945void  MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
6946  assert (UseCompressedOops, "should only be used for compressed headers");
6947  assert (Universe::heap() != NULL, "java heap should be initialized");
6948  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
6949  int oop_index = oop_recorder()->find_index(obj);
6950  RelocationHolder rspec = oop_Relocation::spec(oop_index);
6951  Assembler::cmp_narrow_oop(dst, oop_index, rspec);
6952}
6953
6954void  MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
6955  assert (UseCompressedOops, "should only be used for compressed headers");
6956  assert (Universe::heap() != NULL, "java heap should be initialized");
6957  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
6958  int oop_index = oop_recorder()->find_index(obj);
6959  RelocationHolder rspec = oop_Relocation::spec(oop_index);
6960  Assembler::cmp_narrow_oop(dst, oop_index, rspec);
6961}
6962
6963void  MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
6964  assert (UseCompressedClassPointers, "should only be used for compressed headers");
6965  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
6966  int klass_index = oop_recorder()->find_index(k);
6967  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
6968  Assembler::cmp_narrow_oop(dst, Klass::encode_klass(k), rspec);
6969}
6970
6971void  MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
6972  assert (UseCompressedClassPointers, "should only be used for compressed headers");
6973  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
6974  int klass_index = oop_recorder()->find_index(k);
6975  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
6976  Assembler::cmp_narrow_oop(dst, Klass::encode_klass(k), rspec);
6977}
6978
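// Reload r12_heapbase with the current narrow oop/klass base (zero when the
// base is NULL, i.e. for unscaled or zero-based compressed oops).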
6979void MacroAssembler::reinit_heapbase() {
6980  if (UseCompressedOops || UseCompressedClassPointers) {
6981    if (Universe::heap() != NULL) {
6982      if (Universe::narrow_oop_base() == NULL) {
6983        MacroAssembler::xorptr(r12_heapbase, r12_heapbase);
6984      } else {
6985        mov64(r12_heapbase, (int64_t)Universe::narrow_ptrs_base());
6986      }
6987    } else {
6988      movptr(r12_heapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
6989    }
6990  }
6991}
6992
6993#endif // _LP64
6994
6995
6996// C2 compiled method's prolog code.
6997void MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b) {
6998
6999  // WARNING: Initial instruction MUST be 5 bytes or longer so that
7000  // NativeJump::patch_verified_entry will be able to patch out the entry
7001  // code safely. The push to verify stack depth is ok at 5 bytes,
7002  // the frame allocation can be either 3 or 6 bytes. So if we don't do
7003  // stack bang then we must use the 6 byte frame allocation even if
7004  // we have no frame. :-(
7005  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
7006
7007  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
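  // On entry 'framesize' includes the return address slot and the saved rbp
  // slot; both words are peeled off below so that only the remaining space is
  // allocated explicitly.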
7008  // Remove word for return addr
7009  framesize -= wordSize;
7010  stack_bang_size -= wordSize;
7011
7012  // Calls to C2R adapters often do not accept exceptional returns.
7013  // We require that their callers bang the stack for them.  But be careful, because
7014  // some VM calls (such as call site linkage) can use several kilobytes of
7015  // stack.  But the stack safety zone should account for that.
7016  // See bugs 4446381, 4468289, 4497237.
7017  if (stack_bang_size > 0) {
7018    generate_stack_overflow_check(stack_bang_size);
7019
7020    // We always push rbp so that, on return to the interpreter, rbp will be
7021    // restored correctly and we can correct the stack.
7022    push(rbp);
7023    // Save caller's stack pointer into RBP if the frame pointer is preserved.
7024    if (PreserveFramePointer) {
7025      mov(rbp, rsp);
7026    }
7027    // Remove word for ebp
7028    framesize -= wordSize;
7029
7030    // Create frame
7031    if (framesize) {
7032      subptr(rsp, framesize);
7033    }
7034  } else {
7035    // Create frame (force generation of a 4 byte immediate value)
7036    subptr_imm32(rsp, framesize);
7037
7038    // Save RBP register now.
7039    framesize -= wordSize;
7040    movptr(Address(rsp, framesize), rbp);
7041    // Save caller's stack pointer into RBP if the frame pointer is preserved.
7042    if (PreserveFramePointer) {
7043      movptr(rbp, rsp);
7044      if (framesize > 0) {
7045        addptr(rbp, framesize);
7046      }
7047    }
7048  }
7049
7050  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
7051    framesize -= wordSize;
7052    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
7053  }
7054
7055#ifndef _LP64
7056  // If method sets FPU control word do it now
7057  if (fp_mode_24b) {
7058    fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
7059  }
7060  if (UseSSE >= 2 && VerifyFPU) {
7061    verify_FPU(0, "FPU stack must be clean on entry");
7062  }
7063#endif
7064
7065#ifdef ASSERT
7066  if (VerifyStackAtCalls) {
7067    Label L;
7068    push(rax);
7069    mov(rax, rsp);
7070    andptr(rax, StackAlignmentInBytes-1);
7071    cmpptr(rax, StackAlignmentInBytes-wordSize);
7072    pop(rax);
7073    jcc(Assembler::equal, L);
7074    STOP("Stack is not properly aligned!");
7075    bind(L);
7076  }
7077#endif
7078
7079}
7080
7081void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, bool is_large) {
7082  // cnt - number of qwords (8-byte words).
7083  // base - start address, qword aligned.
7084  // is_large - true if the compiler knows cnt is larger than InitArrayShortSize
7085  assert(base==rdi, "base register must be edi for rep stos");
7086  assert(tmp==rax,   "tmp register must be eax for rep stos");
7087  assert(cnt==rcx,   "cnt register must be ecx for rep stos");
7088  assert(InitArrayShortSize % BytesPerLong == 0,
7089    "InitArrayShortSize should be the multiple of BytesPerLong");
7090
7091  Label DONE;
7092
7093  xorptr(tmp, tmp);
7094
7095  if (!is_large) {
7096    Label LOOP, LONG;
7097    cmpptr(cnt, InitArrayShortSize/BytesPerLong);
7098    jccb(Assembler::greater, LONG);
7099
7100    NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
7101
7102    decrement(cnt);
7103    jccb(Assembler::negative, DONE); // Zero length
7104
7105    // Use individual pointer-sized stores for small counts:
7106    BIND(LOOP);
7107    movptr(Address(base, cnt, Address::times_ptr), tmp);
7108    decrement(cnt);
7109    jccb(Assembler::greaterEqual, LOOP);
7110    jmpb(DONE);
7111
7112    BIND(LONG);
7113  }
7114
7115  // Use longer rep-prefixed ops for non-small counts:
7116  if (UseFastStosb) {
7117    shlptr(cnt, 3); // convert to number of bytes
7118    rep_stosb();
7119  } else {
7120    NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
7121    rep_stos();
7122  }
7123
7124  BIND(DONE);
7125}
7126
7127#ifdef COMPILER2
7128
7129// IndexOf for constant substrings with size >= 8 chars
7130// which don't need to be loaded through stack.
7131void MacroAssembler::string_indexofC8(Register str1, Register str2,
7132                                      Register cnt1, Register cnt2,
7133                                      int int_cnt2,  Register result,
7134                                      XMMRegister vec, Register tmp,
7135                                      int ae) {
7136  ShortBranchVerifier sbv(this);
7137  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
7138  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
7139
7140  // This method uses the pcmpestri instruction with bound registers
7141  //   inputs:
7142  //     xmm - substring
7143  //     rax - substring length (elements count)
7144  //     mem - scanned string
7145  //     rdx - string length (elements count)
7146  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
7147  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
7148  //   outputs:
7149  //     rcx - matched index in string
7150  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
7151  int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
7152  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
7153  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
7154  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
7155
7156  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
7157        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
7158        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
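  // Overall flow: scan the string in 16-byte windows with pcmpestri until the
  // head of the substring is found (FOUND_CANDIDATE); for substrings longer
  // than one vector, verify the remaining tail (SCAN_SUBSTR), restarting the
  // scan from the next element on a partial match (RELOAD_STR / RELOAD_SUBSTR).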
7159
7160  // Note, inline_string_indexOf() generates checks:
7161  // if (substr.count > string.count) return -1;
7162  // if (substr.count == 0) return 0;
7163  assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
7164
7165  // Load substring.
7166  if (ae == StrIntrinsicNode::UL) {
7167    pmovzxbw(vec, Address(str2, 0));
7168  } else {
7169    movdqu(vec, Address(str2, 0));
7170  }
7171  movl(cnt2, int_cnt2);
7172  movptr(result, str1); // string addr
7173
7174  if (int_cnt2 > stride) {
7175    jmpb(SCAN_TO_SUBSTR);
7176
7177    // Reload substr for rescan, this code
7178    // is executed only for large substrings (> 8 chars)
7179    bind(RELOAD_SUBSTR);
7180    if (ae == StrIntrinsicNode::UL) {
7181      pmovzxbw(vec, Address(str2, 0));
7182    } else {
7183      movdqu(vec, Address(str2, 0));
7184    }
7185    negptr(cnt2); // Jumped here with negative cnt2, convert to positive
7186
7187    bind(RELOAD_STR);
7188    // We came here after the beginning of the substring was
7189    // matched but the rest of it was not, so we need to search
7190    // again. Start from the next element after the previous match.
7191
7192    // cnt2 is the number of remaining substring elements and
7193    // cnt1 is the number of remaining string elements when the compare failed.
7194    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
7195    subl(cnt1, cnt2);
7196    addl(cnt1, int_cnt2);
7197    movl(cnt2, int_cnt2); // Now restore cnt2
7198
7199    decrementl(cnt1);     // Shift to next element
7200    cmpl(cnt1, cnt2);
7201    jcc(Assembler::negative, RET_NOT_FOUND);  // Fewer chars left than the substring
7202
7203    addptr(result, (1<<scale1));
7204
7205  } // (int_cnt2 > 8)
7206
7207  // Scan string for start of substr in 16-byte vectors
7208  bind(SCAN_TO_SUBSTR);
7209  pcmpestri(vec, Address(result, 0), mode);
7210  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
7211  subl(cnt1, stride);
7212  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
7213  cmpl(cnt1, cnt2);
7214  jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer chars left than the substring
7215  addptr(result, 16);
7216  jmpb(SCAN_TO_SUBSTR);
7217
7218  // Found a potential substr
7219  bind(FOUND_CANDIDATE);
7220  // Matched whole vector if first element matched (tmp(rcx) == 0).
7221  if (int_cnt2 == stride) {
7222    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
7223  } else { // int_cnt2 > 8
7224    jccb(Assembler::overflow, FOUND_SUBSTR);
7225  }
7226  // After pcmpestri tmp(rcx) contains matched element index
7227  // Compute start addr of substr
7228  lea(result, Address(result, tmp, scale1));
7229
7230  // Make sure string is still long enough
7231  subl(cnt1, tmp);
7232  cmpl(cnt1, cnt2);
7233  if (int_cnt2 == stride) {
7234    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
7235  } else { // int_cnt2 > 8
7236    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
7237  }
7238  // Fewer chars left than the substring.
7239
7240  bind(RET_NOT_FOUND);
7241  movl(result, -1);
7242  jmp(EXIT);
7243
7244  if (int_cnt2 > stride) {
7245    // This code is optimized for the case when whole substring
7246    // is matched if its head is matched.
7247    bind(MATCH_SUBSTR_HEAD);
7248    pcmpestri(vec, Address(result, 0), mode);
7249    // Reload only the string if it does not match
7250    jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
7251
7252    Label CONT_SCAN_SUBSTR;
7253    // Compare the rest of substring (> 8 chars).
7254    bind(FOUND_SUBSTR);
7255    // First 8 chars are already matched.
7256    negptr(cnt2);
7257    addptr(cnt2, stride);
7258
7259    bind(SCAN_SUBSTR);
7260    subl(cnt1, stride);
7261    cmpl(cnt2, -stride); // Do not read beyond substring
7262    jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
7263    // Back-up strings to avoid reading beyond substring:
7264    // cnt1 = cnt1 - cnt2 + 8
7265    addl(cnt1, cnt2); // cnt2 is negative
7266    addl(cnt1, stride);
7267    movl(cnt2, stride); negptr(cnt2);
7268    bind(CONT_SCAN_SUBSTR);
7269    if (int_cnt2 < (int)G) {
7270      int tail_off1 = int_cnt2<<scale1;
7271      int tail_off2 = int_cnt2<<scale2;
7272      if (ae == StrIntrinsicNode::UL) {
7273        pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
7274      } else {
7275        movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
7276      }
7277      pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
7278    } else {
7279      // calculate index in register to avoid integer overflow (int_cnt2*2)
7280      movl(tmp, int_cnt2);
7281      addptr(tmp, cnt2);
7282      if (ae == StrIntrinsicNode::UL) {
7283        pmovzxbw(vec, Address(str2, tmp, scale2, 0));
7284      } else {
7285        movdqu(vec, Address(str2, tmp, scale2, 0));
7286      }
7287      pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
7288    }
7289    // Need to reload the string pointers if the whole vector did not match
7290    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
7291    addptr(cnt2, stride);
7292    jcc(Assembler::negative, SCAN_SUBSTR);
7293    // Fall through if found full substring
7294
7295  } // (int_cnt2 > 8)
7296
7297  bind(RET_FOUND);
7298  // Found result if we matched full small substring.
7299  // Compute substr offset
7300  subptr(result, str1);
7301  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
7302    shrl(result, 1); // index
7303  }
7304  bind(EXIT);
7305
7306} // string_indexofC8
7307
7308// Small strings are loaded through the stack if they cross a page boundary.
7309void MacroAssembler::string_indexof(Register str1, Register str2,
7310                                    Register cnt1, Register cnt2,
7311                                    int int_cnt2,  Register result,
7312                                    XMMRegister vec, Register tmp,
7313                                    int ae) {
7314  ShortBranchVerifier sbv(this);
7315  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
7316  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
7317
7318  //
7319  // int_cnt2 is length of small (< 8 chars) constant substring
7320  // or (-1) for a non-constant substring, in which case its length
7321  // is in cnt2 register.
7322  //
7323  // Note, inline_string_indexOf() generates checks:
7324  // if (substr.count > string.count) return -1;
7325  // if (substr.count == 0) return 0;
7326  //
7327  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
7328  assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
7329  // This method uses the pcmpestri instruction with bound registers
7330  //   inputs:
7331  //     xmm - substring
7332  //     rax - substring length (elements count)
7333  //     mem - scanned string
7334  //     rdx - string length (elements count)
7335  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
7336  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
7337  //   outputs:
7338  //     rcx - matched index in string
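  //
  // For reference, the imm8 mode byte decomposes roughly as follows
  // (per the SSE4.2 pcmpestri description):
  //   bits [1:0] - element format: 00 = unsigned bytes, 01 = unsigned words
  //   bits [3:2] - aggregation:    11 = "equal ordered" (substring search)
  //   bits [5:4] - polarity:       00 = positive
  //   bit  [6]   - index select:   0  = least significant matching index
  // After the instruction, CF is set if any candidate match was found and
  // OF holds bit 0 of the result (a match starting at offset 0); this is
  // what the "CF == 1" and "OF == 0" branches below rely on.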
7339  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
7340  int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
7341  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
7342  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
7343
7344  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
7345        RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
7346        FOUND_CANDIDATE;
7347
7348  { //========================================================
7349    // We don't know where these strings are located
7350    // and we can't read beyond them. Load them through the stack.
7351    Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
7352
7353    movptr(tmp, rsp); // save old SP
7354
7355    if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
7356      if (int_cnt2 == (1>>scale2)) { // One byte
7357        assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
7358        load_unsigned_byte(result, Address(str2, 0));
7359        movdl(vec, result); // move 32 bits
7360      } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
7361        // Not enough header space in 32-bit VM: 12+3 = 15.
7362        movl(result, Address(str2, -1));
7363        shrl(result, 8);
7364        movdl(vec, result); // move 32 bits
7365      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
7366        load_unsigned_short(result, Address(str2, 0));
7367        movdl(vec, result); // move 32 bits
7368      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
7369        movdl(vec, Address(str2, 0)); // move 32 bits
7370      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
7371        movq(vec, Address(str2, 0));  // move 64 bits
7372      } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
7373        // Array header size is 12 bytes in 32-bit VM
7374        // + 6 bytes for 3 chars == 18 bytes,
7375        // enough space to load vec and shift.
7376        assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
7377        if (ae == StrIntrinsicNode::UL) {
7378          int tail_off = int_cnt2-8;
7379          pmovzxbw(vec, Address(str2, tail_off));
7380          psrldq(vec, -2*tail_off);
7381        }
7382        else {
7383          int tail_off = int_cnt2*(1<<scale2);
7384          movdqu(vec, Address(str2, tail_off-16));
7385          psrldq(vec, 16-tail_off);
7386        }
7387      }
7388    } else { // not constant substring
7389      cmpl(cnt2, stride);
7390      jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
7391
7392      // We can read beyond the string if str2+16 does not cross a page boundary
7393      // since heaps are aligned and mapped by pages.
7394      assert(os::vm_page_size() < (int)G, "default page should be small");
7395      movl(result, str2); // We need only low 32 bits
7396      andl(result, (os::vm_page_size()-1));
7397      cmpl(result, (os::vm_page_size()-16));
7398      jccb(Assembler::belowEqual, CHECK_STR);
7399
7400      // Move small strings to the stack to allow loading 16 bytes into vec.
7401      subptr(rsp, 16);
7402      int stk_offset = wordSize-(1<<scale2);
7403      push(cnt2);
7404
7405      bind(COPY_SUBSTR);
7406      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
7407        load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
7408        movb(Address(rsp, cnt2, scale2, stk_offset), result);
7409      } else if (ae == StrIntrinsicNode::UU) {
7410        load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
7411        movw(Address(rsp, cnt2, scale2, stk_offset), result);
7412      }
7413      decrement(cnt2);
7414      jccb(Assembler::notZero, COPY_SUBSTR);
7415
7416      pop(cnt2);
7417      movptr(str2, rsp);  // New substring address
7418    } // non constant
7419
7420    bind(CHECK_STR);
7421    cmpl(cnt1, stride);
7422    jccb(Assembler::aboveEqual, BIG_STRINGS);
7423
7424    // Check cross page boundary.
7425    movl(result, str1); // We need only low 32 bits
7426    andl(result, (os::vm_page_size()-1));
7427    cmpl(result, (os::vm_page_size()-16));
7428    jccb(Assembler::belowEqual, BIG_STRINGS);
7429
7430    subptr(rsp, 16);
7431    int stk_offset = -(1<<scale1);
7432    if (int_cnt2 < 0) { // not constant
7433      push(cnt2);
7434      stk_offset += wordSize;
7435    }
7436    movl(cnt2, cnt1);
7437
7438    bind(COPY_STR);
7439    if (ae == StrIntrinsicNode::LL) {
7440      load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
7441      movb(Address(rsp, cnt2, scale1, stk_offset), result);
7442    } else {
7443      load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
7444      movw(Address(rsp, cnt2, scale1, stk_offset), result);
7445    }
7446    decrement(cnt2);
7447    jccb(Assembler::notZero, COPY_STR);
7448
7449    if (int_cnt2 < 0) { // not constant
7450      pop(cnt2);
7451    }
7452    movptr(str1, rsp);  // New string address
7453
7454    bind(BIG_STRINGS);
7455    // Load substring.
7456    if (int_cnt2 < 0) { // -1
7457      if (ae == StrIntrinsicNode::UL) {
7458        pmovzxbw(vec, Address(str2, 0));
7459      } else {
7460        movdqu(vec, Address(str2, 0));
7461      }
7462      push(cnt2);       // substr count
7463      push(str2);       // substr addr
7464      push(str1);       // string addr
7465    } else {
7466      // Small (< 8 chars) constant substrings are loaded already.
7467      movl(cnt2, int_cnt2);
7468    }
7469    push(tmp);  // original SP
7470
7471  } // Finished loading
7472
7473  //========================================================
7474  // Start search
7475  //
7476
7477  movptr(result, str1); // string addr
7478
7479  if (int_cnt2  < 0) {  // Only for non constant substring
7480    jmpb(SCAN_TO_SUBSTR);
7481
7482    // SP saved at sp+0
7483    // String saved at sp+1*wordSize
7484    // Substr saved at sp+2*wordSize
7485    // Substr count saved at sp+3*wordSize
7486
7487    // Reload substr for rescan; this code
7488    // is executed only for large substrings (> 8 chars).
7489    bind(RELOAD_SUBSTR);
7490    movptr(str2, Address(rsp, 2*wordSize));
7491    movl(cnt2, Address(rsp, 3*wordSize));
7492    if (ae == StrIntrinsicNode::UL) {
7493      pmovzxbw(vec, Address(str2, 0));
7494    } else {
7495      movdqu(vec, Address(str2, 0));
7496    }
7497    // We came here after the beginning of the substring was
7498    // matched but the rest of it was not, so we need to search
7499    // again. Start from the next element after the previous match.
7500    subptr(str1, result); // Restore counter
7501    if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
7502      shrl(str1, 1);
7503    }
7504    addl(cnt1, str1);
7505    decrementl(cnt1);   // Shift to next element
7506    cmpl(cnt1, cnt2);
7507    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
7508
7509    addptr(result, (1<<scale1));
7510  } // non constant
7511
7512  // Scan string for start of substr in 16-byte vectors
7513  bind(SCAN_TO_SUBSTR);
7514  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
7515  pcmpestri(vec, Address(result, 0), mode);
7516  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
7517  subl(cnt1, stride);
7518  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
7519  cmpl(cnt1, cnt2);
7520  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
7521  addptr(result, 16);
7522
7523  bind(ADJUST_STR);
7524  cmpl(cnt1, stride); // Do not read beyond string
7525  jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
7526  // Back-up string to avoid reading beyond string.
7527  lea(result, Address(result, cnt1, scale1, -16));
7528  movl(cnt1, stride);
7529  jmpb(SCAN_TO_SUBSTR);
7530
7531  // Found a potential substr
7532  bind(FOUND_CANDIDATE);
7533  // After pcmpestri tmp(rcx) contains matched element index
7534
7535  // Make sure string is still long enough
7536  subl(cnt1, tmp);
7537  cmpl(cnt1, cnt2);
7538  jccb(Assembler::greaterEqual, FOUND_SUBSTR);
7539  // Left less than substring.
7540
7541  bind(RET_NOT_FOUND);
7542  movl(result, -1);
7543  jmpb(CLEANUP);
7544
7545  bind(FOUND_SUBSTR);
7546  // Compute start addr of substr
7547  lea(result, Address(result, tmp, scale1));
7548  if (int_cnt2 > 0) { // Constant substring
7549    // Repeat search for small substring (< 8 chars)
7550    // from new point without reloading substring.
7551    // Have to check that we don't read beyond string.
7552    cmpl(tmp, stride-int_cnt2);
7553    jccb(Assembler::greater, ADJUST_STR);
7554    // Fall through if matched whole substring.
7555  } else { // non constant
7556    assert(int_cnt2 == -1, "should be != 0");
7557
7558    addl(tmp, cnt2);
7559    // Found result if we matched whole substring.
7560    cmpl(tmp, stride);
7561    jccb(Assembler::lessEqual, RET_FOUND);
7562
7563    // Repeat search for small substring (<= 8 chars)
7564    // from new point 'str1' without reloading substring.
7565    cmpl(cnt2, stride);
7566    // Have to check that we don't read beyond string.
7567    jccb(Assembler::lessEqual, ADJUST_STR);
7568
7569    Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
7570    // Compare the rest of substring (> 8 chars).
7571    movptr(str1, result);
7572
7573    cmpl(tmp, cnt2);
7574    // First 8 chars are already matched.
7575    jccb(Assembler::equal, CHECK_NEXT);
7576
7577    bind(SCAN_SUBSTR);
7578    pcmpestri(vec, Address(str1, 0), mode);
7579    // Need to reload the string pointers if the whole vector did not match
7580    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
7581
7582    bind(CHECK_NEXT);
7583    subl(cnt2, stride);
7584    jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
7585    addptr(str1, 16);
7586    if (ae == StrIntrinsicNode::UL) {
7587      addptr(str2, 8);
7588    } else {
7589      addptr(str2, 16);
7590    }
7591    subl(cnt1, stride);
7592    cmpl(cnt2, stride); // Do not read beyond substring
7593    jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
7594    // Back-up strings to avoid reading beyond substring.
7595
7596    if (ae == StrIntrinsicNode::UL) {
7597      lea(str2, Address(str2, cnt2, scale2, -8));
7598      lea(str1, Address(str1, cnt2, scale1, -16));
7599    } else {
7600      lea(str2, Address(str2, cnt2, scale2, -16));
7601      lea(str1, Address(str1, cnt2, scale1, -16));
7602    }
7603    subl(cnt1, cnt2);
7604    movl(cnt2, stride);
7605    addl(cnt1, stride);
7606    bind(CONT_SCAN_SUBSTR);
7607    if (ae == StrIntrinsicNode::UL) {
7608      pmovzxbw(vec, Address(str2, 0));
7609    } else {
7610      movdqu(vec, Address(str2, 0));
7611    }
7612    jmp(SCAN_SUBSTR);
7613
7614    bind(RET_FOUND_LONG);
7615    movptr(str1, Address(rsp, wordSize));
7616  } // non constant
7617
7618  bind(RET_FOUND);
7619  // Compute substr offset
7620  subptr(result, str1);
7621  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
7622    shrl(result, 1); // index
7623  }
7624  bind(CLEANUP);
7625  pop(rsp); // restore SP
7626
7627} // string_indexof
7628
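// Find the first occurrence of a (UTF-16) char in a char sequence.
// A rough Java-level sketch of what the generated code computes
// (illustrative only, not the actual library source):
//
//   static int indexOfChar(char[] value, int length, int ch) {
//     for (int i = 0; i < length; i++) {
//       if (value[i] == ch) return i;
//     }
//     return -1;
//   }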
7629void MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
7630                                         XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
7631  ShortBranchVerifier sbv(this);
7632  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
7633
7634  int stride = 8;
7635
7636  Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
7637        SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
7638        RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
7639        FOUND_SEQ_CHAR, DONE_LABEL;
7640
7641  movptr(result, str1);
7642  if (UseAVX >= 2) {
7643    cmpl(cnt1, stride);
7644    jcc(Assembler::less, SCAN_TO_CHAR_LOOP);
7645    cmpl(cnt1, 2*stride);
7646    jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
7647    movdl(vec1, ch);
7648    vpbroadcastw(vec1, vec1);
7649    vpxor(vec2, vec2);
7650    movl(tmp, cnt1);
7651    andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
7652    andl(cnt1,0x0000000F);  //tail count (in chars)
7653
7654    bind(SCAN_TO_16_CHAR_LOOP);
7655    vmovdqu(vec3, Address(result, 0));
7656    vpcmpeqw(vec3, vec3, vec1, 1);
7657    vptest(vec2, vec3);
7658    jcc(Assembler::carryClear, FOUND_CHAR);
7659    addptr(result, 32);
7660    subl(tmp, 2*stride);
7661    jccb(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
7662    jmp(SCAN_TO_8_CHAR);
7663    bind(SCAN_TO_8_CHAR_INIT);
7664    movdl(vec1, ch);
7665    pshuflw(vec1, vec1, 0x00);
7666    pshufd(vec1, vec1, 0);
7667    pxor(vec2, vec2);
7668  }
7669  bind(SCAN_TO_8_CHAR);
7670  cmpl(cnt1, stride);
7671  if (UseAVX >= 2) {
7672    jcc(Assembler::less, SCAN_TO_CHAR);
7673  } else {
7674    jcc(Assembler::less, SCAN_TO_CHAR_LOOP);
7675    movdl(vec1, ch);
7676    pshuflw(vec1, vec1, 0x00);
7677    pshufd(vec1, vec1, 0);
7678    pxor(vec2, vec2);
7679  }
7680  movl(tmp, cnt1);
7681  andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
7682  andl(cnt1,0x00000007);  //tail count (in chars)
7683
7684  bind(SCAN_TO_8_CHAR_LOOP);
7685  movdqu(vec3, Address(result, 0));
7686  pcmpeqw(vec3, vec1);
7687  ptest(vec2, vec3);
7688  jcc(Assembler::carryClear, FOUND_CHAR);
7689  addptr(result, 16);
7690  subl(tmp, stride);
7691  jccb(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
7692  bind(SCAN_TO_CHAR);
7693  testl(cnt1, cnt1);
7694  jcc(Assembler::zero, RET_NOT_FOUND);
7695  bind(SCAN_TO_CHAR_LOOP);
7696  load_unsigned_short(tmp, Address(result, 0));
7697  cmpl(ch, tmp);
7698  jccb(Assembler::equal, FOUND_SEQ_CHAR);
7699  addptr(result, 2);
7700  subl(cnt1, 1);
7701  jccb(Assembler::zero, RET_NOT_FOUND);
7702  jmp(SCAN_TO_CHAR_LOOP);
7703
7704  bind(RET_NOT_FOUND);
7705  movl(result, -1);
7706  jmpb(DONE_LABEL);
7707
7708  bind(FOUND_CHAR);
7709  if (UseAVX >= 2) {
7710    vpmovmskb(tmp, vec3);
7711  } else {
7712    pmovmskb(tmp, vec3);
7713  }
7714  bsfl(ch, tmp);
7715  addl(result, ch);
7716
7717  bind(FOUND_SEQ_CHAR);
7718  subptr(result, str1);
7719  shrl(result, 1);
7720
7721  bind(DONE_LABEL);
7722} // string_indexof_char
7723
7724// helper function for string_compare
7725void MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
7726                                        Address::ScaleFactor scale, Address::ScaleFactor scale1,
7727                                        Address::ScaleFactor scale2, Register index, int ae) {
7728  if (ae == StrIntrinsicNode::LL) {
7729    load_unsigned_byte(elem1, Address(str1, index, scale, 0));
7730    load_unsigned_byte(elem2, Address(str2, index, scale, 0));
7731  } else if (ae == StrIntrinsicNode::UU) {
7732    load_unsigned_short(elem1, Address(str1, index, scale, 0));
7733    load_unsigned_short(elem2, Address(str2, index, scale, 0));
7734  } else {
7735    load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
7736    load_unsigned_short(elem2, Address(str2, index, scale2, 0));
7737  }
7738}
7739
7740// Compare strings, used for char[] and byte[].
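// A rough Java-level sketch of the comparison the generated code performs
// (a simplification; 'get' stands for an element read in the encoding
// selected by 'ae'):
//
//   static int compare(byte[] str1, int cnt1, byte[] str2, int cnt2) {
//     int min = Math.min(cnt1, cnt2);
//     for (int i = 0; i < min; i++) {
//       int diff = get(str1, i) - get(str2, i);
//       if (diff != 0) return diff;
//     }
//     return cnt1 - cnt2;
//   }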
7741void MacroAssembler::string_compare(Register str1, Register str2,
7742                                    Register cnt1, Register cnt2, Register result,
7743                                    XMMRegister vec1, int ae) {
7744  ShortBranchVerifier sbv(this);
7745  Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
7746  Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
7747  int stride, stride2, adr_stride, adr_stride1, adr_stride2;
7748  int stride2x2 = 0x40;
7749  Address::ScaleFactor scale = Address::no_scale;
7750  Address::ScaleFactor scale1 = Address::no_scale;
7751  Address::ScaleFactor scale2 = Address::no_scale;
7752
7753  if (ae != StrIntrinsicNode::LL) {
7754    stride2x2 = 0x20;
7755  }
7756
7757  if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
7758    shrl(cnt2, 1);
7759  }
7760  // Compute the minimum of the string lengths and the
7761  // difference of the string lengths (stack).
7762  // Use a conditional move to compute the minimum.
7763  movl(result, cnt1);
7764  subl(cnt1, cnt2);
7765  push(cnt1);
7766  cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
7767
7768  // Is the minimum length zero?
7769  testl(cnt2, cnt2);
7770  jcc(Assembler::zero, LENGTH_DIFF_LABEL);
7771  if (ae == StrIntrinsicNode::LL) {
7772    // Load first bytes
7773    load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
7774    load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
7775  } else if (ae == StrIntrinsicNode::UU) {
7776    // Load first characters
7777    load_unsigned_short(result, Address(str1, 0));
7778    load_unsigned_short(cnt1, Address(str2, 0));
7779  } else {
7780    load_unsigned_byte(result, Address(str1, 0));
7781    load_unsigned_short(cnt1, Address(str2, 0));
7782  }
7783  subl(result, cnt1);
7784  jcc(Assembler::notZero,  POP_LABEL);
7785
7786  if (ae == StrIntrinsicNode::UU) {
7787    // Divide length by 2 to get number of chars
7788    shrl(cnt2, 1);
7789  }
7790  cmpl(cnt2, 1);
7791  jcc(Assembler::equal, LENGTH_DIFF_LABEL);
7792
7793  // Check if the strings start at the same location and setup scale and stride
7794  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7795    cmpptr(str1, str2);
7796    jcc(Assembler::equal, LENGTH_DIFF_LABEL);
7797    if (ae == StrIntrinsicNode::LL) {
7798      scale = Address::times_1;
7799      stride = 16;
7800    } else {
7801      scale = Address::times_2;
7802      stride = 8;
7803    }
7804  } else {
7805    scale1 = Address::times_1;
7806    scale2 = Address::times_2;
7807    // scale not used
7808    stride = 8;
7809  }
7810
7811  if (UseAVX >= 2 && UseSSE42Intrinsics) {
7812    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
7813    Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
7814    Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
7815    Label COMPARE_TAIL_LONG;
7816    Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
7817
7818    int pcmpmask = 0x19;
7819    if (ae == StrIntrinsicNode::LL) {
7820      pcmpmask &= ~0x01;
7821    }
7822
7823    // Set up to compare 16-char (32-byte) vectors,
7824    // starting from the first character again because its address is aligned.
7825    if (ae == StrIntrinsicNode::LL) {
7826      stride2 = 32;
7827    } else {
7828      stride2 = 16;
7829    }
7830    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7831      adr_stride = stride << scale;
7832    } else {
7833      adr_stride1 = 8;  //stride << scale1;
7834      adr_stride2 = 16; //stride << scale2;
7835    }
7836
7837    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
7838    // rax and rdx are used by pcmpestri as element counters
7839    movl(result, cnt2);
7840    andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
7841    jcc(Assembler::zero, COMPARE_TAIL_LONG);
7842
7843    // fast path : compare first 2 8-char vectors.
7844    bind(COMPARE_16_CHARS);
7845    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7846      movdqu(vec1, Address(str1, 0));
7847    } else {
7848      pmovzxbw(vec1, Address(str1, 0));
7849    }
7850    pcmpestri(vec1, Address(str2, 0), pcmpmask);
7851    jccb(Assembler::below, COMPARE_INDEX_CHAR);
7852
7853    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7854      movdqu(vec1, Address(str1, adr_stride));
7855      pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
7856    } else {
7857      pmovzxbw(vec1, Address(str1, adr_stride1));
7858      pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
7859    }
7860    jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
7861    addl(cnt1, stride);
7862
7863    // Compare the characters at index in cnt1
7864    bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
7865    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
7866    subl(result, cnt2);
7867    jmp(POP_LABEL);
7868
7869    // Setup the registers to start vector comparison loop
7870    bind(COMPARE_WIDE_VECTORS);
7871    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7872      lea(str1, Address(str1, result, scale));
7873      lea(str2, Address(str2, result, scale));
7874    } else {
7875      lea(str1, Address(str1, result, scale1));
7876      lea(str2, Address(str2, result, scale2));
7877    }
7878    subl(result, stride2);
7879    subl(cnt2, stride2);
7880    jcc(Assembler::zero, COMPARE_WIDE_TAIL);
7881    negptr(result);
7882
7883    //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
7884    bind(COMPARE_WIDE_VECTORS_LOOP);
7885
7886#ifdef _LP64
7887    if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
7888      cmpl(cnt2, stride2x2);
7889      jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
7890      testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
7891      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
7892
7893      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
7894      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7895        evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
7896        evpcmpeqb(k7, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
7897      } else {
7898        vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
7899        evpcmpeqb(k7, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
7900      }
7901      kortestql(k7, k7);
7902      jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
7903      addptr(result, stride2x2);  // update since we already compared at this addr
7904      subl(cnt2, stride2x2);      // and sub the size too
7905      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
7906
7907      vpxor(vec1, vec1);
7908      jmpb(COMPARE_WIDE_TAIL);
7909    }//if (VM_Version::supports_avx512vlbw())
7910#endif // _LP64
7911
7912
7913    bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
7914    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7915      vmovdqu(vec1, Address(str1, result, scale));
7916      vpxor(vec1, Address(str2, result, scale));
7917    } else {
7918      vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
7919      vpxor(vec1, Address(str2, result, scale2));
7920    }
7921    vptest(vec1, vec1);
7922    jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
7923    addptr(result, stride2);
7924    subl(cnt2, stride2);
7925    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
7926    // clean upper bits of YMM registers
7927    vpxor(vec1, vec1);
7928
7929    // compare wide vectors tail
7930    bind(COMPARE_WIDE_TAIL);
7931    testptr(result, result);
7932    jcc(Assembler::zero, LENGTH_DIFF_LABEL);
7933
7934    movl(result, stride2);
7935    movl(cnt2, result);
7936    negptr(result);
7937    jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
7938
7939    // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
7940    bind(VECTOR_NOT_EQUAL);
7941    // clean upper bits of YMM registers
7942    vpxor(vec1, vec1);
7943    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7944      lea(str1, Address(str1, result, scale));
7945      lea(str2, Address(str2, result, scale));
7946    } else {
7947      lea(str1, Address(str1, result, scale1));
7948      lea(str2, Address(str2, result, scale2));
7949    }
7950    jmp(COMPARE_16_CHARS);
7951
7952    // Compare tail chars, length between 1 and 15 chars
7953    bind(COMPARE_TAIL_LONG);
7954    movl(cnt2, result);
7955    cmpl(cnt2, stride);
7956    jcc(Assembler::less, COMPARE_SMALL_STR);
7957
7958    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7959      movdqu(vec1, Address(str1, 0));
7960    } else {
7961      pmovzxbw(vec1, Address(str1, 0));
7962    }
7963    pcmpestri(vec1, Address(str2, 0), pcmpmask);
7964    jcc(Assembler::below, COMPARE_INDEX_CHAR);
7965    subptr(cnt2, stride);
7966    jcc(Assembler::zero, LENGTH_DIFF_LABEL);
7967    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7968      lea(str1, Address(str1, result, scale));
7969      lea(str2, Address(str2, result, scale));
7970    } else {
7971      lea(str1, Address(str1, result, scale1));
7972      lea(str2, Address(str2, result, scale2));
7973    }
7974    negptr(cnt2);
7975    jmpb(WHILE_HEAD_LABEL);
7976
7977    bind(COMPARE_SMALL_STR);
7978  } else if (UseSSE42Intrinsics) {
7979    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
7980    int pcmpmask = 0x19;
7981    // Set up to compare 8-char (16-byte) vectors,
7982    // starting from the first character again because its address is aligned.
7983    movl(result, cnt2);
7984    andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
7985    if (ae == StrIntrinsicNode::LL) {
7986      pcmpmask &= ~0x01;
7987    }
7988    jcc(Assembler::zero, COMPARE_TAIL);
7989    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7990      lea(str1, Address(str1, result, scale));
7991      lea(str2, Address(str2, result, scale));
7992    } else {
7993      lea(str1, Address(str1, result, scale1));
7994      lea(str2, Address(str2, result, scale2));
7995    }
7996    negptr(result);
7997
7998    // pcmpestri
7999    //   inputs:
8000    //     vec1- substring
8001    //     rax - negative string length (elements count)
8002    //     mem - scanned string
8003    //     rdx - string length (elements count)
8004    //     pcmpmask - cmp mode: 11000 (string compare with negated result)
8005    //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
8006    //   outputs:
8007    //     rcx - first mismatched element index
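    // For reference, 0x19 = 0b011001 decomposes roughly as: bits [1:0] = 01
    // (unsigned words), bits [3:2] = 10 ("equal each" comparison), bits
    // [5:4] = 01 (negated result); clearing bit 0 for LL strings switches
    // the element format to unsigned bytes (0x18).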
8008    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
8009
8010    bind(COMPARE_WIDE_VECTORS);
8011    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
8012      movdqu(vec1, Address(str1, result, scale));
8013      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
8014    } else {
8015      pmovzxbw(vec1, Address(str1, result, scale1));
8016      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
8017    }
8018    // After pcmpestri cnt1(rcx) contains mismatched element index
8019
8020    jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
8021    addptr(result, stride);
8022    subptr(cnt2, stride);
8023    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
8024
8025    // compare wide vectors tail
8026    testptr(result, result);
8027    jcc(Assembler::zero, LENGTH_DIFF_LABEL);
8028
8029    movl(cnt2, stride);
8030    movl(result, stride);
8031    negptr(result);
8032    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
8033      movdqu(vec1, Address(str1, result, scale));
8034      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
8035    } else {
8036      pmovzxbw(vec1, Address(str1, result, scale1));
8037      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
8038    }
8039    jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
8040
8041    // Mismatched characters in the vectors
8042    bind(VECTOR_NOT_EQUAL);
8043    addptr(cnt1, result);
8044    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
8045    subl(result, cnt2);
8046    jmpb(POP_LABEL);
8047
8048    bind(COMPARE_TAIL); // limit is zero
8049    movl(cnt2, result);
8050    // Fallthru to tail compare
8051  }
8052  // Shift str2 and str1 to the end of the arrays, negate min
8053  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
8054    lea(str1, Address(str1, cnt2, scale));
8055    lea(str2, Address(str2, cnt2, scale));
8056  } else {
8057    lea(str1, Address(str1, cnt2, scale1));
8058    lea(str2, Address(str2, cnt2, scale2));
8059  }
8060  decrementl(cnt2);  // first character was compared already
8061  negptr(cnt2);
8062
8063  // Compare the rest of the elements
8064  bind(WHILE_HEAD_LABEL);
8065  load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
8066  subl(result, cnt1);
8067  jccb(Assembler::notZero, POP_LABEL);
8068  increment(cnt2);
8069  jccb(Assembler::notZero, WHILE_HEAD_LABEL);
8070
8071  // Strings are equal up to min length.  Return the length difference.
8072  bind(LENGTH_DIFF_LABEL);
8073  pop(result);
8074  if (ae == StrIntrinsicNode::UU) {
8075    // Divide diff by 2 to get number of chars
8076    sarl(result, 1);
8077  }
8078  jmpb(DONE_LABEL);
8079
8080#ifdef _LP64
8081  if (VM_Version::supports_avx512vlbw()) {
8082
8083    bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
8084
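    // k7 has a 1 bit for every byte position that compared equal, so
    // inverting it and scanning for the lowest set bit (bsf) yields the
    // index of the first mismatching byte within the 64-byte group.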
8085    kmovql(cnt1, k7);
8086    notq(cnt1);
8087    bsfq(cnt2, cnt1);
8088    if (ae != StrIntrinsicNode::LL) {
8089      // Divide diff by 2 to get number of chars
8090      sarl(cnt2, 1);
8091    }
8092    addq(result, cnt2);
8093    if (ae == StrIntrinsicNode::LL) {
8094      load_unsigned_byte(cnt1, Address(str2, result));
8095      load_unsigned_byte(result, Address(str1, result));
8096    } else if (ae == StrIntrinsicNode::UU) {
8097      load_unsigned_short(cnt1, Address(str2, result, scale));
8098      load_unsigned_short(result, Address(str1, result, scale));
8099    } else {
8100      load_unsigned_short(cnt1, Address(str2, result, scale2));
8101      load_unsigned_byte(result, Address(str1, result, scale1));
8102    }
8103    subl(result, cnt1);
8104    jmpb(POP_LABEL);
8105  }//if (VM_Version::supports_avx512vlbw())
8106#endif // _LP64
8107
8108  // Discard the stored length difference
8109  bind(POP_LABEL);
8110  pop(cnt1);
8111
8112  // That's it
8113  bind(DONE_LABEL);
8114  if(ae == StrIntrinsicNode::UL) {
8115    negl(result);
8116  }
8117
8118}
8119
8120// Search for a non-ASCII character (negative byte value) in a byte array;
8121// return true if any is found and false otherwise.
8122//   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
8123//   @HotSpotIntrinsicCandidate
8124//   private static boolean hasNegatives(byte[] ba, int off, int len) {
8125//     for (int i = off; i < off + len; i++) {
8126//       if (ba[i] < 0) {
8127//         return true;
8128//       }
8129//     }
8130//     return false;
8131//   }
8132void MacroAssembler::has_negatives(Register ary1, Register len,
8133  Register result, Register tmp1,
8134  XMMRegister vec1, XMMRegister vec2) {
8135  // rsi: byte array
8136  // rcx: len
8137  // rax: result
8138  ShortBranchVerifier sbv(this);
8139  assert_different_registers(ary1, len, result, tmp1);
8140  assert_different_registers(vec1, vec2);
8141  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
8142
8143  // len == 0
8144  testl(len, len);
8145  jcc(Assembler::zero, FALSE_LABEL);
8146
8147  if ((UseAVX > 2) && // AVX512
8148    VM_Version::supports_avx512vlbw() &&
8149    VM_Version::supports_bmi2()) {
8150
8151    set_vector_masking();  // opening of the stub context for programming mask registers
8152
8153    Label test_64_loop, test_tail;
8154    Register tmp3_aliased = len;
8155
8156    movl(tmp1, len);
8157    vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
8158
8159    andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
8160    andl(len, ~(64 - 1));    // vector count (in chars)
8161    jccb(Assembler::zero, test_tail);
8162
8163    lea(ary1, Address(ary1, len, Address::times_1));
8164    negptr(len);
8165
8166    bind(test_64_loop);
8167    // Check whether our 64 elements of size byte contain negatives
8168    evpcmpgtb(k2, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
8169    kortestql(k2, k2);
8170    jcc(Assembler::notZero, TRUE_LABEL);
8171
8172    addptr(len, 64);
8173    jccb(Assembler::notZero, test_64_loop);
8174
8175
8176    bind(test_tail);
8177    // bail out when there is nothing to be done
8178    testl(tmp1, -1);
8179    jcc(Assembler::zero, FALSE_LABEL);
8180
8181    // Save k1
8182    kmovql(k3, k1);
8183
8184    // ~(~0 << len) applied up to two times (for 32-bit scenario)
8185#ifdef _LP64
8186    mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
8187    shlxq(tmp3_aliased, tmp3_aliased, tmp1);
8188    notq(tmp3_aliased);
8189    kmovql(k1, tmp3_aliased);
8190#else
8191    Label k_init;
8192    jmp(k_init);
8193
8194    // We cannot read 64 bits from a general purpose register, so we move the
8195    // data required to compose 64 1's into the instruction stream.
8196    // We emit a 64-byte wide series of elements 0..63 which is later used as
8197    // compare targets against the tail count contained in the tmp1 register.
8198    // The result is a k1 register holding tmp1 consecutive 1's, counting from
8199    // the least significant bit (e.g., tmp1 == 3 yields k1 == 0b111).
8200    address tmp = pc();
8201    emit_int64(0x0706050403020100);
8202    emit_int64(0x0F0E0D0C0B0A0908);
8203    emit_int64(0x1716151413121110);
8204    emit_int64(0x1F1E1D1C1B1A1918);
8205    emit_int64(0x2726252423222120);
8206    emit_int64(0x2F2E2D2C2B2A2928);
8207    emit_int64(0x3736353433323130);
8208    emit_int64(0x3F3E3D3C3B3A3938);
8209
8210    bind(k_init);
8211    lea(len, InternalAddress(tmp));
8212    // create mask to test for negative byte inside a vector
8213    evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
8214    evpcmpgtb(k1, vec1, Address(len, 0), Assembler::AVX_512bit);
8215
8216#endif
8217    evpcmpgtb(k2, k1, vec2, Address(ary1, 0), Assembler::AVX_512bit);
8218    ktestq(k2, k1);
8219    // Restore k1
8220    kmovql(k1, k3);
8221    jcc(Assembler::notZero, TRUE_LABEL);
8222
8223    jmp(FALSE_LABEL);
8224
8225    clear_vector_masking();   // closing of the stub context for programming mask registers
8226  } else {
8227    movl(result, len); // copy
8228
8229    if (UseAVX == 2 && UseSSE >= 2) {
8230      // With AVX2, use 32-byte vector compare
8231      Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
8232
8233      // Compare 32-byte vectors
8234      andl(result, 0x0000001f);  //   tail count (in bytes)
8235      andl(len, 0xffffffe0);   // vector count (in bytes)
8236      jccb(Assembler::zero, COMPARE_TAIL);
8237
8238      lea(ary1, Address(ary1, len, Address::times_1));
8239      negptr(len);
8240
8241      movl(tmp1, 0x80808080);   // create mask to test for negative bytes in vector
8242      movdl(vec2, tmp1);
8243      vpbroadcastd(vec2, vec2);
8244
8245      bind(COMPARE_WIDE_VECTORS);
8246      vmovdqu(vec1, Address(ary1, len, Address::times_1));
8247      vptest(vec1, vec2);
8248      jccb(Assembler::notZero, TRUE_LABEL);
8249      addptr(len, 32);
8250      jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
8251
8252      testl(result, result);
8253      jccb(Assembler::zero, FALSE_LABEL);
8254
8255      vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
8256      vptest(vec1, vec2);
8257      jccb(Assembler::notZero, TRUE_LABEL);
8258      jmpb(FALSE_LABEL);
8259
8260      bind(COMPARE_TAIL); // len is zero
8261      movl(len, result);
8262      // Fallthru to tail compare
8263    } else if (UseSSE42Intrinsics) {
8264      // With SSE4.2, use double quad vector compare
8265      Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
8266
8267      // Compare 16-byte vectors
8268      andl(result, 0x0000000f);  //   tail count (in bytes)
8269      andl(len, 0xfffffff0);   // vector count (in bytes)
8270      jccb(Assembler::zero, COMPARE_TAIL);
8271
8272      lea(ary1, Address(ary1, len, Address::times_1));
8273      negptr(len);
8274
8275      movl(tmp1, 0x80808080);
8276      movdl(vec2, tmp1);
8277      pshufd(vec2, vec2, 0);
8278
8279      bind(COMPARE_WIDE_VECTORS);
8280      movdqu(vec1, Address(ary1, len, Address::times_1));
8281      ptest(vec1, vec2);
8282      jccb(Assembler::notZero, TRUE_LABEL);
8283      addptr(len, 16);
8284      jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
8285
8286      testl(result, result);
8287      jccb(Assembler::zero, FALSE_LABEL);
8288
8289      movdqu(vec1, Address(ary1, result, Address::times_1, -16));
8290      ptest(vec1, vec2);
8291      jccb(Assembler::notZero, TRUE_LABEL);
8292      jmpb(FALSE_LABEL);
8293
8294      bind(COMPARE_TAIL); // len is zero
8295      movl(len, result);
8296      // Fallthru to tail compare
8297    }
8298  }
8299  // Compare 4-byte vectors
8300  andl(len, 0xfffffffc); // vector count (in bytes)
8301  jccb(Assembler::zero, COMPARE_CHAR);
8302
8303  lea(ary1, Address(ary1, len, Address::times_1));
8304  negptr(len);
8305
8306  bind(COMPARE_VECTORS);
8307  movl(tmp1, Address(ary1, len, Address::times_1));
8308  andl(tmp1, 0x80808080);
8309  jccb(Assembler::notZero, TRUE_LABEL);
8310  addptr(len, 4);
8311  jcc(Assembler::notZero, COMPARE_VECTORS);
8312
8313  // Compare trailing char (final 2 bytes), if any
8314  bind(COMPARE_CHAR);
8315  testl(result, 0x2);   // tail  char
8316  jccb(Assembler::zero, COMPARE_BYTE);
8317  load_unsigned_short(tmp1, Address(ary1, 0));
8318  andl(tmp1, 0x00008080);
8319  jccb(Assembler::notZero, TRUE_LABEL);
8320  subptr(result, 2);
8321  lea(ary1, Address(ary1, 2));
8322
8323  bind(COMPARE_BYTE);
8324  testl(result, 0x1);   // tail  byte
8325  jccb(Assembler::zero, FALSE_LABEL);
8326  load_unsigned_byte(tmp1, Address(ary1, 0));
8327  andl(tmp1, 0x00000080);
8328  jccb(Assembler::notEqual, TRUE_LABEL);
8329  jmpb(FALSE_LABEL);
8330
8331  bind(TRUE_LABEL);
8332  movl(result, 1);   // return true
8333  jmpb(DONE);
8334
8335  bind(FALSE_LABEL);
8336  xorl(result, result); // return false
8337
8338  // That's it
8339  bind(DONE);
8340  if (UseAVX >= 2 && UseSSE >= 2) {
8341    // clean upper bits of YMM registers
8342    vpxor(vec1, vec1);
8343    vpxor(vec2, vec2);
8344  }
8345}
8346// Compare char[] or byte[] arrays (aligned to 4 bytes) or substrings.
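// A rough Java-level sketch of what the generated code computes for the
// array case (the substring variant skips the null and length checks):
//
//   static boolean equals(byte[] a, byte[] b) {
//     if (a == b) return true;
//     if (a == null || b == null) return false;
//     if (a.length != b.length) return false;
//     for (int i = 0; i < a.length; i++) {
//       if (a[i] != b[i]) return false;
//     }
//     return true;
//   }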
8347void MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
8348                                   Register limit, Register result, Register chr,
8349                                   XMMRegister vec1, XMMRegister vec2, bool is_char) {
8350  ShortBranchVerifier sbv(this);
8351  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
8352
8353  int length_offset  = arrayOopDesc::length_offset_in_bytes();
8354  int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
8355
8356  if (is_array_equ) {
8357    // Check the input args
8358    cmpptr(ary1, ary2);
8359    jcc(Assembler::equal, TRUE_LABEL);
8360
8361    // Need additional checks for arrays_equals.
8362    testptr(ary1, ary1);
8363    jcc(Assembler::zero, FALSE_LABEL);
8364    testptr(ary2, ary2);
8365    jcc(Assembler::zero, FALSE_LABEL);
8366
8367    // Check the lengths
8368    movl(limit, Address(ary1, length_offset));
8369    cmpl(limit, Address(ary2, length_offset));
8370    jcc(Assembler::notEqual, FALSE_LABEL);
8371  }
8372
8373  // count == 0
8374  testl(limit, limit);
8375  jcc(Assembler::zero, TRUE_LABEL);
8376
8377  if (is_array_equ) {
8378    // Load array address
8379    lea(ary1, Address(ary1, base_offset));
8380    lea(ary2, Address(ary2, base_offset));
8381  }
8382
8383  if (is_array_equ && is_char) {
8384    // arrays_equals when used for char[].
8385    shll(limit, 1);      // byte count != 0
8386  }
8387  movl(result, limit); // copy
8388
8389  if (UseAVX >= 2) {
8390    // With AVX2, use 32-byte vector compare
8391    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
8392
8393    // Compare 32-byte vectors
8394    andl(result, 0x0000001f);  //   tail count (in bytes)
8395    andl(limit, 0xffffffe0);   // vector count (in bytes)
8396    jcc(Assembler::zero, COMPARE_TAIL);
8397
8398    lea(ary1, Address(ary1, limit, Address::times_1));
8399    lea(ary2, Address(ary2, limit, Address::times_1));
8400    negptr(limit);
8401
8402    bind(COMPARE_WIDE_VECTORS);
8403
8404#ifdef _LP64
8405    if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
8406      Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
8407
8408      cmpl(limit, -64);
8409      jccb(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
8410
8411      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
8412
8413      evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
8414      evpcmpeqb(k7, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
8415      kortestql(k7, k7);
8416      jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
8417      addptr(limit, 64);  // update since we already compared at this addr
8418      cmpl(limit, -64);
8419      jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
8420
8421      // At this point we may still need to compare -limit+result bytes.
8422      // We could execute the next two instructions and just continue via non-wide path:
8423      //  cmpl(limit, 0);
8424      //  jcc(Assembler::equal, COMPARE_TAIL);  // true
8425      // But since we stopped at the points ary{1,2}+limit which are
8426      // not farther than 64 bytes from the ends of arrays ary{1,2}+result
8427      // (|limit| <= 32 and result < 32),
8428      // we may just compare the last 64 bytes.
8429      //
8430      addptr(result, -64);   // it is safe because we just came from this area
8431      evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
8432      evpcmpeqb(k7, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
8433      kortestql(k7, k7);
8434      jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
8435
8436      jmp(TRUE_LABEL);
8437
8438      bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
8439
8440    }//if (VM_Version::supports_avx512vlbw())
8441#endif //_LP64
8442
8443    vmovdqu(vec1, Address(ary1, limit, Address::times_1));
8444    vmovdqu(vec2, Address(ary2, limit, Address::times_1));
8445    vpxor(vec1, vec2);
8446
8447    vptest(vec1, vec1);
8448    jcc(Assembler::notZero, FALSE_LABEL);
8449    addptr(limit, 32);
8450    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
8451
8452    testl(result, result);
8453    jcc(Assembler::zero, TRUE_LABEL);
8454
8455    vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
8456    vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
8457    vpxor(vec1, vec2);
8458
8459    vptest(vec1, vec1);
8460    jccb(Assembler::notZero, FALSE_LABEL);
8461    jmpb(TRUE_LABEL);
8462
8463    bind(COMPARE_TAIL); // limit is zero
8464    movl(limit, result);
8465    // Fallthru to tail compare
8466  } else if (UseSSE42Intrinsics) {
8467    // With SSE4.2, use double quad vector compare
8468    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
8469
8470    // Compare 16-byte vectors
8471    andl(result, 0x0000000f);  //   tail count (in bytes)
8472    andl(limit, 0xfffffff0);   // vector count (in bytes)
8473    jcc(Assembler::zero, COMPARE_TAIL);
8474
8475    lea(ary1, Address(ary1, limit, Address::times_1));
8476    lea(ary2, Address(ary2, limit, Address::times_1));
8477    negptr(limit);
8478
8479    bind(COMPARE_WIDE_VECTORS);
8480    movdqu(vec1, Address(ary1, limit, Address::times_1));
8481    movdqu(vec2, Address(ary2, limit, Address::times_1));
8482    pxor(vec1, vec2);
8483
8484    ptest(vec1, vec1);
8485    jcc(Assembler::notZero, FALSE_LABEL);
8486    addptr(limit, 16);
8487    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
8488
8489    testl(result, result);
8490    jcc(Assembler::zero, TRUE_LABEL);
8491
8492    movdqu(vec1, Address(ary1, result, Address::times_1, -16));
8493    movdqu(vec2, Address(ary2, result, Address::times_1, -16));
8494    pxor(vec1, vec2);
8495
8496    ptest(vec1, vec1);
8497    jccb(Assembler::notZero, FALSE_LABEL);
8498    jmpb(TRUE_LABEL);
8499
8500    bind(COMPARE_TAIL); // limit is zero
8501    movl(limit, result);
8502    // Fallthru to tail compare
8503  }
8504
8505  // Compare 4-byte vectors
8506  andl(limit, 0xfffffffc); // vector count (in bytes)
8507  jccb(Assembler::zero, COMPARE_CHAR);
8508
8509  lea(ary1, Address(ary1, limit, Address::times_1));
8510  lea(ary2, Address(ary2, limit, Address::times_1));
8511  negptr(limit);
8512
8513  bind(COMPARE_VECTORS);
8514  movl(chr, Address(ary1, limit, Address::times_1));
8515  cmpl(chr, Address(ary2, limit, Address::times_1));
8516  jccb(Assembler::notEqual, FALSE_LABEL);
8517  addptr(limit, 4);
8518  jcc(Assembler::notZero, COMPARE_VECTORS);
8519
8520  // Compare trailing char (final 2 bytes), if any
8521  bind(COMPARE_CHAR);
8522  testl(result, 0x2);   // tail  char
8523  jccb(Assembler::zero, COMPARE_BYTE);
8524  load_unsigned_short(chr, Address(ary1, 0));
8525  load_unsigned_short(limit, Address(ary2, 0));
8526  cmpl(chr, limit);
8527  jccb(Assembler::notEqual, FALSE_LABEL);
8528
8529  if (is_array_equ && is_char) {
8530    bind(COMPARE_BYTE);
8531  } else {
8532    lea(ary1, Address(ary1, 2));
8533    lea(ary2, Address(ary2, 2));
8534
8535    bind(COMPARE_BYTE);
8536    testl(result, 0x1);   // tail  byte
8537    jccb(Assembler::zero, TRUE_LABEL);
8538    load_unsigned_byte(chr, Address(ary1, 0));
8539    load_unsigned_byte(limit, Address(ary2, 0));
8540    cmpl(chr, limit);
8541    jccb(Assembler::notEqual, FALSE_LABEL);
8542  }
8543  bind(TRUE_LABEL);
8544  movl(result, 1);   // return true
8545  jmpb(DONE);
8546
8547  bind(FALSE_LABEL);
8548  xorl(result, result); // return false
8549
8550  // That's it
8551  bind(DONE);
8552  if (UseAVX >= 2) {
8553    // clean upper bits of YMM registers
8554    vpxor(vec1, vec1);
8555    vpxor(vec2, vec2);
8556  }
8557}
8558
8559#endif
8560
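// Fill an array of bytes, shorts or ints with a given value; roughly the
// vectorized analogue of the following sketch (the generated code also
// handles alignment and replicates 'value' across wider vector lanes):
//
//   static void fill(int[] to, int value, int count) {
//     for (int i = 0; i < count; i++) {
//       to[i] = value;
//     }
//   }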
8561void MacroAssembler::generate_fill(BasicType t, bool aligned,
8562                                   Register to, Register value, Register count,
8563                                   Register rtmp, XMMRegister xtmp) {
8564  ShortBranchVerifier sbv(this);
8565  assert_different_registers(to, value, count, rtmp);
8566  Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
8567  Label L_fill_2_bytes, L_fill_4_bytes;
8568
8569  int shift = -1;
8570  switch (t) {
8571    case T_BYTE:
8572      shift = 2;
8573      break;
8574    case T_SHORT:
8575      shift = 1;
8576      break;
8577    case T_INT:
8578      shift = 0;
8579      break;
8580    default: ShouldNotReachHere();
8581  }
8582
8583  if (t == T_BYTE) {
8584    andl(value, 0xff);
8585    movl(rtmp, value);
8586    shll(rtmp, 8);
8587    orl(value, rtmp);
8588  }
8589  if (t == T_SHORT) {
8590    andl(value, 0xffff);
8591  }
8592  if (t == T_BYTE || t == T_SHORT) {
8593    movl(rtmp, value);
8594    shll(rtmp, 16);
8595    orl(value, rtmp);
8596  }
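  // At this point 'value' holds the fill pattern replicated to 32 bits,
  // e.g. a T_BYTE value of 0xAB becomes 0xABABABAB and a T_SHORT value
  // of 0x12CD becomes 0x12CD12CD.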
8597
8598  cmpl(count, 2<<shift); // Short arrays (< 8 bytes) are filled by element
8599  jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
8600  if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
8601    // align the destination address to a 4-byte boundary
8602    if (t == T_BYTE) {
8603      // One byte misalignment happens only for byte arrays
8604      testptr(to, 1);
8605      jccb(Assembler::zero, L_skip_align1);
8606      movb(Address(to, 0), value);
8607      increment(to);
8608      decrement(count);
8609      BIND(L_skip_align1);
8610    }
8611    // Two bytes misalignment happens only for byte and short (char) arrays
8612    testptr(to, 2);
8613    jccb(Assembler::zero, L_skip_align2);
8614    movw(Address(to, 0), value);
8615    addptr(to, 2);
8616    subl(count, 1<<(shift-1));
8617    BIND(L_skip_align2);
8618  }
8619  if (UseSSE < 2) {
8620    Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
8621    // Fill 32-byte chunks
8622    subl(count, 8 << shift);
8623    jcc(Assembler::less, L_check_fill_8_bytes);
8624    align(16);
8625
8626    BIND(L_fill_32_bytes_loop);
8627
8628    for (int i = 0; i < 32; i += 4) {
8629      movl(Address(to, i), value);
8630    }
8631
8632    addptr(to, 32);
8633    subl(count, 8 << shift);
8634    jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
8635    BIND(L_check_fill_8_bytes);
8636    addl(count, 8 << shift);
8637    jccb(Assembler::zero, L_exit);
8638    jmpb(L_fill_8_bytes);
8639
8640    //
8641    // length is too short, just fill qwords
8642    //
8643    BIND(L_fill_8_bytes_loop);
8644    movl(Address(to, 0), value);
8645    movl(Address(to, 4), value);
8646    addptr(to, 8);
8647    BIND(L_fill_8_bytes);
8648    subl(count, 1 << (shift + 1));
8649    jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
8650    // fall through to fill 4 bytes
8651  } else {
8652    Label L_fill_32_bytes;
8653    if (!UseUnalignedLoadStores) {
8654      // align to 8 bytes; we know we are 4-byte aligned to start
8655      testptr(to, 4);
8656      jccb(Assembler::zero, L_fill_32_bytes);
8657      movl(Address(to, 0), value);
8658      addptr(to, 4);
8659      subl(count, 1<<shift);
8660    }
8661    BIND(L_fill_32_bytes);
8662    {
8663      assert( UseSSE >= 2, "supported cpu only" );
8664      Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
8665      if (UseAVX > 2) {
8666        movl(rtmp, 0xffff);
8667        kmovwl(k1, rtmp);
8668      }
8669      movdl(xtmp, value);
8670      if (UseAVX > 2 && UseUnalignedLoadStores) {
8671        // Fill 64-byte chunks
8672        Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
8673        evpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
8674
8675        subl(count, 16 << shift);
8676        jcc(Assembler::less, L_check_fill_32_bytes);
8677        align(16);
8678
8679        BIND(L_fill_64_bytes_loop);
8680        evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
8681        addptr(to, 64);
8682        subl(count, 16 << shift);
8683        jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
8684
8685        BIND(L_check_fill_32_bytes);
8686        addl(count, 8 << shift);
8687        jccb(Assembler::less, L_check_fill_8_bytes);
8688        vmovdqu(Address(to, 0), xtmp);
8689        addptr(to, 32);
8690        subl(count, 8 << shift);
8691
8692        BIND(L_check_fill_8_bytes);
8693      } else if (UseAVX == 2 && UseUnalignedLoadStores) {
8694        // Fill 64-byte chunks
8695        Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
8696        vpbroadcastd(xtmp, xtmp);
8697
8698        subl(count, 16 << shift);
8699        jcc(Assembler::less, L_check_fill_32_bytes);
8700        align(16);
8701
8702        BIND(L_fill_64_bytes_loop);
8703        vmovdqu(Address(to, 0), xtmp);
8704        vmovdqu(Address(to, 32), xtmp);
8705        addptr(to, 64);
8706        subl(count, 16 << shift);
8707        jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
8708
8709        BIND(L_check_fill_32_bytes);
8710        addl(count, 8 << shift);
8711        jccb(Assembler::less, L_check_fill_8_bytes);
8712        vmovdqu(Address(to, 0), xtmp);
8713        addptr(to, 32);
8714        subl(count, 8 << shift);
8715
8716        BIND(L_check_fill_8_bytes);
8717        // clean upper bits of YMM registers
8718        movdl(xtmp, value);
8719        pshufd(xtmp, xtmp, 0);
8720      } else {
8721        // Fill 32-byte chunks
8722        pshufd(xtmp, xtmp, 0);
8723
8724        subl(count, 8 << shift);
8725        jcc(Assembler::less, L_check_fill_8_bytes);
8726        align(16);
8727
8728        BIND(L_fill_32_bytes_loop);
8729
8730        if (UseUnalignedLoadStores) {
8731          movdqu(Address(to, 0), xtmp);
8732          movdqu(Address(to, 16), xtmp);
8733        } else {
8734          movq(Address(to, 0), xtmp);
8735          movq(Address(to, 8), xtmp);
8736          movq(Address(to, 16), xtmp);
8737          movq(Address(to, 24), xtmp);
8738        }
8739
8740        addptr(to, 32);
8741        subl(count, 8 << shift);
8742        jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
8743
8744        BIND(L_check_fill_8_bytes);
8745      }
8746      addl(count, 8 << shift);
8747      jccb(Assembler::zero, L_exit);
8748      jmpb(L_fill_8_bytes);
8749
8750      //
8751      // length is too short, just fill qwords
8752      //
8753      BIND(L_fill_8_bytes_loop);
8754      movq(Address(to, 0), xtmp);
8755      addptr(to, 8);
8756      BIND(L_fill_8_bytes);
8757      subl(count, 1 << (shift + 1));
8758      jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
8759    }
8760  }
8761  // fill trailing 4 bytes
8762  BIND(L_fill_4_bytes);
8763  testl(count, 1<<shift);
8764  jccb(Assembler::zero, L_fill_2_bytes);
8765  movl(Address(to, 0), value);
8766  if (t == T_BYTE || t == T_SHORT) {
8767    addptr(to, 4);
8768    BIND(L_fill_2_bytes);
8769    // fill trailing 2 bytes
8770    testl(count, 1<<(shift-1));
8771    jccb(Assembler::zero, L_fill_byte);
8772    movw(Address(to, 0), value);
8773    if (t == T_BYTE) {
8774      addptr(to, 2);
8775      BIND(L_fill_byte);
8776      // fill trailing byte
8777      testl(count, 1);
8778      jccb(Assembler::zero, L_exit);
8779      movb(Address(to, 0), value);
8780    } else {
8781      BIND(L_fill_byte);
8782    }
8783  } else {
8784    BIND(L_fill_2_bytes);
8785  }
8786  BIND(L_exit);
8787}
8788
8789// encode char[] to byte[] in ISO_8859_1
8790//   @HotSpotIntrinsicCandidate
8791//   private static int implEncodeISOArray(byte[] sa, int sp,
8792//                                         byte[] da, int dp, int len) {
8793//     int i = 0;
8794//     for (; i < len; i++) {
8795//       char c = StringUTF16.getChar(sa, sp++);
8796//       if (c > '\u00FF')
8797//         break;
8798//       da[dp++] = (byte)c;
8799//     }
8800//     return i;
8801//   }
8802void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
8803  XMMRegister tmp1Reg, XMMRegister tmp2Reg,
8804  XMMRegister tmp3Reg, XMMRegister tmp4Reg,
8805  Register tmp5, Register result) {
8806
8807  // rsi: src
8808  // rdi: dst
8809  // rdx: len
8810  // rcx: tmp5
8811  // rax: result
8812  ShortBranchVerifier sbv(this);
8813  assert_different_registers(src, dst, len, tmp5, result);
8814  Label L_done, L_copy_1_char, L_copy_1_char_exit;
8815
8816  // set result
8817  xorl(result, result);
8818  // check for zero length
8819  testl(len, len);
8820  jcc(Assembler::zero, L_done);
8821
8822  movl(result, len);
8823
8824  // Setup pointers
8825  lea(src, Address(src, len, Address::times_2)); // char[]
8826  lea(dst, Address(dst, len, Address::times_1)); // byte[]
8827  negptr(len);
8828
8829  if (UseSSE42Intrinsics || UseAVX >= 2) {
8830    Label L_chars_8_check, L_copy_8_chars, L_copy_8_chars_exit;
8831    Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
8832
8833    if (UseAVX >= 2) {
8834      Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
8835      movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
8836      movdl(tmp1Reg, tmp5);
8837      vpbroadcastd(tmp1Reg, tmp1Reg);
8838      jmp(L_chars_32_check);
8839
8840      bind(L_copy_32_chars);
8841      vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
8842      vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
8843      vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
8844      vptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in vector
8845      jccb(Assembler::notZero, L_copy_32_chars_exit);
8846      vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
8847      vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
8848      vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
8849
8850      bind(L_chars_32_check);
8851      addptr(len, 32);
8852      jcc(Assembler::lessEqual, L_copy_32_chars);
8853
8854      bind(L_copy_32_chars_exit);
8855      subptr(len, 16);
8856      jccb(Assembler::greater, L_copy_16_chars_exit);
8857
8858    } else if (UseSSE42Intrinsics) {
8859      movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
8860      movdl(tmp1Reg, tmp5);
8861      pshufd(tmp1Reg, tmp1Reg, 0);
8862      jmpb(L_chars_16_check);
8863    }
8864
8865    bind(L_copy_16_chars);
8866    if (UseAVX >= 2) {
8867      vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
8868      vptest(tmp2Reg, tmp1Reg);
8869      jcc(Assembler::notZero, L_copy_16_chars_exit);
8870      vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
8871      vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
8872    } else {
8873      if (UseAVX > 0) {
8874        movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
8875        movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
8876        vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
8877      } else {
8878        movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
8879        por(tmp2Reg, tmp3Reg);
8880        movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
8881        por(tmp2Reg, tmp4Reg);
8882      }
8883      ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in vector
8884      jccb(Assembler::notZero, L_copy_16_chars_exit);
8885      packuswb(tmp3Reg, tmp4Reg);
8886    }
8887    movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);
8888
8889    bind(L_chars_16_check);
8890    addptr(len, 16);
8891    jcc(Assembler::lessEqual, L_copy_16_chars);
8892
8893    bind(L_copy_16_chars_exit);
8894    if (UseAVX >= 2) {
8895      // clean upper bits of YMM registers
8896      vpxor(tmp2Reg, tmp2Reg);
8897      vpxor(tmp3Reg, tmp3Reg);
8898      vpxor(tmp4Reg, tmp4Reg);
8899      movdl(tmp1Reg, tmp5);
8900      pshufd(tmp1Reg, tmp1Reg, 0);
8901    }
8902    subptr(len, 8);
8903    jccb(Assembler::greater, L_copy_8_chars_exit);
8904
8905    bind(L_copy_8_chars);
8906    movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
8907    ptest(tmp3Reg, tmp1Reg);
8908    jccb(Assembler::notZero, L_copy_8_chars_exit);
8909    packuswb(tmp3Reg, tmp1Reg);
8910    movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
8911    addptr(len, 8);
8912    jccb(Assembler::lessEqual, L_copy_8_chars);
8913
8914    bind(L_copy_8_chars_exit);
8915    subptr(len, 8);
8916    jccb(Assembler::zero, L_done);
8917  }
8918
8919  bind(L_copy_1_char);
8920  load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
8921  testl(tmp5, 0xff00);      // check if Unicode char
8922  jccb(Assembler::notZero, L_copy_1_char_exit);
8923  movb(Address(dst, len, Address::times_1, 0), tmp5);
8924  addptr(len, 1);
8925  jccb(Assembler::less, L_copy_1_char);
8926
8927  bind(L_copy_1_char_exit);
8928  addptr(result, len); // len holds the negative count of unprocessed elements
8929
8930  bind(L_done);
8931}
8932
8933#ifdef _LP64
8934/**
8935 * Helper for multiply_to_len(): adds src1 and src2 into the 128-bit value dest_hi:dest_lo.
8936 */
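// A minimal sketch of the effect, assuming a 128-bit temporary were available
// (this mirrors the addq/adcq pairs emitted below; it is not separate code):
//   huge_128 acc = ((huge_128)dest_hi << 64) | (julong)dest_lo;
//   acc += (julong)src1 + (julong)src2;
//   dest_lo = (jlong)acc;
//   dest_hi = (jlong)(acc >>> 64);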
8937void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
8938  addq(dest_lo, src1);
8939  adcq(dest_hi, 0);
8940  addq(dest_lo, src2);
8941  adcq(dest_hi, 0);
8942}
8943
8944/**
8945 * Multiply 64 bit by 64 bit first loop.
8946 */
8947void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
8948                                           Register y, Register y_idx, Register z,
8949                                           Register carry, Register product,
8950                                           Register idx, Register kdx) {
8951  //
8952  //  jlong carry, x[], y[], z[];
8953  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
8954  //    huge_128 product = y[idx] * x[xstart] + carry;
8955  //    z[kdx] = (jlong)product;
8956  //    carry  = (jlong)(product >>> 64);
8957  //  }
8958  //  z[xstart] = carry;
8959  //
8960
8961  Label L_first_loop, L_first_loop_exit;
8962  Label L_one_x, L_one_y, L_multiply;
8963
8964  decrementl(xstart);
8965  jcc(Assembler::negative, L_one_x);
8966
8967  movq(x_xstart, Address(x, xstart, Address::times_4,  0));
8968  rorq(x_xstart, 32); // convert big-endian to little-endian
8969
8970  bind(L_first_loop);
8971  decrementl(idx);
8972  jcc(Assembler::negative, L_first_loop_exit);
8973  decrementl(idx);
8974  jcc(Assembler::negative, L_one_y);
8975  movq(y_idx, Address(y, idx, Address::times_4,  0));
8976  rorq(y_idx, 32); // convert big-endian to little-endian
8977  bind(L_multiply);
8978  movq(product, x_xstart);
8979  mulq(y_idx); // product(rax) * y_idx -> rdx:rax
8980  addq(product, carry);
8981  adcq(rdx, 0);
8982  subl(kdx, 2);
8983  movl(Address(z, kdx, Address::times_4,  4), product);
8984  shrq(product, 32);
8985  movl(Address(z, kdx, Address::times_4,  0), product);
8986  movq(carry, rdx);
8987  jmp(L_first_loop);
8988
8989  bind(L_one_y);
8990  movl(y_idx, Address(y,  0));
8991  jmp(L_multiply);
8992
8993  bind(L_one_x);
8994  movl(x_xstart, Address(x,  0));
8995  jmp(L_first_loop);
8996
8997  bind(L_first_loop_exit);
8998}
8999
9000/**
9001 * Multiply 64 bit by 64 bit and add 128 bit.
9002 */
9003void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
9004                                            Register yz_idx, Register idx,
9005                                            Register carry, Register product, int offset) {
9006  //     huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
9007  //     z[kdx] = (jlong)product;
9008
9009  movq(yz_idx, Address(y, idx, Address::times_4,  offset));
9010  rorq(yz_idx, 32); // convert big-endian to little-endian
9011  movq(product, x_xstart);
9012  mulq(yz_idx);     // product(rax) * yz_idx -> rdx:product(rax)
9013  movq(yz_idx, Address(z, idx, Address::times_4,  offset));
9014  rorq(yz_idx, 32); // convert big-endian to little-endian
9015
9016  add2_with_carry(rdx, product, carry, yz_idx);
9017
9018  movl(Address(z, idx, Address::times_4,  offset+4), product);
9019  shrq(product, 32);
9020  movl(Address(z, idx, Address::times_4,  offset), product);
9021
9022}
9023
9024/**
9025 * Multiply 128 bit by 128 bit. Unrolled inner loop.
9026 */
9027void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
9028                                             Register yz_idx, Register idx, Register jdx,
9029                                             Register carry, Register product,
9030                                             Register carry2) {
9031  //   jlong carry, x[], y[], z[];
9032  //   int kdx = ystart+1;
9033  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
9034  //     huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
9035  //     z[kdx+idx+1] = (jlong)product;
9036  //     jlong carry2  = (jlong)(product >>> 64);
9037  //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
9038  //     z[kdx+idx] = (jlong)product;
9039  //     carry  = (jlong)(product >>> 64);
9040  //   }
9041  //   idx += 2;
9042  //   if (idx > 0) {
9043  //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
9044  //     z[kdx+idx] = (jlong)product;
9045  //     carry  = (jlong)(product >>> 64);
9046  //   }
9047  //
9048
9049  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
9050
9051  movl(jdx, idx);
9052  andl(jdx, 0xFFFFFFFC);
9053  shrl(jdx, 2);
9054
9055  bind(L_third_loop);
9056  subl(jdx, 1);
9057  jcc(Assembler::negative, L_third_loop_exit);
9058  subl(idx, 4);
9059
9060  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
9061  movq(carry2, rdx);
9062
9063  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
9064  movq(carry, rdx);
9065  jmp(L_third_loop);
9066
9067  bind (L_third_loop_exit);
9068
9069  andl (idx, 0x3);
9070  jcc(Assembler::zero, L_post_third_loop_done);
9071
9072  Label L_check_1;
9073  subl(idx, 2);
9074  jcc(Assembler::negative, L_check_1);
9075
9076  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
9077  movq(carry, rdx);
9078
9079  bind (L_check_1);
9080  addl (idx, 0x2);
9081  andl (idx, 0x1);
9082  subl(idx, 1);
9083  jcc(Assembler::negative, L_post_third_loop_done);
9084
9085  movl(yz_idx, Address(y, idx, Address::times_4,  0));
9086  movq(product, x_xstart);
9087  mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
9088  movl(yz_idx, Address(z, idx, Address::times_4,  0));
9089
9090  add2_with_carry(rdx, product, yz_idx, carry);
9091
9092  movl(Address(z, idx, Address::times_4,  0), product);
9093  shrq(product, 32);
9094
9095  shlq(rdx, 32);
9096  orq(product, rdx);
9097  movq(carry, product);
9098
9099  bind(L_post_third_loop_done);
9100}
9101
9102/**
9103 * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
9104 *
9105 */
9106void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
9107                                                  Register carry, Register carry2,
9108                                                  Register idx, Register jdx,
9109                                                  Register yz_idx1, Register yz_idx2,
9110                                                  Register tmp, Register tmp3, Register tmp4) {
9111  assert(UseBMI2Instructions, "should be used only when BMI2 is available");
9112
9113  //   jlong carry, x[], y[], z[];
9114  //   int kdx = ystart+1;
9115  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
9116  //     huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
9117  //     jlong carry2  = (jlong)(tmp3 >>> 64);
9118  //     huge_128 tmp4 = (y[idx]   * rdx) + z[kdx+idx] + carry2;
9119  //     carry  = (jlong)(tmp4 >>> 64);
9120  //     z[kdx+idx+1] = (jlong)tmp3;
9121  //     z[kdx+idx] = (jlong)tmp4;
9122  //   }
9123  //   idx += 2;
9124  //   if (idx > 0) {
9125  //     yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
9126  //     z[kdx+idx] = (jlong)yz_idx1;
9127  //     carry  = (jlong)(yz_idx1 >>> 64);
9128  //   }
9129  //
9130
9131  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
9132
9133  movl(jdx, idx);
9134  andl(jdx, 0xFFFFFFFC);
9135  shrl(jdx, 2);
9136
9137  bind(L_third_loop);
9138  subl(jdx, 1);
9139  jcc(Assembler::negative, L_third_loop_exit);
9140  subl(idx, 4);
9141
9142  movq(yz_idx1,  Address(y, idx, Address::times_4,  8));
9143  rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
9144  movq(yz_idx2, Address(y, idx, Address::times_4,  0));
9145  rorxq(yz_idx2, yz_idx2, 32);
9146
9147  mulxq(tmp4, tmp3, yz_idx1);  //  yz_idx1 * rdx -> tmp4:tmp3
9148  mulxq(carry2, tmp, yz_idx2); //  yz_idx2 * rdx -> carry2:tmp
9149
9150  movq(yz_idx1,  Address(z, idx, Address::times_4,  8));
9151  rorxq(yz_idx1, yz_idx1, 32);
9152  movq(yz_idx2, Address(z, idx, Address::times_4,  0));
9153  rorxq(yz_idx2, yz_idx2, 32);
9154
9155  if (VM_Version::supports_adx()) {
9156    adcxq(tmp3, carry);
9157    adoxq(tmp3, yz_idx1);
9158
9159    adcxq(tmp4, tmp);
9160    adoxq(tmp4, yz_idx2);
9161
9162    movl(carry, 0); // does not affect flags
9163    adcxq(carry2, carry);
9164    adoxq(carry2, carry);
9165  } else {
9166    add2_with_carry(tmp4, tmp3, carry, yz_idx1);
9167    add2_with_carry(carry2, tmp4, tmp, yz_idx2);
9168  }
9169  movq(carry, carry2);
9170
9171  movl(Address(z, idx, Address::times_4, 12), tmp3);
9172  shrq(tmp3, 32);
9173  movl(Address(z, idx, Address::times_4,  8), tmp3);
9174
9175  movl(Address(z, idx, Address::times_4,  4), tmp4);
9176  shrq(tmp4, 32);
9177  movl(Address(z, idx, Address::times_4,  0), tmp4);
9178
9179  jmp(L_third_loop);
9180
9181  bind (L_third_loop_exit);
9182
9183  andl (idx, 0x3);
9184  jcc(Assembler::zero, L_post_third_loop_done);
9185
9186  Label L_check_1;
9187  subl(idx, 2);
9188  jcc(Assembler::negative, L_check_1);
9189
9190  movq(yz_idx1, Address(y, idx, Address::times_4,  0));
9191  rorxq(yz_idx1, yz_idx1, 32);
9192  mulxq(tmp4, tmp3, yz_idx1); //  yz_idx1 * rdx -> tmp4:tmp3
9193  movq(yz_idx2, Address(z, idx, Address::times_4,  0));
9194  rorxq(yz_idx2, yz_idx2, 32);
9195
9196  add2_with_carry(tmp4, tmp3, carry, yz_idx2);
9197
9198  movl(Address(z, idx, Address::times_4,  4), tmp3);
9199  shrq(tmp3, 32);
9200  movl(Address(z, idx, Address::times_4,  0), tmp3);
9201  movq(carry, tmp4);
9202
9203  bind (L_check_1);
9204  addl (idx, 0x2);
9205  andl (idx, 0x1);
9206  subl(idx, 1);
9207  jcc(Assembler::negative, L_post_third_loop_done);
9208  movl(tmp4, Address(y, idx, Address::times_4,  0));
9209  mulxq(carry2, tmp3, tmp4);  //  tmp4 * rdx -> carry2:tmp3
9210  movl(tmp4, Address(z, idx, Address::times_4,  0));
9211
9212  add2_with_carry(carry2, tmp3, tmp4, carry);
9213
9214  movl(Address(z, idx, Address::times_4,  0), tmp3);
9215  shrq(tmp3, 32);
9216
9217  shlq(carry2, 32);
9218  orq(tmp3, carry2);
9219  movq(carry, tmp3);
9220
9221  bind(L_post_third_loop_done);
9222}
9223
9224/**
9225 * Code for BigInteger::multiplyToLen() intrinsic.
9226 *
9227 * rdi: x
9228 * rax: xlen
9229 * rsi: y
9230 * rcx: ylen
9231 * r8:  z
9232 * r11: zlen
9233 * r12: tmp1
9234 * r13: tmp2
9235 * r14: tmp3
9236 * r15: tmp4
9237 * rbx: tmp5
9238 *
9239 */
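// A sketch of the Java-level operation this intrinsic accelerates (names assumed
// from java.math.BigInteger; see the per-loop pseudocode comments further below):
//   multiplyToLen(int[] x, int xlen, int[] y, int ylen, int[] z)
// computes z = x * y, where x, y and z are magnitudes stored as big-endian int arrays.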
9240void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
9241                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
9242  ShortBranchVerifier sbv(this);
9243  assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);
9244
9245  push(tmp1);
9246  push(tmp2);
9247  push(tmp3);
9248  push(tmp4);
9249  push(tmp5);
9250
9251  push(xlen);
9252  push(zlen);
9253
9254  const Register idx = tmp1;
9255  const Register kdx = tmp2;
9256  const Register xstart = tmp3;
9257
9258  const Register y_idx = tmp4;
9259  const Register carry = tmp5;
9260  const Register product  = xlen;
9261  const Register x_xstart = zlen;  // reuse register
9262
9263  // First Loop.
9264  //
9265  //  final static long LONG_MASK = 0xffffffffL;
9266  //  int xstart = xlen - 1;
9267  //  int ystart = ylen - 1;
9268  //  long carry = 0;
9269  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
9270  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
9271  //    z[kdx] = (int)product;
9272  //    carry = product >>> 32;
9273  //  }
9274  //  z[xstart] = (int)carry;
9275  //
9276
9277  movl(idx, ylen);      // idx = ylen;
9278  movl(kdx, zlen);      // kdx = xlen+ylen;
9279  xorq(carry, carry);   // carry = 0;
9280
9281  Label L_done;
9282
9283  movl(xstart, xlen);
9284  decrementl(xstart);
9285  jcc(Assembler::negative, L_done);
9286
9287  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
9288
9289  Label L_second_loop;
9290  testl(kdx, kdx);
9291  jcc(Assembler::zero, L_second_loop);
9292
9293  Label L_carry;
9294  subl(kdx, 1);
9295  jcc(Assembler::zero, L_carry);
9296
9297  movl(Address(z, kdx, Address::times_4,  0), carry);
9298  shrq(carry, 32);
9299  subl(kdx, 1);
9300
9301  bind(L_carry);
9302  movl(Address(z, kdx, Address::times_4,  0), carry);
9303
9304  // Second and third (nested) loops.
9305  //
9306  // for (int i = xstart-1; i >= 0; i--) { // Second loop
9307  //   carry = 0;
9308  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
9309  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
9310  //                    (z[k] & LONG_MASK) + carry;
9311  //     z[k] = (int)product;
9312  //     carry = product >>> 32;
9313  //   }
9314  //   z[i] = (int)carry;
9315  // }
9316  //
9317  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
9318
9319  const Register jdx = tmp1;
9320
9321  bind(L_second_loop);
9322  xorl(carry, carry);    // carry = 0;
9323  movl(jdx, ylen);       // j = ystart+1
9324
9325  subl(xstart, 1);       // i = xstart-1;
9326  jcc(Assembler::negative, L_done);
9327
9328  push (z);
9329
9330  Label L_last_x;
9331  lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
9332  subl(xstart, 1);       // i = xstart-1;
9333  jcc(Assembler::negative, L_last_x);
9334
9335  if (UseBMI2Instructions) {
9336    movq(rdx,  Address(x, xstart, Address::times_4,  0));
9337    rorxq(rdx, rdx, 32); // convert big-endian to little-endian
9338  } else {
9339    movq(x_xstart, Address(x, xstart, Address::times_4,  0));
9340    rorq(x_xstart, 32);  // convert big-endian to little-endian
9341  }
9342
9343  Label L_third_loop_prologue;
9344  bind(L_third_loop_prologue);
9345
9346  push (x);
9347  push (xstart);
9348  push (ylen);
9349
9350
9351  if (UseBMI2Instructions) {
9352    multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
9353  } else { // !UseBMI2Instructions
9354    multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
9355  }
9356
9357  pop(ylen);
9358  pop(xlen);
9359  pop(x);
9360  pop(z);
9361
9362  movl(tmp3, xlen);
9363  addl(tmp3, 1);
9364  movl(Address(z, tmp3, Address::times_4,  0), carry);
9365  subl(tmp3, 1);
9366  jccb(Assembler::negative, L_done);
9367
9368  shrq(carry, 32);
9369  movl(Address(z, tmp3, Address::times_4,  0), carry);
9370  jmp(L_second_loop);
9371
9372  // The following infrequently executed code is placed outside the loops.
9373  bind(L_last_x);
9374  if (UseBMI2Instructions) {
9375    movl(rdx, Address(x,  0));
9376  } else {
9377    movl(x_xstart, Address(x,  0));
9378  }
9379  jmp(L_third_loop_prologue);
9380
9381  bind(L_done);
9382
9383  pop(zlen);
9384  pop(xlen);
9385
9386  pop(tmp5);
9387  pop(tmp4);
9388  pop(tmp3);
9389  pop(tmp2);
9390  pop(tmp1);
9391}
9392
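// Compare two memory regions element by element (a sketch of the contract, inferred
// from the code below): 'length' is an element count and log2_array_indxscale (in rcx)
// is the log2 of the element size in bytes; 'result' receives the index of the first
// mismatching element, or -1 if the regions are equal over the whole range.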
9393void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
9394  Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){
9395  assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
9396  Label VECTOR64_LOOP, VECTOR64_TAIL, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;
9397  Label VECTOR32_LOOP, VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
9398  Label VECTOR16_TAIL, VECTOR8_TAIL, VECTOR4_TAIL;
9399  Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
9400  Label SAME_TILL_END, DONE;
9401  Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL;
9402
9403  // scale is in rcx in both Win64 and Unix
9404  ShortBranchVerifier sbv(this);
9405
9406  shlq(length);
9407  xorq(result, result);
9408
9409  if ((UseAVX > 2) &&
9410      VM_Version::supports_avx512vlbw()) {
9411    set_vector_masking();  // opening of the stub context for programming mask registers
9412    cmpq(length, 64);
9413    jcc(Assembler::less, VECTOR32_TAIL);
9414    movq(tmp1, length);
9415    andq(tmp1, 0x3F);      // tail count
9416    andq(length, ~(0x3F)); //vector count
9417
9418    bind(VECTOR64_LOOP);
9419    // AVX512 code to compare 64 byte vectors.
9420    evmovdqub(rymm0, Address(obja, result), Assembler::AVX_512bit);
9421    evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
9422    kortestql(k7, k7);
9423    jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL);     // mismatch
9424    addq(result, 64);
9425    subq(length, 64);
9426    jccb(Assembler::notZero, VECTOR64_LOOP);
9427
9428    //bind(VECTOR64_TAIL);
9429    testq(tmp1, tmp1);
9430    jcc(Assembler::zero, SAME_TILL_END);
9431
9432    bind(VECTOR64_TAIL);
9433    // AVX512 code to compare up to 63 byte vectors.
9434    // Save k1
9435    kmovql(k3, k1);
9436    mov64(tmp2, 0xFFFFFFFFFFFFFFFF);
9437    shlxq(tmp2, tmp2, tmp1);
9438    notq(tmp2);
9439    kmovql(k1, tmp2);
9440
9441    evmovdqub(rymm0, k1, Address(obja, result), Assembler::AVX_512bit);
9442    evpcmpeqb(k7, k1, rymm0, Address(objb, result), Assembler::AVX_512bit);
9443
9444    ktestql(k7, k1);
9445    // Restore k1
9446    kmovql(k1, k3);
9447    jcc(Assembler::below, SAME_TILL_END);     // not mismatch
9448
9449    bind(VECTOR64_NOT_EQUAL);
9450    kmovql(tmp1, k7);
9451    notq(tmp1);
9452    tzcntq(tmp1, tmp1);
9453    addq(result, tmp1);
9454    shrq(result);
9455    jmp(DONE);
9456    bind(VECTOR32_TAIL);
9457    clear_vector_masking();   // closing of the stub context for programming mask registers
9458  }
9459
9460  cmpq(length, 8);
9461  jcc(Assembler::equal, VECTOR8_LOOP);
9462  jcc(Assembler::less, VECTOR4_TAIL);
9463
9464  if (UseAVX >= 2) {
9465
9466    cmpq(length, 16);
9467    jcc(Assembler::equal, VECTOR16_LOOP);
9468    jcc(Assembler::less, VECTOR8_LOOP);
9469
9470    cmpq(length, 32);
9471    jccb(Assembler::less, VECTOR16_TAIL);
9472
9473    subq(length, 32);
9474    bind(VECTOR32_LOOP);
9475    vmovdqu(rymm0, Address(obja, result));
9476    vmovdqu(rymm1, Address(objb, result));
9477    vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);
9478    vptest(rymm2, rymm2);
9479    jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found
9480    addq(result, 32);
9481    subq(length, 32);
9482    jccb(Assembler::greaterEqual, VECTOR32_LOOP);
9483    addq(length, 32);
9484    jcc(Assembler::equal, SAME_TILL_END);
9485    //falling through if less than 32 bytes left //close the branch here.
9486
9487    bind(VECTOR16_TAIL);
9488    cmpq(length, 16);
9489    jccb(Assembler::less, VECTOR8_TAIL);
9490    bind(VECTOR16_LOOP);
9491    movdqu(rymm0, Address(obja, result));
9492    movdqu(rymm1, Address(objb, result));
9493    vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit);
9494    ptest(rymm2, rymm2);
9495    jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
9496    addq(result, 16);
9497    subq(length, 16);
9498    jcc(Assembler::equal, SAME_TILL_END);
9499    //falling through if less than 16 bytes left
9500  } else {//regular intrinsics
9501
9502    cmpq(length, 16);
9503    jccb(Assembler::less, VECTOR8_TAIL);
9504
9505    subq(length, 16);
9506    bind(VECTOR16_LOOP);
9507    movdqu(rymm0, Address(obja, result));
9508    movdqu(rymm1, Address(objb, result));
9509    pxor(rymm0, rymm1);
9510    ptest(rymm0, rymm0);
9511    jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
9512    addq(result, 16);
9513    subq(length, 16);
9514    jccb(Assembler::greaterEqual, VECTOR16_LOOP);
9515    addq(length, 16);
9516    jcc(Assembler::equal, SAME_TILL_END);
9517    //falling through if less than 16 bytes left
9518  }
9519
9520  bind(VECTOR8_TAIL);
9521  cmpq(length, 8);
9522  jccb(Assembler::less, VECTOR4_TAIL);
9523  bind(VECTOR8_LOOP);
9524  movq(tmp1, Address(obja, result));
9525  movq(tmp2, Address(objb, result));
9526  xorq(tmp1, tmp2);
9527  testq(tmp1, tmp1);
9528  jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found
9529  addq(result, 8);
9530  subq(length, 8);
9531  jcc(Assembler::equal, SAME_TILL_END);
9532  //falling through if less than 8 bytes left
9533
9534  bind(VECTOR4_TAIL);
9535  cmpq(length, 4);
9536  jccb(Assembler::less, BYTES_TAIL);
9537  bind(VECTOR4_LOOP);
9538  movl(tmp1, Address(obja, result));
9539  xorl(tmp1, Address(objb, result));
9540  testl(tmp1, tmp1);
9541  jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found
9542  addq(result, 4);
9543  subq(length, 4);
9544  jcc(Assembler::equal, SAME_TILL_END);
9545  //falling through if less than 4 bytes left
9546
9547  bind(BYTES_TAIL);
9548  bind(BYTES_LOOP);
9549  load_unsigned_byte(tmp1, Address(obja, result));
9550  load_unsigned_byte(tmp2, Address(objb, result));
9551  xorl(tmp1, tmp2);
9552  testl(tmp1, tmp1);
9553  jccb(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
9554  decq(length);
9555  jccb(Assembler::zero, SAME_TILL_END);
9556  incq(result);
9557  load_unsigned_byte(tmp1, Address(obja, result));
9558  load_unsigned_byte(tmp2, Address(objb, result));
9559  xorl(tmp1, tmp2);
9560  testl(tmp1, tmp1);
9561  jccb(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
9562  decq(length);
9563  jccb(Assembler::zero, SAME_TILL_END);
9564  incq(result);
9565  load_unsigned_byte(tmp1, Address(obja, result));
9566  load_unsigned_byte(tmp2, Address(objb, result));
9567  xorl(tmp1, tmp2);
9568  testl(tmp1, tmp1);
9569  jccb(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
9570  jmpb(SAME_TILL_END);
9571
9572  if (UseAVX >= 2) {
9573    bind(VECTOR32_NOT_EQUAL);
9574    vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);
9575    vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);
9576    vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit);
9577    vpmovmskb(tmp1, rymm0);
9578    bsfq(tmp1, tmp1);
9579    addq(result, tmp1);
9580    shrq(result);
9581    jmpb(DONE);
9582  }
9583
9584  bind(VECTOR16_NOT_EQUAL);
9585  if (UseAVX >= 2) {
9586    vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
9587    vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
9588    pxor(rymm0, rymm2);
9589  } else {
9590    pcmpeqb(rymm2, rymm2);
9591    pxor(rymm0, rymm1);
9592    pcmpeqb(rymm0, rymm1);
9593    pxor(rymm0, rymm2);
9594  }
9595  pmovmskb(tmp1, rymm0);
9596  bsfq(tmp1, tmp1);
9597  addq(result, tmp1);
9598  shrq(result);
9599  jmpb(DONE);
9600
9601  bind(VECTOR8_NOT_EQUAL);
9602  bind(VECTOR4_NOT_EQUAL);
9603  bsfq(tmp1, tmp1);
9604  shrq(tmp1, 3);
9605  addq(result, tmp1);
9606  bind(BYTES_NOT_EQUAL);
9607  shrq(result);
9608  jmpb(DONE);
9609
9610  bind(SAME_TILL_END);
9611  mov64(result, -1);
9612
9613  bind(DONE);
9614}
9615
9616// Helper functions for square_to_len()
9617
9618/**
9619 * Store the squares of x[], right shifted one bit (divided by 2) into z[]
9620 * Preserves x and z and modifies rest of the registers.
9621 */
9622void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
9623  // Perform square and right shift by 1
9624  // Handle odd xlen case first, then for even xlen do the following
9625  // jlong carry = 0;
9626  // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
9627  //     huge_128 product = x[j:j+1] * x[j:j+1];
9628  //     z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
9629  //     z[i+2:i+3] = (jlong)(product >>> 1);
9630  //     carry = (jlong)product;
9631  // }
9632
9633  xorq(tmp5, tmp5);     // carry
9634  xorq(rdxReg, rdxReg);
9635  xorl(tmp1, tmp1);     // index for x
9636  xorl(tmp4, tmp4);     // index for z
9637
9638  Label L_first_loop, L_first_loop_exit;
9639
9640  testl(xlen, 1);
9641  jccb(Assembler::zero, L_first_loop); //jump if xlen is even
9642
9643  // Square and right shift by 1 the odd element using 32 bit multiply
9644  movl(raxReg, Address(x, tmp1, Address::times_4, 0));
9645  imulq(raxReg, raxReg);
9646  shrq(raxReg, 1);
9647  adcq(tmp5, 0);
9648  movq(Address(z, tmp4, Address::times_4, 0), raxReg);
9649  incrementl(tmp1);
9650  addl(tmp4, 2);
9651
9652  // Square and  right shift by 1 the rest using 64 bit multiply
9653  bind(L_first_loop);
9654  cmpptr(tmp1, xlen);
9655  jccb(Assembler::equal, L_first_loop_exit);
9656
9657  // Square
9658  movq(raxReg, Address(x, tmp1, Address::times_4,  0));
9659  rorq(raxReg, 32);    // convert big-endian to little-endian
9660  mulq(raxReg);        // 64-bit multiply rax * rax -> rdx:rax
9661
9662  // Right shift by 1 and save carry
9663  shrq(tmp5, 1);       // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
9664  rcrq(rdxReg, 1);
9665  rcrq(raxReg, 1);
9666  adcq(tmp5, 0);
9667
9668  // Store result in z
9669  movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
9670  movq(Address(z, tmp4, Address::times_4, 8), raxReg);
9671
9672  // Update indices for x and z
9673  addl(tmp1, 2);
9674  addl(tmp4, 4);
9675  jmp(L_first_loop);
9676
9677  bind(L_first_loop_exit);
9678}
9679
9680
9681/**
9682 * Perform the following multiply add operation using BMI2 instructions
9683 * carry:sum = sum + op1*op2 + carry
9684 * op2 should be in rdx
9685 * op2 is preserved, all other registers are modified
9686 */
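// A sketch of the computation, assuming a 128-bit temporary (op2 is implicit in rdx
// for the mulxq below):
//   huge_128 t = (huge_128)op1 * op2 + (julong)sum + (julong)carry;
//   sum   = (jlong)t;
//   carry = (jlong)(t >>> 64);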
9687void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
9688  // assert op2 is rdx
9689  mulxq(tmp2, op1, op1);  //  op1 * op2 -> tmp2:op1
9690  addq(sum, carry);
9691  adcq(tmp2, 0);
9692  addq(sum, op1);
9693  adcq(tmp2, 0);
9694  movq(carry, tmp2);
9695}
9696
9697/**
9698 * Perform the following multiply add operation:
9699 * carry:sum = sum + op1*op2 + carry
9700 * Preserves op1, op2 and modifies rest of registers
9701 */
9702void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
9703  // rdx:rax = op1 * op2
9704  movq(raxReg, op2);
9705  mulq(op1);
9706
9707  //  rdx:rax = sum + carry + rdx:rax
9708  addq(sum, carry);
9709  adcq(rdxReg, 0);
9710  addq(sum, raxReg);
9711  adcq(rdxReg, 0);
9712
9713  // carry:sum = rdx:sum
9714  movq(carry, rdxReg);
9715}
9716
9717/**
9718 * Add 64 bit long carry into z[] with carry propagation.
9719 * Preserves z and carry register values and modifies rest of registers.
9720 *
9721 */
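// Roughly (a sketch of the loop emitted below): add 'carry' to the 64-bit word of z
// at int index zlen-2, then keep adding 1 to successively lower-indexed 64-bit words
// of z for as long as the previous addition produced a carry out.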
9722void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
9723  Label L_fourth_loop, L_fourth_loop_exit;
9724
9725  movl(tmp1, 1);
9726  subl(zlen, 2);
9727  addq(Address(z, zlen, Address::times_4, 0), carry);
9728
9729  bind(L_fourth_loop);
9730  jccb(Assembler::carryClear, L_fourth_loop_exit);
9731  subl(zlen, 2);
9732  jccb(Assembler::negative, L_fourth_loop_exit);
9733  addq(Address(z, zlen, Address::times_4, 0), tmp1);
9734  jmp(L_fourth_loop);
9735  bind(L_fourth_loop_exit);
9736}
9737
9738/**
9739 * Shift z[] left by 1 bit.
9740 * Preserves x, len, z and zlen registers and modifies rest of the registers.
9741 *
9742 */
9743void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
9744
9745  Label L_fifth_loop, L_fifth_loop_exit;
9746
9747  // Fifth loop
9748  // Perform primitiveLeftShift(z, zlen, 1)
9749
9750  const Register prev_carry = tmp1;
9751  const Register new_carry = tmp4;
9752  const Register value = tmp2;
9753  const Register zidx = tmp3;
9754
9755  // int zidx, carry;
9756  // long value;
9757  // carry = 0;
9758  // for (zidx = zlen-2; zidx >=0; zidx -= 2) {
9759  //    (carry:value)  = (z[i] << 1) | carry ;
9760  //    z[i] = value;
9761  // }
9762
9763  movl(zidx, zlen);
9764  xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
9765
9766  bind(L_fifth_loop);
9767  decl(zidx);  // Use decl to preserve carry flag
9768  decl(zidx);
9769  jccb(Assembler::negative, L_fifth_loop_exit);
9770
9771  if (UseBMI2Instructions) {
9772     movq(value, Address(z, zidx, Address::times_4, 0));
9773     rclq(value, 1);
9774     rorxq(value, value, 32);
9775     movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
9776  }
9777  else {
9778    // clear new_carry
9779    xorl(new_carry, new_carry);
9780
9781    // Shift z[i] by 1, or in previous carry and save new carry
9782    movq(value, Address(z, zidx, Address::times_4, 0));
9783    shlq(value, 1);
9784    adcl(new_carry, 0);
9785
9786    orq(value, prev_carry);
9787    rorq(value, 0x20);
9788    movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
9789
9790    // Set previous carry = new carry
9791    movl(prev_carry, new_carry);
9792  }
9793  jmp(L_fifth_loop);
9794
9795  bind(L_fifth_loop_exit);
9796}
9797
9798
9799/**
9800 * Code for BigInteger::squareToLen() intrinsic
9801 *
9802 * rdi: x
9803 * rsi: len
9804 * r8:  z
9805 * rcx: zlen
9806 * r12: tmp1
9807 * r13: tmp2
9808 * r14: tmp3
9809 * r15: tmp4
9810 * rbx: tmp5
9811 *
9812 */
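// A sketch of the overall approach (mirroring the BigInteger.squareToLen algorithm):
// first store the squares of the input words into z, right shifted by one bit; then
// add in the off-diagonal products x[i]*x[j] (i < j) with carry propagation; then
// shift all of z left by one bit; finally or the low bit of x[len-1] into z[zlen-1].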
9813void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
9814
9815  Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, fifth_loop, fifth_loop_exit, L_last_x, L_multiply;
9816  push(tmp1);
9817  push(tmp2);
9818  push(tmp3);
9819  push(tmp4);
9820  push(tmp5);
9821
9822  // First loop
9823  // Store the squares, right shifted one bit (i.e., divided by 2).
9824  square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);
9825
9826  // Add in off-diagonal sums.
9827  //
9828  // Second, third (nested) and fourth loops.
9829  // zlen +=2;
9830  // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
9831  //    carry = 0;
9832  //    long op2 = x[xidx:xidx+1];
9833  //    for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
9834  //       k -= 2;
9835  //       long op1 = x[j:j+1];
9836  //       long sum = z[k:k+1];
9837  //       carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
9838  //       z[k:k+1] = sum;
9839  //    }
9840  //    add_one_64(z, k, carry, tmp_regs);
9841  // }
9842
9843  const Register carry = tmp5;
9844  const Register sum = tmp3;
9845  const Register op1 = tmp4;
9846  Register op2 = tmp2;
9847
9848  push(zlen);
9849  push(len);
9850  addl(zlen,2);
9851  bind(L_second_loop);
9852  xorq(carry, carry);
9853  subl(zlen, 4);
9854  subl(len, 2);
9855  push(zlen);
9856  push(len);
9857  cmpl(len, 0);
9858  jccb(Assembler::lessEqual, L_second_loop_exit);
9859
9860  // Multiply an array by one 64 bit long.
9861  if (UseBMI2Instructions) {
9862    op2 = rdxReg;
9863    movq(op2, Address(x, len, Address::times_4,  0));
9864    rorxq(op2, op2, 32);
9865  }
9866  else {
9867    movq(op2, Address(x, len, Address::times_4,  0));
9868    rorq(op2, 32);
9869  }
9870
9871  bind(L_third_loop);
9872  decrementl(len);
9873  jccb(Assembler::negative, L_third_loop_exit);
9874  decrementl(len);
9875  jccb(Assembler::negative, L_last_x);
9876
9877  movq(op1, Address(x, len, Address::times_4,  0));
9878  rorq(op1, 32);
9879
9880  bind(L_multiply);
9881  subl(zlen, 2);
9882  movq(sum, Address(z, zlen, Address::times_4,  0));
9883
9884  // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry.
9885  if (UseBMI2Instructions) {
9886    multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
9887  }
9888  else {
9889    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
9890  }
9891
9892  movq(Address(z, zlen, Address::times_4, 0), sum);
9893
9894  jmp(L_third_loop);
9895  bind(L_third_loop_exit);
9896
9897  // Fourth loop
9898  // Add 64 bit long carry into z with carry propagation.
9899  // Uses the offset-adjusted zlen.
9900  add_one_64(z, zlen, carry, tmp1);
9901
9902  pop(len);
9903  pop(zlen);
9904  jmp(L_second_loop);
9905
9906  // The following infrequently executed code is placed outside the loops.
9907  bind(L_last_x);
9908  movl(op1, Address(x, 0));
9909  jmp(L_multiply);
9910
9911  bind(L_second_loop_exit);
9912  pop(len);
9913  pop(zlen);
9914  pop(len);
9915  pop(zlen);
9916
9917  // Fifth loop
9918  // Shift z left 1 bit.
9919  lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
9920
9921  // z[zlen-1] |= x[len-1] & 1;
9922  movl(tmp3, Address(x, len, Address::times_4, -4));
9923  andl(tmp3, 1);
9924  orl(Address(z, zlen, Address::times_4,  -4), tmp3);
9925
9926  pop(tmp5);
9927  pop(tmp4);
9928  pop(tmp3);
9929  pop(tmp2);
9930  pop(tmp1);
9931}
9932
9933/**
9934 * Helper function for mul_add()
9935 * Multiply the in[] by int k and add to out[] starting at offset offs using
9936 * 128 bit by 32 bit multiply and return the carry in tmp5.
9937 * Only quad int aligned length of in[] is operated on in this function.
9938 * k is in rdxReg for BMI2Instructions, for others it is in tmp2.
9939 * This function preserves out, in and k registers.
9940 * len and offset point to the appropriate index in "in" & "out" correspondingly
9941 * tmp5 has the carry.
9942 * other registers are temporary and are modified.
9943 *
9944 */
9945void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
9946  Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
9947  Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
9948
9949  Label L_first_loop, L_first_loop_exit;
9950
9951  movl(tmp1, len);
9952  shrl(tmp1, 2);
9953
9954  bind(L_first_loop);
9955  subl(tmp1, 1);
9956  jccb(Assembler::negative, L_first_loop_exit);
9957
9958  subl(len, 4);
9959  subl(offset, 4);
9960
9961  Register op2 = tmp2;
9962  const Register sum = tmp3;
9963  const Register op1 = tmp4;
9964  const Register carry = tmp5;
9965
9966  if (UseBMI2Instructions) {
9967    op2 = rdxReg;
9968  }
9969
9970  movq(op1, Address(in, len, Address::times_4,  8));
9971  rorq(op1, 32);
9972  movq(sum, Address(out, offset, Address::times_4,  8));
9973  rorq(sum, 32);
9974  if (UseBMI2Instructions) {
9975    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
9976  }
9977  else {
9978    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
9979  }
9980  // Store back in big endian from little endian
9981  rorq(sum, 0x20);
9982  movq(Address(out, offset, Address::times_4,  8), sum);
9983
9984  movq(op1, Address(in, len, Address::times_4,  0));
9985  rorq(op1, 32);
9986  movq(sum, Address(out, offset, Address::times_4,  0));
9987  rorq(sum, 32);
9988  if (UseBMI2Instructions) {
9989    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
9990  }
9991  else {
9992    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
9993  }
9994  // Store back in big endian from little endian
9995  rorq(sum, 0x20);
9996  movq(Address(out, offset, Address::times_4,  0), sum);
9997
9998  jmp(L_first_loop);
9999  bind(L_first_loop_exit);
10000}
10001
10002/**
10003 * Code for BigInteger::mulAdd() intrinsic
10004 *
10005 * rdi: out
10006 * rsi: in
10007 * r11: offs (out.length - offset)
10008 * rcx: len
10009 * r8:  k
10010 * r12: tmp1
10011 * r13: tmp2
10012 * r14: tmp3
10013 * r15: tmp4
10014 * rbx: tmp5
10015 * Multiply the in[] by word k and add to out[], return the carry in rax
10016 */
10017void MacroAssembler::mul_add(Register out, Register in, Register offs,
10018   Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
10019   Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
10020
10021  Label L_carry, L_last_in, L_done;
10022
10023// carry = 0;
10024// for (int j=len-1; j >= 0; j--) {
10025//    long product = (in[j] & LONG_MASK) * kLong +
10026//                   (out[offs] & LONG_MASK) + carry;
10027//    out[offs--] = (int)product;
10028//    carry = product >>> 32;
10029// }
10030//
10031  push(tmp1);
10032  push(tmp2);
10033  push(tmp3);
10034  push(tmp4);
10035  push(tmp5);
10036
10037  Register op2 = tmp2;
10038  const Register sum = tmp3;
10039  const Register op1 = tmp4;
10040  const Register carry =  tmp5;
10041
10042  if (UseBMI2Instructions) {
10043    op2 = rdxReg;
10044    movl(op2, k);
10045  }
10046  else {
10047    movl(op2, k);
10048  }
10049
10050  xorq(carry, carry);
10051
10052  //First loop
10053
10054  //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply
10055  //The carry is in tmp5
10056  mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
10057
10058  //Multiply the trailing in[] entry using 64 bit by 32 bit, if any
10059  decrementl(len);
10060  jccb(Assembler::negative, L_carry);
10061  decrementl(len);
10062  jccb(Assembler::negative, L_last_in);
10063
10064  movq(op1, Address(in, len, Address::times_4,  0));
10065  rorq(op1, 32);
10066
10067  subl(offs, 2);
10068  movq(sum, Address(out, offs, Address::times_4,  0));
10069  rorq(sum, 32);
10070
10071  if (UseBMI2Instructions) {
10072    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
10073  }
10074  else {
10075    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
10076  }
10077
10078  // Store back in big endian from little endian
10079  rorq(sum, 0x20);
10080  movq(Address(out, offs, Address::times_4,  0), sum);
10081
10082  testl(len, len);
10083  jccb(Assembler::zero, L_carry);
10084
10085  //Multiply the last in[] entry, if any
10086  bind(L_last_in);
10087  movl(op1, Address(in, 0));
10088  movl(sum, Address(out, offs, Address::times_4,  -4));
10089
10090  movl(raxReg, k);
10091  mull(op1); //tmp4 * eax -> edx:eax
10092  addl(sum, carry);
10093  adcl(rdxReg, 0);
10094  addl(sum, raxReg);
10095  adcl(rdxReg, 0);
10096  movl(carry, rdxReg);
10097
10098  movl(Address(out, offs, Address::times_4,  -4), sum);
10099
10100  bind(L_carry);
10101  //return tmp5/carry as carry in rax
10102  movl(rax, carry);
10103
10104  bind(L_done);
10105  pop(tmp5);
10106  pop(tmp4);
10107  pop(tmp3);
10108  pop(tmp2);
10109  pop(tmp1);
10110}
10111#endif
10112
10113/**
10114 * Emits code to update CRC-32 with a byte value according to constants in table
10115 *
10116 * @param [in,out]crc   Register containing the crc.
10117 * @param [in]val       Register containing the byte to fold into the CRC.
10118 * @param [in]table     Register containing the table of crc constants.
10119 *
10120 * uint32_t crc;
10121 * val = crc_table[(val ^ crc) & 0xFF];
10122 * crc = val ^ (crc >> 8);
10123 *
10124 */
10125void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
10126  xorl(val, crc);
10127  andl(val, 0xFF);
10128  shrl(crc, 8); // unsigned shift
10129  xorl(crc, Address(table, val, Address::times_4, 0));
10130}
10131
10132/**
10133 * Fold 128-bit data chunk
10134 */
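// A sketch of the fold step below (clmul denotes a 64x64 carry-less multiply):
//   xcrc = clmul(xcrc[127:64], xK[127:64]) ^ clmul(xcrc[63:0], xK[63:0]) ^ load128(buf + offset);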
10135void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
10136  if (UseAVX > 0) {
10137    vpclmulhdq(xtmp, xK, xcrc); // [123:64]
10138    vpclmulldq(xcrc, xK, xcrc); // [63:0]
10139    vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */);
10140    pxor(xcrc, xtmp);
10141  } else {
10142    movdqa(xtmp, xcrc);
10143    pclmulhdq(xtmp, xK);   // [123:64]
10144    pclmulldq(xcrc, xK);   // [63:0]
10145    pxor(xcrc, xtmp);
10146    movdqu(xtmp, Address(buf, offset));
10147    pxor(xcrc, xtmp);
10148  }
10149}
10150
10151void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
10152  if (UseAVX > 0) {
10153    vpclmulhdq(xtmp, xK, xcrc);
10154    vpclmulldq(xcrc, xK, xcrc);
10155    pxor(xcrc, xbuf);
10156    pxor(xcrc, xtmp);
10157  } else {
10158    movdqa(xtmp, xcrc);
10159    pclmulhdq(xtmp, xK);
10160    pclmulldq(xcrc, xK);
10161    pxor(xcrc, xbuf);
10162    pxor(xcrc, xtmp);
10163  }
10164}
10165
10166/**
10167 * 8-bit folds to compute 32-bit CRC
10168 *
10169 * uint64_t xcrc;
10170 * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);
10171 */
10172void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
10173  movdl(tmp, xcrc);
10174  andl(tmp, 0xFF);
10175  movdl(xtmp, Address(table, tmp, Address::times_4, 0));
10176  psrldq(xcrc, 1); // unsigned shift one byte
10177  pxor(xcrc, xtmp);
10178}
10179
10180/**
10181 * uint32_t crc;
10182 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
10183 */
10184void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
10185  movl(tmp, crc);
10186  andl(tmp, 0xFF);
10187  shrl(crc, 8);
10188  xorl(crc, Address(table, tmp, Address::times_4, 0));
10189}
10190
10191/**
10192 * @param crc   register containing existing CRC (32-bit)
10193 * @param buf   register pointing to input byte buffer (byte*)
10194 * @param len   register containing number of bytes
10195 * @param table register that will contain address of CRC table
10196 * @param tmp   scratch register
10197 */
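// Outline of the code below (a sketch): align buf to 16 bytes with the byte-at-a-time
// table update, fold four 128-bit lanes per iteration while at least 64 bytes remain,
// collapse the four lanes into one 128-bit value, reduce it to 32 bits with a
// carry-less multiply followed by eight 8-bit table folds, then finish the remaining
// 0..15 bytes with the byte-at-a-time tail loop.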
10198void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
10199  assert_different_registers(crc, buf, len, table, tmp, rax);
10200
10201  Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
10202  Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
10203
10204  // For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
10205  // context for the registers used, since all instructions below operate in 128-bit mode.
10206  // On EVEX without VL and BW, these instructions will all be AVX.
10207  if (VM_Version::supports_avx512vlbw()) {
10208    movl(tmp, 0xffff);
10209    kmovwl(k1, tmp);
10210  }
10211
10212  lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
10213  notl(crc); // ~crc
10214  cmpl(len, 16);
10215  jcc(Assembler::less, L_tail);
10216
10217  // Align buffer to 16 bytes
10218  movl(tmp, buf);
10219  andl(tmp, 0xF);
10220  jccb(Assembler::zero, L_aligned);
10221  subl(tmp,  16);
10222  addl(len, tmp);
10223
10224  align(4);
10225  BIND(L_align_loop);
10226  movsbl(rax, Address(buf, 0)); // load byte with sign extension
10227  update_byte_crc32(crc, rax, table);
10228  increment(buf);
10229  incrementl(tmp);
10230  jccb(Assembler::less, L_align_loop);
10231
10232  BIND(L_aligned);
10233  movl(tmp, len); // save
10234  shrl(len, 4);
10235  jcc(Assembler::zero, L_tail_restore);
10236
10237  // Fold crc into first bytes of vector
10238  movdqa(xmm1, Address(buf, 0));
10239  movdl(rax, xmm1);
10240  xorl(crc, rax);
10241  if (VM_Version::supports_sse4_1()) {
10242    pinsrd(xmm1, crc, 0);
10243  } else {
10244    pinsrw(xmm1, crc, 0);
10245    shrl(crc, 16);
10246    pinsrw(xmm1, crc, 1);
10247  }
10248  addptr(buf, 16);
10249  subl(len, 4); // len > 0
10250  jcc(Assembler::less, L_fold_tail);
10251
10252  movdqa(xmm2, Address(buf,  0));
10253  movdqa(xmm3, Address(buf, 16));
10254  movdqa(xmm4, Address(buf, 32));
10255  addptr(buf, 48);
10256  subl(len, 3);
10257  jcc(Assembler::lessEqual, L_fold_512b);
10258
10259  // Fold total 512 bits of polynomial on each iteration,
10260  // 128 bits per each of 4 parallel streams.
10261  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32));
10262
10263  align(32);
10264  BIND(L_fold_512b_loop);
10265  fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
10266  fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
10267  fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
10268  fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
10269  addptr(buf, 64);
10270  subl(len, 4);
10271  jcc(Assembler::greater, L_fold_512b_loop);
10272
10273  // Fold 512 bits to 128 bits.
10274  BIND(L_fold_512b);
10275  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
10276  fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2);
10277  fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
10278  fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
10279
10280  // Fold the rest of 128 bits data chunks
10281  BIND(L_fold_tail);
10282  addl(len, 3);
10283  jccb(Assembler::lessEqual, L_fold_128b);
10284  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
10285
10286  BIND(L_fold_tail_loop);
10287  fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
10288  addptr(buf, 16);
10289  decrementl(len);
10290  jccb(Assembler::greater, L_fold_tail_loop);
10291
10292  // Fold 128 bits in xmm1 down into 32 bits in crc register.
10293  BIND(L_fold_128b);
10294  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
10295  if (UseAVX > 0) {
10296    vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
10297    vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
10298    vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
10299  } else {
10300    movdqa(xmm2, xmm0);
10301    pclmulqdq(xmm2, xmm1, 0x1);
10302    movdqa(xmm3, xmm0);
10303    pand(xmm3, xmm2);
10304    pclmulqdq(xmm0, xmm3, 0x1);
10305  }
10306  psrldq(xmm1, 8);
10307  psrldq(xmm2, 4);
10308  pxor(xmm0, xmm1);
10309  pxor(xmm0, xmm2);
10310
10311  // 8 8-bit folds to compute 32-bit CRC.
10312  for (int j = 0; j < 4; j++) {
10313    fold_8bit_crc32(xmm0, table, xmm1, rax);
10314  }
10315  movdl(crc, xmm0); // mov 32 bits to general register
10316  for (int j = 0; j < 4; j++) {
10317    fold_8bit_crc32(crc, table, rax);
10318  }
10319
10320  BIND(L_tail_restore);
10321  movl(len, tmp); // restore
10322  BIND(L_tail);
10323  andl(len, 0xf);
10324  jccb(Assembler::zero, L_exit);
10325
10326  // Fold the rest of bytes
10327  align(4);
10328  BIND(L_tail_loop);
10329  movsbl(rax, Address(buf, 0)); // load byte with sign extension
10330  update_byte_crc32(crc, rax, table);
10331  increment(buf);
10332  decrementl(len);
10333  jccb(Assembler::greater, L_tail_loop);
10334
10335  BIND(L_exit);
10336  notl(crc); // ~crc
10337}
10338
10339#ifdef _LP64
10340// S. Gueron / Information Processing Letters 112 (2012) 184
10341// Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
10342// Input: A 32 bit value B = [byte3, byte2, byte1, byte0].
10343// Output: the 64-bit carry-less product of B * CONST
10344void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n,
10345                                     Register tmp1, Register tmp2, Register tmp3) {
10346  lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
10347  if (n > 0) {
10348    addq(tmp3, n * 256 * 8);
10349  }
10350  //    Q1 = TABLEExt[n][B & 0xFF];
10351  movl(tmp1, in);
10352  andl(tmp1, 0x000000FF);
10353  shll(tmp1, 3);
10354  addq(tmp1, tmp3);
10355  movq(tmp1, Address(tmp1, 0));
10356
10357  //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
10358  movl(tmp2, in);
10359  shrl(tmp2, 8);
10360  andl(tmp2, 0x000000FF);
10361  shll(tmp2, 3);
10362  addq(tmp2, tmp3);
10363  movq(tmp2, Address(tmp2, 0));
10364
10365  shlq(tmp2, 8);
10366  xorq(tmp1, tmp2);
10367
10368  //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
10369  movl(tmp2, in);
10370  shrl(tmp2, 16);
10371  andl(tmp2, 0x000000FF);
10372  shll(tmp2, 3);
10373  addq(tmp2, tmp3);
10374  movq(tmp2, Address(tmp2, 0));
10375
10376  shlq(tmp2, 16);
10377  xorq(tmp1, tmp2);
10378
10379  //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
10380  shrl(in, 24);
10381  andl(in, 0x000000FF);
10382  shll(in, 3);
10383  addq(in, tmp3);
10384  movq(in, Address(in, 0));
10385
10386  shlq(in, 24);
10387  xorq(in, tmp1);
10388  //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
10389}
10390
10391void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
10392                                      Register in_out,
10393                                      uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
10394                                      XMMRegister w_xtmp2,
10395                                      Register tmp1,
10396                                      Register n_tmp2, Register n_tmp3) {
10397  if (is_pclmulqdq_supported) {
10398    movdl(w_xtmp1, in_out); // modified blindly
10399
10400    movl(tmp1, const_or_pre_comp_const_index);
10401    movdl(w_xtmp2, tmp1);
10402    pclmulqdq(w_xtmp1, w_xtmp2, 0);
10403
10404    movdq(in_out, w_xtmp1);
10405  } else {
10406    crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3);
10407  }
10408}
10409
10410// Recombination Alternative 2: No bit-reflections
10411// T1 = (CRC_A * U1) << 1
10412// T2 = (CRC_B * U2) << 1
10413// C1 = T1 >> 32
10414// C2 = T2 >> 32
10415// T1 = T1 & 0xFFFFFFFF
10416// T2 = T2 & 0xFFFFFFFF
10417// T1 = CRC32(0, T1)
10418// T2 = CRC32(0, T2)
10419// C1 = C1 ^ T1
10420// C2 = C2 ^ T2
10421// CRC = C1 ^ C2 ^ CRC_C
10422void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
10423                                     XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
10424                                     Register tmp1, Register tmp2,
10425                                     Register n_tmp3) {
10426  crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
10427  crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
10428  shlq(in_out, 1);
10429  movl(tmp1, in_out);
10430  shrq(in_out, 32);
10431  xorl(tmp2, tmp2);
10432  crc32(tmp2, tmp1, 4);
10433  xorl(in_out, tmp2); // we don't care about upper 32 bit contents here
10434  shlq(in1, 1);
10435  movl(tmp1, in1);
10436  shrq(in1, 32);
10437  xorl(tmp2, tmp2);
10438  crc32(tmp2, tmp1, 4);
10439  xorl(in1, tmp2);
10440  xorl(in_out, in1);
10441  xorl(in_out, in2);
10442}
10443
10444// Set N to a predefined value
10445// Subtract it from the length of the buffer
10446// Execute in a loop:
10447// CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
10448// for i = 1 to N do
10449//  CRC_A = CRC32(CRC_A, A[i])
10450//  CRC_B = CRC32(CRC_B, B[i])
10451//  CRC_C = CRC32(CRC_C, C[i])
10452// end for
10453// Recombine
10454void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
10455                                       Register in_out1, Register in_out2, Register in_out3,
10456                                       Register tmp1, Register tmp2, Register tmp3,
10457                                       XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
10458                                       Register tmp4, Register tmp5,
10459                                       Register n_tmp6) {
10460  Label L_processPartitions;
10461  Label L_processPartition;
10462  Label L_exit;
10463
10464  bind(L_processPartitions);
10465  cmpl(in_out1, 3 * size);
10466  jcc(Assembler::less, L_exit);
10467    xorl(tmp1, tmp1);
10468    xorl(tmp2, tmp2);
10469    movq(tmp3, in_out2);
10470    addq(tmp3, size);
10471
10472    bind(L_processPartition);
10473      crc32(in_out3, Address(in_out2, 0), 8);
10474      crc32(tmp1, Address(in_out2, size), 8);
10475      crc32(tmp2, Address(in_out2, size * 2), 8);
10476      addq(in_out2, 8);
10477      cmpq(in_out2, tmp3);
10478      jcc(Assembler::less, L_processPartition);
10479    crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
10480            w_xtmp1, w_xtmp2, w_xtmp3,
10481            tmp4, tmp5,
10482            n_tmp6);
10483    addq(in_out2, 2 * size);
10484    subl(in_out1, 3 * size);
10485    jmp(L_processPartitions);
10486
10487  bind(L_exit);
10488}
10489#else
10490void MacroAssembler::crc32c_ipl_alg4(Register in_out, uint32_t n,
10491                                     Register tmp1, Register tmp2, Register tmp3,
10492                                     XMMRegister xtmp1, XMMRegister xtmp2) {
10493  lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
10494  if (n > 0) {
10495    addl(tmp3, n * 256 * 8);
10496  }
10497  //    Q1 = TABLEExt[n][B & 0xFF];
10498  movl(tmp1, in_out);
10499  andl(tmp1, 0x000000FF);
10500  shll(tmp1, 3);
10501  addl(tmp1, tmp3);
10502  movq(xtmp1, Address(tmp1, 0));
10503
10504  //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
10505  movl(tmp2, in_out);
10506  shrl(tmp2, 8);
10507  andl(tmp2, 0x000000FF);
10508  shll(tmp2, 3);
10509  addl(tmp2, tmp3);
10510  movq(xtmp2, Address(tmp2, 0));
10511
10512  psllq(xtmp2, 8);
10513  pxor(xtmp1, xtmp2);
10514
10515  //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
10516  movl(tmp2, in_out);
10517  shrl(tmp2, 16);
10518  andl(tmp2, 0x000000FF);
10519  shll(tmp2, 3);
10520  addl(tmp2, tmp3);
10521  movq(xtmp2, Address(tmp2, 0));
10522
10523  psllq(xtmp2, 16);
10524  pxor(xtmp1, xtmp2);
10525
10526  //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
10527  shrl(in_out, 24);
10528  andl(in_out, 0x000000FF);
10529  shll(in_out, 3);
10530  addl(in_out, tmp3);
10531  movq(xtmp2, Address(in_out, 0));
10532
10533  psllq(xtmp2, 24);
10534  pxor(xtmp1, xtmp2); // Result in CXMM
10535  //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
10536}
10537
10538void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
10539                                      Register in_out,
10540                                      uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
10541                                      XMMRegister w_xtmp2,
10542                                      Register tmp1,
10543                                      Register n_tmp2, Register n_tmp3) {
10544  if (is_pclmulqdq_supported) {
10545    movdl(w_xtmp1, in_out);
10546
10547    movl(tmp1, const_or_pre_comp_const_index);
10548    movdl(w_xtmp2, tmp1);
10549    pclmulqdq(w_xtmp1, w_xtmp2, 0);
10550    // Keep result in XMM since GPR is 32 bit in length
10551  } else {
10552    crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3, w_xtmp1, w_xtmp2);
10553  }
10554}
10555
10556void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
10557                                     XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
10558                                     Register tmp1, Register tmp2,
10559                                     Register n_tmp3) {
10560  crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
10561  crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
10562
10563  psllq(w_xtmp1, 1);
10564  movdl(tmp1, w_xtmp1);
10565  psrlq(w_xtmp1, 32);
10566  movdl(in_out, w_xtmp1);
10567
10568  xorl(tmp2, tmp2);
10569  crc32(tmp2, tmp1, 4);
10570  xorl(in_out, tmp2);
10571
10572  psllq(w_xtmp2, 1);
10573  movdl(tmp1, w_xtmp2);
10574  psrlq(w_xtmp2, 32);
10575  movdl(in1, w_xtmp2);
10576
10577  xorl(tmp2, tmp2);
10578  crc32(tmp2, tmp1, 4);
10579  xorl(in1, tmp2);
10580  xorl(in_out, in1);
10581  xorl(in_out, in2);
10582}
10583
10584void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
10585                                       Register in_out1, Register in_out2, Register in_out3,
10586                                       Register tmp1, Register tmp2, Register tmp3,
10587                                       XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
10588                                       Register tmp4, Register tmp5,
10589                                       Register n_tmp6) {
10590  Label L_processPartitions;
10591  Label L_processPartition;
10592  Label L_exit;
10593
10594  bind(L_processPartitions);
10595  cmpl(in_out1, 3 * size);
10596  jcc(Assembler::less, L_exit);
10597    xorl(tmp1, tmp1);
10598    xorl(tmp2, tmp2);
10599    movl(tmp3, in_out2);
10600    addl(tmp3, size);
10601
10602    bind(L_processPartition);
10603      crc32(in_out3, Address(in_out2, 0), 4);
10604      crc32(tmp1, Address(in_out2, size), 4);
10605      crc32(tmp2, Address(in_out2, size*2), 4);
10606      crc32(in_out3, Address(in_out2, 0+4), 4);
10607      crc32(tmp1, Address(in_out2, size+4), 4);
10608      crc32(tmp2, Address(in_out2, size*2+4), 4);
10609      addl(in_out2, 8);
10610      cmpl(in_out2, tmp3);
10611      jcc(Assembler::less, L_processPartition);
10612
10613        push(tmp3);
10614        push(in_out1);
10615        push(in_out2);
10616        tmp4 = tmp3;
10617        tmp5 = in_out1;
10618        n_tmp6 = in_out2;
10619
10620      crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
10621            w_xtmp1, w_xtmp2, w_xtmp3,
10622            tmp4, tmp5,
10623            n_tmp6);
10624
10625        pop(in_out2);
10626        pop(in_out1);
10627        pop(tmp3);
10628
10629    addl(in_out2, 2 * size);
10630    subl(in_out1, 3 * size);
10631    jmp(L_processPartitions);
10632
10633  bind(L_exit);
10634}
10635#endif // LP64
10636
10637#ifdef _LP64
10638// Algorithm 2: Pipelined usage of the CRC32 instruction.
10639// Input: A buffer I of L bytes.
10640// Output: the CRC32C value of the buffer.
10641// Notations:
10642// Write L = 24N + r, with N = floor (L/24).
10643// r = L mod 24 (0 <= r < 24).
10644 // Consider I as the concatenation of A|B|C|R, where A, B and C each
10645 // consist of N quadwords, and R consists of r bytes.
10646// A[j] = I [8j+7:8j], j= 0, 1, ..., N-1
10647 // B[j] = I [8N + 8j+7:8N + 8j], j= 0, 1, ..., N-1
10648 // C[j] = I [16N + 8j+7:16N + 8j], j= 0, 1, ..., N-1
10649 // if r > 0, R[j] = I [24N + j], j= 0, 1, ..., r-1
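// Rough outline of the code below (illustrative): the buffer is consumed in
// chunks of decreasing size (CRC32C_HIGH, CRC32C_MIDDLE, CRC32C_LOW), each
// chunk split into three partitions whose CRCs are computed in parallel and
// then merged; the bytes that remain are folded in word-by-word and finally
// byte-by-byte.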
10650void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
10651                                          Register tmp1, Register tmp2, Register tmp3,
10652                                          Register tmp4, Register tmp5, Register tmp6,
10653                                          XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
10654                                          bool is_pclmulqdq_supported) {
10655  uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
10656  Label L_wordByWord;
10657  Label L_byteByByteProlog;
10658  Label L_byteByByte;
10659  Label L_exit;
10660
10661  if (is_pclmulqdq_supported) {
10662    const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
10663    const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1);
10664
10665    const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
10666    const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
10667
10668    const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
10669    const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
10670    assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
10671  } else {
10672    const_or_pre_comp_const_index[0] = 1;
10673    const_or_pre_comp_const_index[1] = 0;
10674
10675    const_or_pre_comp_const_index[2] = 3;
10676    const_or_pre_comp_const_index[3] = 2;
10677
10678    const_or_pre_comp_const_index[4] = 5;
10679    const_or_pre_comp_const_index[5] = 4;
10680  }
10681  crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
10682                    in2, in1, in_out,
10683                    tmp1, tmp2, tmp3,
10684                    w_xtmp1, w_xtmp2, w_xtmp3,
10685                    tmp4, tmp5,
10686                    tmp6);
10687  crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
10688                    in2, in1, in_out,
10689                    tmp1, tmp2, tmp3,
10690                    w_xtmp1, w_xtmp2, w_xtmp3,
10691                    tmp4, tmp5,
10692                    tmp6);
10693  crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
10694                    in2, in1, in_out,
10695                    tmp1, tmp2, tmp3,
10696                    w_xtmp1, w_xtmp2, w_xtmp3,
10697                    tmp4, tmp5,
10698                    tmp6);
10699  movl(tmp1, in2);
10700  andl(tmp1, 0x00000007);
10701  negl(tmp1);
10702  addl(tmp1, in2);
10703  addq(tmp1, in1);
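  // tmp1 = in1 + (in2 & ~7): end address for the word-by-word loop below;
  // the final (in2 & 7) bytes are left for the byte-by-byte loop.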
10704
10705  BIND(L_wordByWord);
10706  cmpq(in1, tmp1);
10707  jcc(Assembler::greaterEqual, L_byteByByteProlog);
10708    crc32(in_out, Address(in1, 0), 4);
10709    addq(in1, 4);
10710    jmp(L_wordByWord);
10711
10712  BIND(L_byteByByteProlog);
10713  andl(in2, 0x00000007);
10714  movl(tmp2, 1);
10715
10716  BIND(L_byteByByte);
10717  cmpl(tmp2, in2);
10718  jccb(Assembler::greater, L_exit);
10719    crc32(in_out, Address(in1, 0), 1);
10720    incq(in1);
10721    incl(tmp2);
10722    jmp(L_byteByByte);
10723
10724  BIND(L_exit);
10725}
10726#else
10727void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
10728                                          Register tmp1, Register  tmp2, Register tmp3,
10729                                          Register tmp4, Register  tmp5, Register tmp6,
10730                                          XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
10731                                          bool is_pclmulqdq_supported) {
10732  uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
10733  Label L_wordByWord;
10734  Label L_byteByByteProlog;
10735  Label L_byteByByte;
10736  Label L_exit;
10737
10738  if (is_pclmulqdq_supported) {
10739    const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
10740    const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1);
10741
10742    const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
10743    const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
10744
10745    const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
10746    const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
10747  } else {
10748    const_or_pre_comp_const_index[0] = 1;
10749    const_or_pre_comp_const_index[1] = 0;
10750
10751    const_or_pre_comp_const_index[2] = 3;
10752    const_or_pre_comp_const_index[3] = 2;
10753
10754    const_or_pre_comp_const_index[4] = 5;
10755    const_or_pre_comp_const_index[5] = 4;
10756  }
10757  crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
10758                    in2, in1, in_out,
10759                    tmp1, tmp2, tmp3,
10760                    w_xtmp1, w_xtmp2, w_xtmp3,
10761                    tmp4, tmp5,
10762                    tmp6);
10763  crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
10764                    in2, in1, in_out,
10765                    tmp1, tmp2, tmp3,
10766                    w_xtmp1, w_xtmp2, w_xtmp3,
10767                    tmp4, tmp5,
10768                    tmp6);
10769  crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
10770                    in2, in1, in_out,
10771                    tmp1, tmp2, tmp3,
10772                    w_xtmp1, w_xtmp2, w_xtmp3,
10773                    tmp4, tmp5,
10774                    tmp6);
10775  movl(tmp1, in2);
10776  andl(tmp1, 0x00000007);
10777  negl(tmp1);
10778  addl(tmp1, in2);
10779  addl(tmp1, in1);
10780
10781  BIND(L_wordByWord);
10782  cmpl(in1, tmp1);
10783  jcc(Assembler::greaterEqual, L_byteByByteProlog);
10784    crc32(in_out, Address(in1,0), 4);
10785    addl(in1, 4);
10786    jmp(L_wordByWord);
10787
10788  BIND(L_byteByByteProlog);
10789  andl(in2, 0x00000007);
10790  movl(tmp2, 1);
10791
10792  BIND(L_byteByByte);
10793  cmpl(tmp2, in2);
10794  jccb(Assembler::greater, L_exit);
10795    movb(tmp1, Address(in1, 0));
10796    crc32(in_out, tmp1, 1);
10797    incl(in1);
10798    incl(tmp2);
10799    jmp(L_byteByByte);
10800
10801  BIND(L_exit);
10802}
10803#endif // LP64
10804#undef BIND
10805#undef BLOCK_COMMENT
10806
10807// Compress char[] array to byte[].
10808//   ..\jdk\src\java.base\share\classes\java\lang\StringUTF16.java
10809//   @HotSpotIntrinsicCandidate
10810//   private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
10811//     for (int i = 0; i < len; i++) {
10812//       int c = src[srcOff++];
10813//       if (c >>> 8 != 0) {
10814//         return 0;
10815//       }
10816//       dst[dstOff++] = (byte)c;
10817//     }
10818//     return len;
10819//   }
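// Three code paths below: an AVX-512/BMI2 path using masked loads and
// evpmovwb stores, an SSE4.2 path compressing 16 and then 8 chars at a time
// with packuswb, and a scalar char-at-a-time tail.  Any char above 0xFF
// aborts compression and returns 0.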
10820void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
10821  XMMRegister tmp1Reg, XMMRegister tmp2Reg,
10822  XMMRegister tmp3Reg, XMMRegister tmp4Reg,
10823  Register tmp5, Register result) {
10824  Label copy_chars_loop, return_length, return_zero, done, below_threshold;
10825
10826  // rsi: src
10827  // rdi: dst
10828  // rdx: len
10829  // rcx: tmp5
10830  // rax: result
10831
10832  // rsi holds start addr of source char[] to be compressed
10833  // rdi holds start addr of destination byte[]
10834  // rdx holds length
10835
10836  assert(len != result, "len and result registers must differ");
10837
10838  // save length for return
10839  push(len);
10840
10841  if ((UseAVX > 2) && // AVX512
10842    VM_Version::supports_avx512vlbw() &&
10843    VM_Version::supports_bmi2()) {
10844
10845    set_vector_masking();  // opening of the stub context for programming mask registers
10846
10847    Label copy_32_loop, copy_loop_tail, restore_k1_return_zero;
10848
10849    // alignment
10850    Label post_alignment;
10851
10852    // if the length of the string is less than 32, handle it the
10853    // old-fashioned way
10854    testl(len, -32);
10855    jcc(Assembler::zero, below_threshold);
10856
10857    // First check whether a character is compressible (<= 0xFF).
10858    // Create mask to test for Unicode chars inside zmm vector
10859    movl(result, 0x00FF);
10860    evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit);
10861
10862    // Save k1
10863    kmovql(k3, k1);
10864
10865    testl(len, -64);
10866    jcc(Assembler::zero, post_alignment);
10867
10868    movl(tmp5, dst);
10869    andl(tmp5, (32 - 1));
10870    negl(tmp5);
10871    andl(tmp5, (32 - 1));
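    // tmp5 = (-dst) & 31: number of chars still needed to bring dst up to
    // 32-byte alignment (0 if it is already aligned)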
10872
10873    // bail out when there is nothing to be done
10874    testl(tmp5, 0xFFFFFFFF);
10875    jcc(Assembler::zero, post_alignment);
10876
10877    // ~(~0 << tmp5), where tmp5 is the # of chars to process for alignment
10878    movl(result, 0xFFFFFFFF);
10879    shlxl(result, result, tmp5);
10880    notl(result);
10881    kmovdl(k1, result);
10882
10883    evmovdquw(tmp1Reg, k1, Address(src, 0), Assembler::AVX_512bit);
10884    evpcmpuw(k2, k1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
10885    ktestd(k2, k1);
10886    jcc(Assembler::carryClear, restore_k1_return_zero);
10887
10888    evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit);
10889
10890    addptr(src, tmp5);
10891    addptr(src, tmp5);
10892    addptr(dst, tmp5);
10893    subl(len, tmp5);
10894
10895    bind(post_alignment);
10896    // end of alignment
10897
10898    movl(tmp5, len);
10899    andl(tmp5, (32 - 1));    // tail count (in chars)
10900    andl(len, ~(32 - 1));    // vector count (in chars)
10901    jcc(Assembler::zero, copy_loop_tail);
10902
10903    lea(src, Address(src, len, Address::times_2));
10904    lea(dst, Address(dst, len, Address::times_1));
10905    negptr(len);
10906
10907    bind(copy_32_loop);
10908    evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit);
10909    evpcmpuw(k2, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
10910    kortestdl(k2, k2);
10911    jcc(Assembler::carryClear, restore_k1_return_zero);
10912
10913    // All elements in the current chunk are valid candidates for
10914    // compression. Write the truncated byte elements to memory.
10915    evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
10916    addptr(len, 32);
10917    jcc(Assembler::notZero, copy_32_loop);
10918
10919    bind(copy_loop_tail);
10920    // bail out when there is nothing to be done
10921    testl(tmp5, 0xFFFFFFFF);
10922    // Restore k1
10923    kmovql(k1, k3);
10924    jcc(Assembler::zero, return_length);
10925
10926    movl(len, tmp5);
10927
10928    // ~(~0 << len), where len is the # of remaining elements to process
10929    movl(result, 0xFFFFFFFF);
10930    shlxl(result, result, len);
10931    notl(result);
10932
10933    kmovdl(k1, result);
10934
10935    evmovdquw(tmp1Reg, k1, Address(src, 0), Assembler::AVX_512bit);
10936    evpcmpuw(k2, k1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
10937    ktestd(k2, k1);
10938    jcc(Assembler::carryClear, restore_k1_return_zero);
10939
10940    evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit);
10941    // Restore k1
10942    kmovql(k1, k3);
10943    jmp(return_length);
10944
10945    bind(restore_k1_return_zero);
10946    // Restore k1
10947    kmovql(k1, k3);
10948    jmp(return_zero);
10949
10950    clear_vector_masking();   // closing of the stub context for programming mask registers
10951  }
10952  if (UseSSE42Intrinsics) {
10953    Label copy_32_loop, copy_16, copy_tail;
10954
10955    bind(below_threshold);
10956
10957    movl(result, len);
10958
10959    movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vectors
10960
10961    // vectored compression
10962    andl(len, 0xfffffff0);    // vector count (in chars)
10963    andl(result, 0x0000000f);    // tail count (in chars)
10964    testl(len, len);
10965    jccb(Assembler::zero, copy_16);
10966
10967    // compress 16 chars per iter
10968    movdl(tmp1Reg, tmp5);
10969    pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
10970    pxor(tmp4Reg, tmp4Reg);
10971
10972    lea(src, Address(src, len, Address::times_2));
10973    lea(dst, Address(dst, len, Address::times_1));
10974    negptr(len);
10975
10976    bind(copy_32_loop);
10977    movdqu(tmp2Reg, Address(src, len, Address::times_2));     // load 1st 8 characters
10978    por(tmp4Reg, tmp2Reg);
10979    movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters
10980    por(tmp4Reg, tmp3Reg);
10981    ptest(tmp4Reg, tmp1Reg);       // check for Unicode chars in next vector
10982    jcc(Assembler::notZero, return_zero);
10983    packuswb(tmp2Reg, tmp3Reg);    // only ASCII chars; compress each to 1 byte
10984    movdqu(Address(dst, len, Address::times_1), tmp2Reg);
10985    addptr(len, 16);
10986    jcc(Assembler::notZero, copy_32_loop);
10987
10988    // compress next vector of 8 chars (if any)
10989    bind(copy_16);
10990    movl(len, result);
10991    andl(len, 0xfffffff8);    // vector count (in chars)
10992    andl(result, 0x00000007);    // tail count (in chars)
10993    testl(len, len);
10994    jccb(Assembler::zero, copy_tail);
10995
10996    movdl(tmp1Reg, tmp5);
10997    pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
10998    pxor(tmp3Reg, tmp3Reg);
10999
11000    movdqu(tmp2Reg, Address(src, 0));
11001    ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in vector
11002    jccb(Assembler::notZero, return_zero);
11003    packuswb(tmp2Reg, tmp3Reg);    // only LATIN1 chars; compress each to 1 byte
11004    movq(Address(dst, 0), tmp2Reg);
11005    addptr(src, 16);
11006    addptr(dst, 8);
11007
11008    bind(copy_tail);
11009    movl(len, result);
11010  }
11011  // compress 1 char per iter
11012  testl(len, len);
11013  jccb(Assembler::zero, return_length);
11014  lea(src, Address(src, len, Address::times_2));
11015  lea(dst, Address(dst, len, Address::times_1));
11016  negptr(len);
11017
11018  bind(copy_chars_loop);
11019  load_unsigned_short(result, Address(src, len, Address::times_2));
11020  testl(result, 0xff00);      // check if Unicode char
11021  jccb(Assembler::notZero, return_zero);
11022  movb(Address(dst, len, Address::times_1), result);  // ASCII char; compress to 1 byte
11023  increment(len);
11024  jcc(Assembler::notZero, copy_chars_loop);
11025
11026  // if compression succeeded, return length
11027  bind(return_length);
11028  pop(result);
11029  jmpb(done);
11030
11031  // if compression failed, return 0
11032  bind(return_zero);
11033  xorl(result, result);
11034  addptr(rsp, wordSize);
11035
11036  bind(done);
11037}
11038
11039// Inflate byte[] array to char[].
11040//   ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java
11041//   @HotSpotIntrinsicCandidate
11042//   private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
11043//     for (int i = 0; i < len; i++) {
11044//       dst[dstOff++] = (char)(src[srcOff++] & 0xff);
11045//     }
11046//   }
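// The code below mirrors char_array_compress: an AVX-512/BMI2 path using
// masked vpmovzxbw, an SSE4.2 path (with an AVX2 variant) inflating 16 or 8
// chars per iteration, and a scalar byte-at-a-time tail.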
11047void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
11048  XMMRegister tmp1, Register tmp2) {
11049  Label copy_chars_loop, done, below_threshold;
11050  // rsi: src
11051  // rdi: dst
11052  // rdx: len
11053  // rcx: tmp2
11054
11055  // rsi holds start addr of source byte[] to be inflated
11056  // rdi holds start addr of destination char[]
11057  // rdx holds length
11058  assert_different_registers(src, dst, len, tmp2);
11059
11060  if ((UseAVX > 2) && // AVX512
11061    VM_Version::supports_avx512vlbw() &&
11062    VM_Version::supports_bmi2()) {
11063
11064    set_vector_masking();  // opening of the stub context for programming mask registers
11065
11066    Label copy_32_loop, copy_tail;
11067    Register tmp3_aliased = len;
11068
11069    // if the length of the string is less than 16, handle it the
11070    // old-fashioned way
11071    testl(len, -16);
11072    jcc(Assembler::zero, below_threshold);
11073
11074    // Pre-calculate the loop bounds so that the main loop needs only one
11075    // arithmetic operation per iteration
11076    movl(tmp2, len);
11077    andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
11078    andl(len, -32);     // vector count
11079    jccb(Assembler::zero, copy_tail);
11080
11081    lea(src, Address(src, len, Address::times_1));
11082    lea(dst, Address(dst, len, Address::times_2));
11083    negptr(len);
11084
11085
11086    // inflate 32 chars per iter
11087    bind(copy_32_loop);
11088    vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);
11089    evmovdquw(Address(dst, len, Address::times_2), tmp1, Assembler::AVX_512bit);
11090    addptr(len, 32);
11091    jcc(Assembler::notZero, copy_32_loop);
11092
11093    bind(copy_tail);
11094    // bail out when there is nothing to be done
11095    testl(tmp2, -1); // we don't destroy the contents of tmp2 here
11096    jcc(Assembler::zero, done);
11097
11098    // Save k1
11099    kmovql(k2, k1);
11100
11101    // ~(~0 << length), where length is the # of remaining elements to process
11102    movl(tmp3_aliased, -1);
11103    shlxl(tmp3_aliased, tmp3_aliased, tmp2);
11104    notl(tmp3_aliased);
11105    kmovdl(k1, tmp3_aliased);
11106    evpmovzxbw(tmp1, k1, Address(src, 0), Assembler::AVX_512bit);
11107    evmovdquw(Address(dst, 0), k1, tmp1, Assembler::AVX_512bit);
11108
11109    // Restore k1
11110    kmovql(k1, k2);
11111    jmp(done);
11112
11113    clear_vector_masking();   // closing of the stub context for programming mask registers
11114  }
11115  if (UseSSE42Intrinsics) {
11116    Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;
11117
11118    movl(tmp2, len);
11119
11120    if (UseAVX > 1) {
11121      andl(tmp2, (16 - 1));
11122      andl(len, -16);
11123      jccb(Assembler::zero, copy_new_tail);
11124    } else {
11125      andl(tmp2, 0x00000007);   // tail count (in chars)
11126      andl(len, 0xfffffff8);    // vector count (in chars)
11127      jccb(Assembler::zero, copy_tail);
11128    }
11129
11130    // vectored inflation
11131    lea(src, Address(src, len, Address::times_1));
11132    lea(dst, Address(dst, len, Address::times_2));
11133    negptr(len);
11134
11135    if (UseAVX > 1) {
11136      bind(copy_16_loop);
11137      vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit);
11138      vmovdqu(Address(dst, len, Address::times_2), tmp1);
11139      addptr(len, 16);
11140      jcc(Assembler::notZero, copy_16_loop);
11141
11142      bind(below_threshold);
11143      bind(copy_new_tail);
11144      if ((UseAVX > 2) &&
11145        VM_Version::supports_avx512vlbw() &&
11146        VM_Version::supports_bmi2()) {
11147        movl(tmp2, len);
11148      } else {
11149        movl(len, tmp2);
11150      }
11151      andl(tmp2, 0x00000007);
11152      andl(len, 0xFFFFFFF8);
11153      jccb(Assembler::zero, copy_tail);
11154
11155      pmovzxbw(tmp1, Address(src, 0));
11156      movdqu(Address(dst, 0), tmp1);
11157      addptr(src, 8);
11158      addptr(dst, 2 * 8);
11159
11160      jmp(copy_tail, true);
11161    }
11162
11163    // inflate 8 chars per iter
11164    bind(copy_8_loop);
11165    pmovzxbw(tmp1, Address(src, len, Address::times_1));  // unpack to 8 words
11166    movdqu(Address(dst, len, Address::times_2), tmp1);
11167    addptr(len, 8);
11168    jcc(Assembler::notZero, copy_8_loop);
11169
11170    bind(copy_tail);
11171    movl(len, tmp2);
11172
11173    cmpl(len, 4);
11174    jccb(Assembler::less, copy_bytes);
11175
11176    movdl(tmp1, Address(src, 0));  // load 4 byte chars
11177    pmovzxbw(tmp1, tmp1);
11178    movq(Address(dst, 0), tmp1);
11179    subptr(len, 4);
11180    addptr(src, 4);
11181    addptr(dst, 8);
11182
11183    bind(copy_bytes);
11184  }
11185  testl(len, len);
11186  jccb(Assembler::zero, done);
11187  lea(src, Address(src, len, Address::times_1));
11188  lea(dst, Address(dst, len, Address::times_2));
11189  negptr(len);
11190
11191  // inflate 1 char per iter
11192  bind(copy_chars_loop);
11193  load_unsigned_byte(tmp2, Address(src, len, Address::times_1));  // load byte char
11194  movw(Address(dst, len, Address::times_2), tmp2);  // inflate byte char to word
11195  increment(len);
11196  jcc(Assembler::notZero, copy_chars_loop);
11197
11198  bind(done);
11199}
11200
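// Returns the logically opposite condition, e.g.
// negate_condition(Assembler::less) == Assembler::greaterEqual; used to
// invert the sense of a conditional branch.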
11201Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
11202  switch (cond) {
11203    // Note some conditions are synonyms for others
11204    case Assembler::zero:         return Assembler::notZero;
11205    case Assembler::notZero:      return Assembler::zero;
11206    case Assembler::less:         return Assembler::greaterEqual;
11207    case Assembler::lessEqual:    return Assembler::greater;
11208    case Assembler::greater:      return Assembler::lessEqual;
11209    case Assembler::greaterEqual: return Assembler::less;
11210    case Assembler::below:        return Assembler::aboveEqual;
11211    case Assembler::belowEqual:   return Assembler::above;
11212    case Assembler::above:        return Assembler::belowEqual;
11213    case Assembler::aboveEqual:   return Assembler::below;
11214    case Assembler::overflow:     return Assembler::noOverflow;
11215    case Assembler::noOverflow:   return Assembler::overflow;
11216    case Assembler::negative:     return Assembler::positive;
11217    case Assembler::positive:     return Assembler::negative;
11218    case Assembler::parity:       return Assembler::noParity;
11219    case Assembler::noParity:     return Assembler::parity;
11220  }
11221  ShouldNotReachHere(); return Assembler::overflow;
11222}
11223
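// SkipIfEqual emits a cmp8/jcc pair in its constructor and binds the target
// label in its destructor, so any code assembled during the object's lifetime
// is skipped at runtime whenever *flag_addr == value.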
11224SkipIfEqual::SkipIfEqual(
11225    MacroAssembler* masm, const bool* flag_addr, bool value) {
11226  _masm = masm;
11227  _masm->cmp8(ExternalAddress((address)flag_addr), value);
11228  _masm->jcc(Assembler::equal, _label);
11229}
11230
11231SkipIfEqual::~SkipIfEqual() {
11232  _masm->bind(_label);
11233}
11234
11235// 32-bit Windows has its own fast-path implementation
11236// of get_thread
11237#if !defined(WIN32) || defined(_LP64)
11238
11239// This is simply a call to Thread::current()
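// The caller-saved registers that the runtime call could clobber are
// preserved around it; the result comes back in rax and is moved into
// `thread` when a different register was requested.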
11240void MacroAssembler::get_thread(Register thread) {
11241  if (thread != rax) {
11242    push(rax);
11243  }
11244  LP64_ONLY(push(rdi);)
11245  LP64_ONLY(push(rsi);)
11246  push(rdx);
11247  push(rcx);
11248#ifdef _LP64
11249  push(r8);
11250  push(r9);
11251  push(r10);
11252  push(r11);
11253#endif
11254
11255  MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0);
11256
11257#ifdef _LP64
11258  pop(r11);
11259  pop(r10);
11260  pop(r9);
11261  pop(r8);
11262#endif
11263  pop(rcx);
11264  pop(rdx);
11265  LP64_ONLY(pop(rsi);)
11266  LP64_ONLY(pop(rdi);)
11267  if (thread != rax) {
11268    mov(thread, rax);
11269    pop(rax);
11270  }
11271}
11272
11273#endif
11274