macroAssembler_x86.cpp revision 7377:c42a0b8babb4
/*
 * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "compiler/disassembler.hpp"
#include "gc_interface/collectedHeap.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/cardTableModRefBS.hpp"
#include "memory/resourceArea.hpp"
#include "memory/universe.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/interfaceSupport.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/os.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/macros.hpp"
#if INCLUDE_ALL_GCS
#include "gc_implementation/g1/g1CollectedHeap.inline.hpp"
#include "gc_implementation/g1/g1SATBCardTableModRefBS.hpp"
#include "gc_implementation/g1/heapRegion.hpp"
#endif // INCLUDE_ALL_GCS

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

PRAGMA_FORMAT_MUTE_WARNINGS_FOR_GCC

#ifdef ASSERT
bool AbstractAssembler::pd_check_instruction_mark() { return true; }
#endif

static Assembler::Condition reverse[] = {
    Assembler::noOverflow     /* overflow      = 0x0 */ ,
    Assembler::overflow       /* noOverflow    = 0x1 */ ,
    Assembler::aboveEqual     /* carrySet      = 0x2, below         = 0x2 */ ,
    Assembler::below          /* aboveEqual    = 0x3, carryClear    = 0x3 */ ,
    Assembler::notZero        /* zero          = 0x4, equal         = 0x4 */ ,
    Assembler::zero           /* notZero       = 0x5, notEqual      = 0x5 */ ,
    Assembler::above          /* belowEqual    = 0x6 */ ,
    Assembler::belowEqual     /* above         = 0x7 */ ,
    Assembler::positive       /* negative      = 0x8 */ ,
    Assembler::negative       /* positive      = 0x9 */ ,
    Assembler::noParity       /* parity        = 0xa */ ,
    Assembler::parity         /* noParity      = 0xb */ ,
    Assembler::greaterEqual   /* less          = 0xc */ ,
    Assembler::less           /* greaterEqual  = 0xd */ ,
    Assembler::greater        /* lessEqual     = 0xe */ ,
    Assembler::lessEqual      /* greater       = 0xf */

};


// Implementation of MacroAssembler

// First come all the versions that have distinct 32-bit and 64-bit implementations,
// unless the difference is trivial (1 line or so).

#ifndef _LP64

// 32bit versions

Address MacroAssembler::as_Address(AddressLiteral adr) {
  return Address(adr.target(), adr.rspec());
}

Address MacroAssembler::as_Address(ArrayAddress adr) {
  return Address::make_array(adr);
}

void MacroAssembler::call_VM_leaf_base(address entry_point,
                                       int number_of_arguments) {
  call(RuntimeAddress(entry_point));
  increment(rsp, number_of_arguments * wordSize);
}

void MacroAssembler::cmpklass(Address src1, Metadata* obj) {
  cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::cmpklass(Register src1, Metadata* obj) {
  cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::cmpoop(Address src1, jobject obj) {
  cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::cmpoop(Register src1, jobject obj) {
  cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::extend_sign(Register hi, Register lo) {
  // According to Intel Doc. AP-526, "Integer Divide", p.18.
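  // (CDQ sign-extends EAX into EDX, which is equivalent to the movl/sarl(31)
  // sequence below in a single instruction; hence the special case when the
  // operands are already in rdx:rax on P6 and later.)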
  if (VM_Version::is_P6() && hi == rdx && lo == rax) {
    cdql();
  } else {
    movl(hi, lo);
    sarl(hi, 31);
  }
}

void MacroAssembler::jC2(Register tmp, Label& L) {
  // set parity bit if FPU flag C2 is set (via rax)
  save_rax(tmp);
  fwait(); fnstsw_ax();
  sahf();
  restore_rax(tmp);
  // branch
  jcc(Assembler::parity, L);
}

void MacroAssembler::jnC2(Register tmp, Label& L) {
  // set parity bit if FPU flag C2 is set (via rax)
  save_rax(tmp);
  fwait(); fnstsw_ax();
  sahf();
  restore_rax(tmp);
  // branch
  jcc(Assembler::noParity, L);
}

// 32bit can do a case table jump in one instruction but we no longer allow the base
// to be installed in the Address class
void MacroAssembler::jump(ArrayAddress entry) {
  jmp(as_Address(entry));
}

// Note: y_lo will be destroyed
void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
  // Long compare for Java (semantics as described in JVM spec.)
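  // Result convention (left in x_hi): -1 if x < y, 0 if x == y, +1 if x > y,
  // where x = (x_hi:x_lo) and y = (y_hi:y_lo) are signed 64-bit values.
  // The high words are compared as signed values, the low words as unsigned.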
  Label high, low, done;

  cmpl(x_hi, y_hi);
  jcc(Assembler::less, low);
  jcc(Assembler::greater, high);
  // x_hi is the return register
  xorl(x_hi, x_hi);
  cmpl(x_lo, y_lo);
  jcc(Assembler::below, low);
  jcc(Assembler::equal, done);

  bind(high);
  xorl(x_hi, x_hi);
  increment(x_hi);
  jmp(done);

  bind(low);
  xorl(x_hi, x_hi);
  decrementl(x_hi);

  bind(done);
}

void MacroAssembler::lea(Register dst, AddressLiteral src) {
    mov_literal32(dst, (int32_t)src.target(), src.rspec());
}

void MacroAssembler::lea(Address dst, AddressLiteral adr) {
  // leal(dst, as_Address(adr));
  // see note in movl as to why we must use a move
  mov_literal32(dst, (int32_t) adr.target(), adr.rspec());
}

void MacroAssembler::leave() {
  mov(rsp, rbp);
  pop(rbp);
}

void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
  // Multiplication of two Java long values stored on the stack
  // as illustrated below. Result is in rdx:rax.
  //
  // rsp ---> [  ??  ] \               \
  //            ....    | y_rsp_offset  |
  //          [ y_lo ] /  (in bytes)    | x_rsp_offset
  //          [ y_hi ]                  | (in bytes)
  //            ....                    |
  //          [ x_lo ]                 /
  //          [ x_hi ]
  //            ....
  //
  // Basic idea: lo(result) = lo(x_lo * y_lo)
  //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
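  //
  // (This follows from expanding (x_hi*2^32 + x_lo) * (y_hi*2^32 + y_lo) mod 2^64:
  //  the x_hi*y_hi term is shifted out entirely, and only the low halves of the
  //  two cross products can reach the high word of the 64-bit result.)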
  Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
  Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
  Label quick;
  // load x_hi, y_hi and check if quick
  // multiplication is possible
  movl(rbx, x_hi);
  movl(rcx, y_hi);
  movl(rax, rbx);
  orl(rbx, rcx);                                 // rbx = 0 <=> x_hi = 0 and y_hi = 0
  jcc(Assembler::zero, quick);                   // if rbx = 0 do quick multiply
  // do full multiplication
  // 1st step
  mull(y_lo);                                    // x_hi * y_lo
  movl(rbx, rax);                                // save lo(x_hi * y_lo) in rbx
  // 2nd step
  movl(rax, x_lo);
  mull(rcx);                                     // x_lo * y_hi
  addl(rbx, rax);                                // add lo(x_lo * y_hi) to rbx
  // 3rd step
  bind(quick);                                   // note: rbx = 0 if quick multiply!
  movl(rax, x_lo);
  mull(y_lo);                                    // x_lo * y_lo
  addl(rdx, rbx);                                // correct hi(x_lo * y_lo)
}

void MacroAssembler::lneg(Register hi, Register lo) {
  negl(lo);
  adcl(hi, 0);
  negl(hi);
}

void MacroAssembler::lshl(Register hi, Register lo) {
  // Java shift left long support (semantics as described in JVM spec., p.305)
  // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
  // shift value is in rcx !
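  // Example: for s = 40 the code first moves lo into hi (a shift by 32) and then
  // lets shld/shl shift by s mod 32 = 8, which yields the full x << 40.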
  assert(hi != rcx, "must not use rcx");
  assert(lo != rcx, "must not use rcx");
  const Register s = rcx;                        // shift count
  const int      n = BitsPerWord;
  Label L;
  andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
  cmpl(s, n);                                    // if (s < n)
  jcc(Assembler::less, L);                       // else (s >= n)
  movl(hi, lo);                                  // x := x << n
  xorl(lo, lo);
  // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
  bind(L);                                       // s (mod n) < n
  shldl(hi, lo);                                 // x := x << s
  shll(lo);
}


void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
  // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
  // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
  assert(hi != rcx, "must not use rcx");
  assert(lo != rcx, "must not use rcx");
  const Register s = rcx;                        // shift count
  const int      n = BitsPerWord;
  Label L;
  andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
  cmpl(s, n);                                    // if (s < n)
  jcc(Assembler::less, L);                       // else (s >= n)
  movl(lo, hi);                                  // x := x >> n
  if (sign_extension) sarl(hi, 31);
  else                xorl(hi, hi);
  // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
  bind(L);                                       // s (mod n) < n
  shrdl(lo, hi);                                 // x := x >> s
  if (sign_extension) sarl(hi);
  else                shrl(hi);
}

void MacroAssembler::movoop(Register dst, jobject obj) {
  mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::movoop(Address dst, jobject obj) {
  mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
  mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
  mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
  // scratch register is not used,
  // it is defined to match parameters of 64-bit version of this method.
  if (src.is_lval()) {
    mov_literal32(dst, (intptr_t)src.target(), src.rspec());
  } else {
    movl(dst, as_Address(src));
  }
}

void MacroAssembler::movptr(ArrayAddress dst, Register src) {
  movl(as_Address(dst), src);
}

void MacroAssembler::movptr(Register dst, ArrayAddress src) {
  movl(dst, as_Address(src));
}

// src should NEVER be a real pointer. Use AddressLiteral for true pointers
void MacroAssembler::movptr(Address dst, intptr_t src) {
  movl(dst, src);
}


void MacroAssembler::pop_callee_saved_registers() {
  pop(rcx);
  pop(rdx);
  pop(rdi);
  pop(rsi);
}

void MacroAssembler::pop_fTOS() {
  fld_d(Address(rsp, 0));
  addl(rsp, 2 * wordSize);
}

void MacroAssembler::push_callee_saved_registers() {
  push(rsi);
  push(rdi);
  push(rdx);
  push(rcx);
}

void MacroAssembler::push_fTOS() {
  subl(rsp, 2 * wordSize);
  fstp_d(Address(rsp, 0));
}


void MacroAssembler::pushoop(jobject obj) {
  push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::pushklass(Metadata* obj) {
  push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::pushptr(AddressLiteral src) {
  if (src.is_lval()) {
    push_literal32((int32_t)src.target(), src.rspec());
  } else {
    pushl(as_Address(src));
  }
}

void MacroAssembler::set_word_if_not_zero(Register dst) {
  xorl(dst, dst);
  set_byte_if_not_zero(dst);
}

static void pass_arg0(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

#ifndef PRODUCT
extern "C" void findpc(intptr_t x);
#endif

void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
  // In order to get locks to work, we need to fake an in_VM state
  JavaThread* thread = JavaThread::current();
  JavaThreadState saved_state = thread->thread_state();
  thread->set_thread_state(_thread_in_vm);
  if (ShowMessageBoxOnError) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
    // To see where a verify_oop failed, get $ebx+40/X for this frame.
    // This is the value of eip which points to where verify_oop will return.
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip);
      BREAKPOINT;
    }
  } else {
    ttyLocker ttyl;
    ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", msg);
  }
  // Don't assert holding the ttyLock
  assert(false, err_msg("DEBUG MESSAGE: %s", msg));
  ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
}

void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) {
  ttyLocker ttyl;
  FlagSetting fs(Debugging, true);
  tty->print_cr("eip = 0x%08x", eip);
#ifndef PRODUCT
  if ((WizardMode || Verbose) && PrintMiscellaneous) {
    tty->cr();
    findpc(eip);
    tty->cr();
  }
#endif
#define PRINT_REG(rax) \
  { tty->print("%s = ", #rax); os::print_location(tty, rax); }
  PRINT_REG(rax);
  PRINT_REG(rbx);
  PRINT_REG(rcx);
  PRINT_REG(rdx);
  PRINT_REG(rdi);
  PRINT_REG(rsi);
  PRINT_REG(rbp);
  PRINT_REG(rsp);
#undef PRINT_REG
  // Print some words near top of stack.
  int* dump_sp = (int*) rsp;
  for (int col1 = 0; col1 < 8; col1++) {
    tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
    os::print_location(tty, *dump_sp++);
  }
  for (int row = 0; row < 16; row++) {
    tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
    for (int col = 0; col < 8; col++) {
      tty->print(" 0x%08x", *dump_sp++);
    }
    tty->cr();
  }
  // Print some instructions around pc:
  Disassembler::decode((address)eip-64, (address)eip);
  tty->print_cr("--------");
  Disassembler::decode((address)eip, (address)eip+32);
}

void MacroAssembler::stop(const char* msg) {
  ExternalAddress message((address)msg);
  // push address of message
  pushptr(message.addr());
  { Label L; call(L, relocInfo::none); bind(L); }     // push eip
  pusha();                                            // push registers
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
  hlt();
}

void MacroAssembler::warn(const char* msg) {
  push_CPU_state();

  ExternalAddress message((address) msg);
  // push address of message
  pushptr(message.addr());

  call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
  addl(rsp, wordSize);       // discard argument
  pop_CPU_state();
}

void MacroAssembler::print_state() {
  { Label L; call(L, relocInfo::none); bind(L); }     // push eip
  pusha();                                            // push registers

  push_CPU_state();
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32)));
  pop_CPU_state();

  popa();
  addl(rsp, wordSize);
}

#else // _LP64

// 64 bit versions

Address MacroAssembler::as_Address(AddressLiteral adr) {
  // amd64 always does this as a pc-rel
  // we can be absolute or disp based on the instruction type
  // jmp/call are displacements, others are absolute
  assert(!adr.is_lval(), "must be rval");
  assert(reachable(adr), "must be");
  return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc());

}

Address MacroAssembler::as_Address(ArrayAddress adr) {
  AddressLiteral base = adr.base();
  lea(rscratch1, base);
  Address index = adr.index();
  assert(index._disp == 0, "must not have disp"); // maybe it can?
  Address array(rscratch1, index._index, index._scale, index._disp);
  return array;
}

void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
  Label L, E;

#ifdef _WIN64
  // Windows always allocates space for its register args
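  // (The Win64 ABI requires the caller to reserve a 32-byte "home" area for the
  //  four register arguments of any call; frame::arg_reg_save_area_bytes is the
  //  size of that reservation.)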
  assert(num_args <= 4, "only register arguments supported");
  subq(rsp,  frame::arg_reg_save_area_bytes);
#endif

  // Align stack if necessary
  testl(rsp, 15);
  jcc(Assembler::zero, L);

  subq(rsp, 8);
  {
    call(RuntimeAddress(entry_point));
  }
  addq(rsp, 8);
  jmp(E);

  bind(L);
  {
    call(RuntimeAddress(entry_point));
  }

  bind(E);

#ifdef _WIN64
  // restore stack pointer
  addq(rsp, frame::arg_reg_save_area_bytes);
#endif

}

void MacroAssembler::cmp64(Register src1, AddressLiteral src2) {
  assert(!src2.is_lval(), "should use cmpptr");

  if (reachable(src2)) {
    cmpq(src1, as_Address(src2));
  } else {
    lea(rscratch1, src2);
    Assembler::cmpq(src1, Address(rscratch1, 0));
  }
}

int MacroAssembler::corrected_idivq(Register reg) {
  // Full implementation of Java ldiv and lrem; checks for special
  // case as described in JVM spec., p.243 & p.271.  The function
  // returns the (pc) offset of the idivq instruction - may be needed
  // for implicit exceptions.
  //
  //         normal case                           special case
  //
  // input : rax: dividend                         min_long
  //         reg: divisor   (may not be eax/edx)   -1
  //
  // output: rax: quotient  (= rax idiv reg)       min_long
  //         rdx: remainder (= rax irem reg)       0
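  //
  // Example of the special case: in Java, Long.MIN_VALUE / -1 == Long.MIN_VALUE
  // and Long.MIN_VALUE % -1 == 0, whereas a raw idivq would raise #DE because
  // the quotient does not fit in 64 bits, so that operand pair bypasses the divide.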
  assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
  static const int64_t min_long = 0x8000000000000000;
  Label normal_case, special_case;

  // check for special case
  cmp64(rax, ExternalAddress((address) &min_long));
  jcc(Assembler::notEqual, normal_case);
  xorl(rdx, rdx); // prepare rdx for possible special case (where
                  // remainder = 0)
  cmpq(reg, -1);
  jcc(Assembler::equal, special_case);

  // handle normal case
  bind(normal_case);
  cdqq();
  int idivq_offset = offset();
  idivq(reg);

  // normal and special case exit
  bind(special_case);

  return idivq_offset;
}

void MacroAssembler::decrementq(Register reg, int value) {
  if (value == min_jint) { subq(reg, value); return; }
  if (value <  0) { incrementq(reg, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { decq(reg) ; return; }
  /* else */      { subq(reg, value)       ; return; }
}

void MacroAssembler::decrementq(Address dst, int value) {
  if (value == min_jint) { subq(dst, value); return; }
  if (value <  0) { incrementq(dst, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { decq(dst) ; return; }
  /* else */      { subq(dst, value)       ; return; }
}

void MacroAssembler::incrementq(AddressLiteral dst) {
  if (reachable(dst)) {
    incrementq(as_Address(dst));
  } else {
    lea(rscratch1, dst);
    incrementq(Address(rscratch1, 0));
  }
}

void MacroAssembler::incrementq(Register reg, int value) {
  if (value == min_jint) { addq(reg, value); return; }
  if (value <  0) { decrementq(reg, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { incq(reg) ; return; }
  /* else */      { addq(reg, value)       ; return; }
}

void MacroAssembler::incrementq(Address dst, int value) {
  if (value == min_jint) { addq(dst, value); return; }
  if (value <  0) { decrementq(dst, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { incq(dst) ; return; }
  /* else */      { addq(dst, value)       ; return; }
}

// 32bit can do a case table jump in one instruction but we no longer allow the base
// to be installed in the Address class
void MacroAssembler::jump(ArrayAddress entry) {
  lea(rscratch1, entry.base());
  Address dispatch = entry.index();
  assert(dispatch._base == noreg, "must be");
  dispatch._base = rscratch1;
  jmp(dispatch);
}

void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
  ShouldNotReachHere(); // 64bit doesn't use two regs
  cmpq(x_lo, y_lo);
}

void MacroAssembler::lea(Register dst, AddressLiteral src) {
    mov_literal64(dst, (intptr_t)src.target(), src.rspec());
}

void MacroAssembler::lea(Address dst, AddressLiteral adr) {
  mov_literal64(rscratch1, (intptr_t)adr.target(), adr.rspec());
  movptr(dst, rscratch1);
}

void MacroAssembler::leave() {
  // %%% is this really better? Why not on 32bit too?
  emit_int8((unsigned char)0xC9); // LEAVE
}

void MacroAssembler::lneg(Register hi, Register lo) {
  ShouldNotReachHere(); // 64bit doesn't use two regs
  negq(lo);
}

void MacroAssembler::movoop(Register dst, jobject obj) {
  mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::movoop(Address dst, jobject obj) {
  mov_literal64(rscratch1, (intptr_t)obj, oop_Relocation::spec_for_immediate());
  movq(dst, rscratch1);
}

void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
  mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
  mov_literal64(rscratch1, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
  movq(dst, rscratch1);
}

void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
  if (src.is_lval()) {
    mov_literal64(dst, (intptr_t)src.target(), src.rspec());
  } else {
    if (reachable(src)) {
      movq(dst, as_Address(src));
    } else {
      lea(scratch, src);
      movq(dst, Address(scratch, 0));
    }
  }
}

void MacroAssembler::movptr(ArrayAddress dst, Register src) {
  movq(as_Address(dst), src);
}

void MacroAssembler::movptr(Register dst, ArrayAddress src) {
  movq(dst, as_Address(src));
}

// src should NEVER be a real pointer. Use AddressLiteral for true pointers
void MacroAssembler::movptr(Address dst, intptr_t src) {
  mov64(rscratch1, src);
  movq(dst, rscratch1);
}

// These are mostly for initializing NULL
void MacroAssembler::movptr(Address dst, int32_t src) {
  movslq(dst, src);
}

void MacroAssembler::movptr(Register dst, int32_t src) {
  mov64(dst, (intptr_t)src);
}

void MacroAssembler::pushoop(jobject obj) {
  movoop(rscratch1, obj);
  push(rscratch1);
}

void MacroAssembler::pushklass(Metadata* obj) {
  mov_metadata(rscratch1, obj);
  push(rscratch1);
}

void MacroAssembler::pushptr(AddressLiteral src) {
  lea(rscratch1, src);
  if (src.is_lval()) {
    push(rscratch1);
  } else {
    pushq(Address(rscratch1, 0));
  }
}

void MacroAssembler::reset_last_Java_frame(bool clear_fp,
                                           bool clear_pc) {
  // we must set sp to zero to clear frame
  movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
  }

  if (clear_pc) {
    movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
  }
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc) {
  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = rsp;
  }

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()),
           last_java_fp);
  }

  // last_java_pc is optional
  if (last_java_pc != NULL) {
    Address java_pc(r15_thread,
                    JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
    lea(rscratch1, InternalAddress(last_java_pc));
    movptr(java_pc, rscratch1);
  }

  movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
}

static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg) {
    masm->mov(c_rarg3, arg);
  }
}

void MacroAssembler::stop(const char* msg) {
  address rip = pc();
  pusha(); // get regs on stack
  lea(c_rarg0, ExternalAddress((address) msg));
  lea(c_rarg1, InternalAddress(rip));
  movq(c_rarg2, rsp); // pass pointer to regs array
  andq(rsp, -16); // align stack as required by ABI
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
  hlt();
}

void MacroAssembler::warn(const char* msg) {
  push(rbp);
  movq(rbp, rsp);
  andq(rsp, -16);     // align stack as required by push_CPU_state and call
  push_CPU_state();   // keeps alignment at 16 bytes
  lea(c_rarg0, ExternalAddress((address) msg));
  call_VM_leaf(CAST_FROM_FN_PTR(address, warning), c_rarg0);
  pop_CPU_state();
  mov(rsp, rbp);
  pop(rbp);
}

void MacroAssembler::print_state() {
  address rip = pc();
  pusha();            // get regs on stack
  push(rbp);
  movq(rbp, rsp);
  andq(rsp, -16);     // align stack as required by push_CPU_state and call
  push_CPU_state();   // keeps alignment at 16 bytes

  lea(c_rarg0, InternalAddress(rip));
  lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
  call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);

  pop_CPU_state();
  mov(rsp, rbp);
  pop(rbp);
  popa();
}

#ifndef PRODUCT
extern "C" void findpc(intptr_t x);
#endif

void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
  // In order to get locks to work, we need to fake an in_VM state
  if (ShowMessageBoxOnError) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
#ifndef PRODUCT
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
#endif
    // To see where a verify_oop failed, get $ebx+40/X for this frame.
    // XXX correct this offset for amd64
    // This is the value of eip which points to where verify_oop will return.
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      print_state64(pc, regs);
      BREAKPOINT;
      assert(false, "start up GDB");
    }
    ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
  } else {
    ttyLocker ttyl;
    ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
                    msg);
    assert(false, err_msg("DEBUG MESSAGE: %s", msg));
  }
}

void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
  ttyLocker ttyl;
  FlagSetting fs(Debugging, true);
  tty->print_cr("rip = 0x%016lx", pc);
#ifndef PRODUCT
  tty->cr();
  findpc(pc);
  tty->cr();
#endif
#define PRINT_REG(rax, value) \
  { tty->print("%s = ", #rax); os::print_location(tty, value); }
  PRINT_REG(rax, regs[15]);
  PRINT_REG(rbx, regs[12]);
  PRINT_REG(rcx, regs[14]);
  PRINT_REG(rdx, regs[13]);
  PRINT_REG(rdi, regs[8]);
  PRINT_REG(rsi, regs[9]);
  PRINT_REG(rbp, regs[10]);
  PRINT_REG(rsp, regs[11]);
  PRINT_REG(r8 , regs[7]);
  PRINT_REG(r9 , regs[6]);
  PRINT_REG(r10, regs[5]);
  PRINT_REG(r11, regs[4]);
  PRINT_REG(r12, regs[3]);
  PRINT_REG(r13, regs[2]);
  PRINT_REG(r14, regs[1]);
  PRINT_REG(r15, regs[0]);
#undef PRINT_REG
  // Print some words near top of stack.
  int64_t* rsp = (int64_t*) regs[11];
  int64_t* dump_sp = rsp;
  for (int col1 = 0; col1 < 8; col1++) {
    tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (int64_t)dump_sp);
    os::print_location(tty, *dump_sp++);
  }
  for (int row = 0; row < 25; row++) {
    tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (int64_t)dump_sp);
    for (int col = 0; col < 4; col++) {
      tty->print(" 0x%016lx", *dump_sp++);
    }
    tty->cr();
  }
  // Print some instructions around pc:
  Disassembler::decode((address)pc-64, (address)pc);
  tty->print_cr("--------");
  Disassembler::decode((address)pc, (address)pc+32);
}

#endif // _LP64

// Now versions that are common to 32/64 bit

void MacroAssembler::addptr(Register dst, int32_t imm32) {
  LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
}

void MacroAssembler::addptr(Register dst, Register src) {
  LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
}

void MacroAssembler::addptr(Address dst, Register src) {
  LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
}

void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::addsd(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::addsd(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::addss(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    addss(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    addss(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::align(int modulus) {
  if (offset() % modulus != 0) {
    nop(modulus - (offset() % modulus));
  }
}

void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src) {
  // Used in sign-masking with aligned address.
  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
  if (reachable(src)) {
    Assembler::andpd(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::andpd(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::andps(XMMRegister dst, AddressLiteral src) {
  // Used in sign-masking with aligned address.
  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
  if (reachable(src)) {
    Assembler::andps(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::andps(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::andptr(Register dst, int32_t imm32) {
  LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
}

void MacroAssembler::atomic_incl(Address counter_addr) {
  if (os::is_MP())
    lock();
  incrementl(counter_addr);
}

void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register scr) {
  if (reachable(counter_addr)) {
    atomic_incl(as_Address(counter_addr));
  } else {
    lea(scr, counter_addr);
    atomic_incl(Address(scr, 0));
  }
}

#ifdef _LP64
void MacroAssembler::atomic_incq(Address counter_addr) {
  if (os::is_MP())
    lock();
  incrementq(counter_addr);
}

void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register scr) {
  if (reachable(counter_addr)) {
    atomic_incq(as_Address(counter_addr));
  } else {
    lea(scr, counter_addr);
    atomic_incq(Address(scr, 0));
  }
}
#endif

// Writes to stack successive pages until offset reached to check for
// stack overflow + shadow pages.  This clobbers tmp.
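// Roughly equivalent to the following sketch, with P = os::vm_page_size():
//   for (p = sp; size > 0; p -= P, size -= P)  *(p - P) = size;    // frame pages
//   for (i = 1; i < StackShadowPages; i++)     *(p - i*P) = size;  // shadow pages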
void MacroAssembler::bang_stack_size(Register size, Register tmp) {
  movptr(tmp, rsp);
  // Bang stack for total size given plus shadow page size.
  // Bang one page at a time because large size can bang beyond yellow and
  // red zones.
  Label loop;
  bind(loop);
  movl(Address(tmp, (-os::vm_page_size())), size );
  subptr(tmp, os::vm_page_size());
  subl(size, os::vm_page_size());
  jcc(Assembler::greater, loop);

  // Bang down shadow pages too.
  // At this point, (tmp-0) is the last address touched, so don't
  // touch it again.  (It was touched as (tmp-pagesize) but then tmp
  // was post-decremented.)  Skip this address by starting at i=1, and
  // touch a few more pages below.  N.B.  It is important to touch all
  // the way down to and including i=StackShadowPages.
  for (int i = 1; i < StackShadowPages; i++) {
    // this could be any sized move but this can be a debugging crumb
    // so the bigger the better.
    movptr(Address(tmp, (-i*os::vm_page_size())), size );
  }
}

int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
  LP64_ONLY( assert(tmp_reg != noreg, "tmp_reg must be supplied"); )
  bool need_tmp_reg = false;
  if (tmp_reg == noreg) {
    need_tmp_reg = true;
    tmp_reg = lock_reg;
    assert_different_registers(lock_reg, obj_reg, swap_reg);
  } else {
    assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
  }
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  if (PrintBiasedLockingStatistics && counters == NULL) {
    counters = BiasedLocking::counters();
  }
  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
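  // A biased mark word looks roughly like this (a sketch; markOop.hpp has the
  // authoritative layout):
  //   64-bit: [JavaThread*:54 | epoch:2 | unused:1 | age:4 | biased_lock:1 | lock:2]
  //   32-bit: [JavaThread*:23 | epoch:2 | age:4 | biased_lock:1 | lock:2]
  // with biased_lock:lock equal to biased_lock_pattern (101) while the bias is in place.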
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    movptr(swap_reg, mark_addr);
  }
  if (need_tmp_reg) {
    push(tmp_reg);
  }
  movptr(tmp_reg, swap_reg);
  andptr(tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cmpptr(tmp_reg, markOopDesc::biased_lock_pattern);
  if (need_tmp_reg) {
    pop(tmp_reg);
  }
  jcc(Assembler::notEqual, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
#ifndef _LP64
  // Note that because there is no current thread register on x86_32 we
  // need to store off the mark word we read out of the object to
  // avoid reloading it and needing to recheck invariants below. This
  // store is unfortunate but it makes the overall code shorter and
  // simpler.
  movptr(saved_mark_addr, swap_reg);
#endif
  if (need_tmp_reg) {
    push(tmp_reg);
  }
  if (swap_reg_contains_mark) {
    null_check_offset = offset();
  }
  load_prototype_header(tmp_reg, obj_reg);
#ifdef _LP64
  orptr(tmp_reg, r15_thread);
  xorptr(tmp_reg, swap_reg);
  Register header_reg = tmp_reg;
#else
  xorptr(tmp_reg, swap_reg);
  get_thread(swap_reg);
  xorptr(swap_reg, tmp_reg);
  Register header_reg = swap_reg;
#endif
  andptr(header_reg, ~((int) markOopDesc::age_mask_in_place));
  if (need_tmp_reg) {
    pop(tmp_reg);
  }
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->biased_lock_entry_count_addr()));
  }
  jcc(Assembler::equal, done);

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  testptr(header_reg, markOopDesc::biased_lock_mask_in_place);
  jccb(Assembler::notZero, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  testptr(header_reg, markOopDesc::epoch_mask_in_place);
  jccb(Assembler::notZero, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  NOT_LP64( movptr(swap_reg, saved_mark_addr); )
  andptr(swap_reg,
         markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
  if (need_tmp_reg) {
    push(tmp_reg);
  }
#ifdef _LP64
  movptr(tmp_reg, swap_reg);
  orptr(tmp_reg, r15_thread);
#else
  get_thread(tmp_reg);
  orptr(tmp_reg, swap_reg);
#endif
  if (os::is_MP()) {
    lock();
  }
  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
  if (need_tmp_reg) {
    pop(tmp_reg);
  }
  // If the biasing toward our thread failed, this means that
  // another thread succeeded in biasing it toward itself and we
  // need to revoke that bias. The revocation will occur in the
  // interpreter runtime in the slow case.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
  }
  if (slow_case != NULL) {
    jcc(Assembler::notZero, *slow_case);
  }
  jmp(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  if (need_tmp_reg) {
    push(tmp_reg);
  }
  load_prototype_header(tmp_reg, obj_reg);
#ifdef _LP64
  orptr(tmp_reg, r15_thread);
#else
  get_thread(swap_reg);
  orptr(tmp_reg, swap_reg);
  movptr(swap_reg, saved_mark_addr);
#endif
  if (os::is_MP()) {
    lock();
  }
  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
  if (need_tmp_reg) {
    pop(tmp_reg);
  }
  // If the biasing toward our thread failed, then another thread
  // succeeded in biasing it toward itself and we need to revoke that
  // bias. The revocation will occur in the runtime in the slow case.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
  }
  if (slow_case != NULL) {
    jcc(Assembler::notZero, *slow_case);
  }
  jmp(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  NOT_LP64( movptr(swap_reg, saved_mark_addr); )
  if (need_tmp_reg) {
    push(tmp_reg);
  }
  load_prototype_header(tmp_reg, obj_reg);
  if (os::is_MP()) {
    lock();
  }
  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
  if (need_tmp_reg) {
    pop(tmp_reg);
  }
  // Fall through to the normal CAS-based lock, because no matter what
  // the result of the above CAS, some thread must have succeeded in
  // removing the bias bit from the object's header.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
  }

  bind(cas_label);

  return null_check_offset;
}

void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  movptr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andptr(temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmpptr(temp_reg, markOopDesc::biased_lock_pattern);
  jcc(Assembler::equal, done);
}

#ifdef COMPILER2

#if INCLUDE_RTM_OPT

// Update rtm_counters based on abort status
// input: abort_status
//        rtm_counters (RTMLockingCounters*)
// flags are killed
void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {

  atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  if (PrintPreciseRTMLockingStatistics) {
    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
      Label check_abort;
      testl(abort_status, (1<<i));
      jccb(Assembler::equal, check_abort);
      atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
      bind(check_abort);
    }
  }
}

// Branch if (random & (count-1) != 0), count is 2^n
// tmp, scr and flags are killed
void MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  assert(tmp == rax, "");
  assert(scr == rdx, "");
  rdtsc(); // modifies EDX:EAX
  andptr(tmp, count-1);
  jccb(Assembler::notZero, brLabel);
}

// Perform abort ratio calculation, set no_rtm bit if high ratio
// input:  rtm_counters_Reg (RTMLockingCounters* address)
// tmpReg, rtm_counters_Reg and flags are killed
void MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
                                                 Register rtm_counters_Reg,
                                                 RTMLockingCounters* rtm_counters,
                                                 Metadata* method_data) {
  Label L_done, L_check_always_rtm1, L_check_always_rtm2;

  if (RTMLockingCalculationDelay > 0) {
    // Delay calculation
    movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
    testptr(tmpReg, tmpReg);
    jccb(Assembler::equal, L_done);
  }
  // Abort ratio calculation only if abort_count > RTMAbortThreshold
  //   Aborted transactions = abort_count * 100
  //   All transactions = total_count *  RTMTotalCountIncrRate
  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
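  //
  //   For example (hypothetical numbers): with RTMAbortRatio == 50 and
  //   RTMTotalCountIncrRate == 1, abort_count == 60 and total_count == 100 give
  //   60*100 = 6000 >= 100*1*50 = 5000, so the no_rtm bit would be set.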

  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
  cmpptr(tmpReg, RTMAbortThreshold);
  jccb(Assembler::below, L_check_always_rtm2);
  imulptr(tmpReg, tmpReg, 100);

  Register scrReg = rtm_counters_Reg;
  movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
  imulptr(scrReg, scrReg, RTMAbortRatio);
  cmpptr(tmpReg, scrReg);
  jccb(Assembler::below, L_check_always_rtm1);
  if (method_data != NULL) {
    // set rtm_state to "no rtm" in MDO
    mov_metadata(tmpReg, method_data);
    if (os::is_MP()) {
      lock();
    }
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
  }
  jmpb(L_done);
  bind(L_check_always_rtm1);
  // Reload RTMLockingCounters* address
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  bind(L_check_always_rtm2);
  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
  jccb(Assembler::below, L_done);
  if (method_data != NULL) {
    // set rtm_state to "always rtm" in MDO
    mov_metadata(tmpReg, method_data);
    if (os::is_MP()) {
      lock();
    }
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
  }
  bind(L_done);
}

// Update counters and perform abort ratio calculation
// input:  abort_status_Reg
// rtm_counters_Reg, flags are killed
void MacroAssembler::rtm_profiling(Register abort_status_Reg,
                                   Register rtm_counters_Reg,
                                   RTMLockingCounters* rtm_counters,
                                   Metadata* method_data,
                                   bool profile_rtm) {

  assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
  // update rtm counters based on rax value at abort
  // reads abort_status_Reg, updates flags
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
  if (profile_rtm) {
    // Save abort status because abort_status_Reg is used by following code.
    if (RTMRetryCount > 0) {
      push(abort_status_Reg);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
    // restore abort status
    if (RTMRetryCount > 0) {
      pop(abort_status_Reg);
    }
  }
}

// Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
// inputs: retry_count_Reg
//       : abort_status_Reg
// output: retry_count_Reg decremented by 1
// flags are killed
void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
  Label doneRetry;
  assert(abort_status_Reg == rax, "");
  // The abort reason bits are in eax (see all states in rtmLocking.hpp)
  // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
  // if reason is in 0x6 and retry count != 0 then retry
  andptr(abort_status_Reg, 0x6);
  jccb(Assembler::zero, doneRetry);
  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  pause();
  decrementl(retry_count_Reg);
  jmp(retryLabel);
  bind(doneRetry);
}

// Spin and retry if lock is busy,
// inputs: box_Reg (monitor address)
//       : retry_count_Reg
// output: retry_count_Reg decremented by 1
//       : clear z flag if retry count exceeded
// tmp_Reg, scr_Reg, flags are killed
void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
                                            Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
  Label SpinLoop, SpinExit, doneRetry;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  decrementl(retry_count_Reg);
  movptr(scr_Reg, RTMSpinLoopCount);

  bind(SpinLoop);
  pause();
  decrementl(scr_Reg);
  jccb(Assembler::lessEqual, SpinExit);
  movptr(tmp_Reg, Address(box_Reg, owner_offset));
  testptr(tmp_Reg, tmp_Reg);
  jccb(Assembler::notZero, SpinLoop);

  bind(SpinExit);
  jmp(retryLabel);
  bind(doneRetry);
  incrementl(retry_count_Reg); // clear z flag
}

// Use RTM for normal stack locks
// Input: objReg (object to lock)
void MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
                                       Register retry_on_abort_count_Reg,
                                       RTMLockingCounters* stack_rtm_counters,
                                       Metadata* method_data, bool profile_rtm,
                                       Label& DONE_LABEL, Label& IsInflated) {
  assert(UseRTMForStackLocks, "why call this otherwise?");
  assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;

  if (RTMRetryCount > 0) {
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  movptr(tmpReg, Address(objReg, 0));
  testptr(tmpReg, markOopDesc::monitor_value);  // inflated vs stack-locked|neutral|biased
  jcc(Assembler::notZero, IsInflated);

  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, (int)RTMTotalCountIncrRate, L_noincrement);
    }
    assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, 0));       // fetch markword
  andptr(tmpReg, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
  cmpptr(tmpReg, markOopDesc::unlocked_value);            // bits = 001 unlocked
  jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked

  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (UseRTMXendForLockBusy) {
    xend();
    movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
  }
  bind(L_decrement_retry);
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }
}

// Use RTM for inflating locks
// inputs: objReg (object to lock)
//         boxReg (on-stack box address (displaced header location) - KILLED)
//         tmpReg (ObjectMonitor address + markOopDesc::monitor_value)
void MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
                                          Register scrReg, Register retry_on_busy_count_Reg,
                                          Register retry_on_abort_count_Reg,
                                          RTMLockingCounters* rtm_counters,
                                          Metadata* method_data, bool profile_rtm,
                                          Label& DONE_LABEL) {
  assert(UseRTMLocking, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  // Without cast to int32_t a movptr will destroy r10 which is typically obj
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
  movptr(boxReg, tmpReg); // Save ObjectMonitor address

  if (RTMRetryCount > 0) {
    movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, (int)RTMTotalCountIncrRate, L_noincrement);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, 0));
  movptr(tmpReg, Address(tmpReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jcc(Assembler::zero, DONE_LABEL);
  if (UseRTMXendForLockBusy) {
    xend();
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
  }
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }

  movptr(tmpReg, Address(boxReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jccb(Assembler::notZero, L_decrement_retry);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
#ifdef _LP64
  Register threadReg = r15_thread;
#else
  get_thread(scrReg);
  Register threadReg = scrReg;
#endif
  if (os::is_MP()) {
    lock();
  }
  cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg

  if (RTMRetryCount > 0) {
    // success done else retry
    jccb(Assembler::equal, DONE_LABEL);
    bind(L_decrement_retry);
    // Spin and retry if lock is busy.
    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
  }
  else {
    bind(L_decrement_retry);
  }
}
1616
1617#endif //  INCLUDE_RTM_OPT
1618
1619// Fast_Lock and Fast_Unlock used by C2
1620
1621// Because the transitions from emitted code to the runtime
1622// monitorenter/exit helper stubs are so slow it's critical that
1623// we inline both the stack-locking fast-path and the inflated fast path.
1624//
1625// See also: cmpFastLock and cmpFastUnlock.
1626//
1627// What follows is a specialized inline transliteration of the code
1628// in slow_enter() and slow_exit().  If we're concerned about I$ bloat
1629// another option would be to emit TrySlowEnter and TrySlowExit methods
1630// at startup-time.  These methods would accept arguments as
1631//    (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
1632// indications in the icc.ZFlag.  Fast_Lock and Fast_Unlock would simply
1633// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
1634// In practice, however, the # of lock sites is bounded and is usually small.
1635// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
1636// if the processor uses simple bimodal branch predictors keyed by EIP,
1637// since the helper routines would be called from multiple synchronization
1638// sites.
1639//
1640// An even better approach would be write "MonitorEnter()" and "MonitorExit()"
1641// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
1642// to those specialized methods.  That'd give us a mostly platform-independent
1643// implementation that the JITs could optimize and inline at their pleasure.
1644// Done correctly, the only time we'd need to cross to native code would be
1645// to park() or unpark() threads.  We'd also need a few more unsafe operators
1646// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
1647// (b) issue explicit barriers or fence operations.
1648//
1649// TODO:
1650//
1651// *  Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr).
1652//    This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals.
1653//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
1654//    the lock operators would typically be faster than reifying Self.
1655//
1656// *  Ideally I'd define the primitives as:
1657//       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
1658//       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
1659//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
1660//    Instead, we're stuck with the rather awkward and brittle register assignments below.
1661//    Furthermore the register assignments are overconstrained, possibly resulting in
1662//    sub-optimal code near the synchronization site.
1663//
1664// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
1665//    Alternately, use a better sp-proximity test.
1666//
1667// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
1668//    Either one is sufficient to uniquely identify a thread.
1669//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
1670//
1671// *  Intrinsify notify() and notifyAll() for the common cases where the
1672//    object is locked by the calling thread but the waitlist is empty.
1673//    This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
1674//
1675// *  use jccb and jmpb instead of jcc and jmp to improve code density.
1676//    But beware of excessive branch density on AMD Opterons.
1677//
1678// *  Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success
1679//    or failure of the fast-path.  If the fast-path fails then we pass
1680//    control to the slow-path, typically in C.  In Fast_Lock and
1681//    Fast_Unlock we often branch to DONE_LABEL, just to find that C2
1682//    will emit a conditional branch immediately after the node.
1683//    So we have branches to branches and lots of ICC.ZF games.
1684//    Instead, it might be better to have C2 pass a "FailureLabel"
1685//    into Fast_Lock and Fast_Unlock.  In the case of success, control
1686//    will drop through the node.  ICC.ZF is undefined at exit.
1687//    In the case of failure, the node will branch directly to the
1688//    FailureLabel
1689
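// Rough sketch of how C2 consumes the ZF protocol described above (illustrative
// only; the real expansion lives in the cmpFastLock/cmpFastUnlock nodes in the
// .ad files, and the label names here are made up):
//
//   fast_lock(obj, box, rax, scr, ...)    // leaves ZF == 1 on success
//   jne  slow_enter_stub                  // ZF == 0 -> runtime monitorenter
//   ...                                   // critical section
//   fast_unlock(obj, rax, tmp)            // same ZF protocol on exit
//   jne  slow_exit_stub                   // ZF == 0 -> runtime monitorexit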
1690
1691// obj: object to lock
1692// box: on-stack box address (displaced header location) - KILLED
1693// rax: tmp -- KILLED
1694// scr: tmp -- KILLED
1695void MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
1696                               Register scrReg, Register cx1Reg, Register cx2Reg,
1697                               BiasedLockingCounters* counters,
1698                               RTMLockingCounters* rtm_counters,
1699                               RTMLockingCounters* stack_rtm_counters,
1700                               Metadata* method_data,
1701                               bool use_rtm, bool profile_rtm) {
1702  // Ensure the register assignments are disjoint
1703  assert(tmpReg == rax, "");
1704
1705  if (use_rtm) {
1706    assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
1707  } else {
1708    assert(cx1Reg == noreg, "");
1709    assert(cx2Reg == noreg, "");
1710    assert_different_registers(objReg, boxReg, tmpReg, scrReg);
1711  }
1712
1713  if (counters != NULL) {
1714    atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
1715  }
1716  if (EmitSync & 1) {
1717      // set box->dhw = markOopDesc::unused_mark()
1718      // Force all sync thru slow-path: slow_enter() and slow_exit()
1719      movptr (Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
1720      cmpptr (rsp, (int32_t)NULL_WORD);
1721  } else
1722  if (EmitSync & 2) {
1723      Label DONE_LABEL ;
1724      if (UseBiasedLocking) {
1725         // Note: tmpReg maps to the swap_reg argument and scrReg to the tmp_reg argument.
1726         biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, counters);
1727      }
1728
1729      movptr(tmpReg, Address(objReg, 0));           // fetch markword
1730      orptr (tmpReg, 0x1);
1731      movptr(Address(boxReg, 0), tmpReg);           // Anticipate successful CAS
1732      if (os::is_MP()) {
1733        lock();
1734      }
1735      cmpxchgptr(boxReg, Address(objReg, 0));       // Updates tmpReg
1736      jccb(Assembler::equal, DONE_LABEL);
1737      // Recursive locking
1738      subptr(tmpReg, rsp);
1739      andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
1740      movptr(Address(boxReg, 0), tmpReg);
1741      bind(DONE_LABEL);
1742  } else {
1743    // Possible cases that we'll encounter in fast_lock
1744    // ------------------------------------------------
1745    // * Inflated
1746    //    -- unlocked
1747    //    -- Locked
1748    //       = by self
1749    //       = by other
1750    // * biased
1751    //    -- by Self
1752    //    -- by other
1753    // * neutral
1754    // * stack-locked
1755    //    -- by self
1756    //       = sp-proximity test hits
1757    //       = sp-proximity test generates false-negative
1758    //    -- by other
1759    //
1760
1761    Label IsInflated, DONE_LABEL;
1762
1763    // it's stack-locked, biased or neutral
1764    // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
1765    // order to reduce the number of conditional branches in the most common cases.
1766    // Beware -- there's a subtle invariant that fetch of the markword
1767    // at [FETCH], below, will never observe a biased encoding (*101b).
1768    // If this invariant is not held we risk exclusion (safety) failure.
1769    if (UseBiasedLocking && !UseOptoBiasInlining) {
1770      biased_locking_enter(boxReg, objReg, tmpReg, scrReg, true, DONE_LABEL, NULL, counters);
1771    }
1772
1773#if INCLUDE_RTM_OPT
1774    if (UseRTMForStackLocks && use_rtm) {
1775      rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
1776                        stack_rtm_counters, method_data, profile_rtm,
1777                        DONE_LABEL, IsInflated);
1778    }
1779#endif // INCLUDE_RTM_OPT
1780
1781    movptr(tmpReg, Address(objReg, 0));          // [FETCH]
1782    testptr(tmpReg, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased
1783    jccb(Assembler::notZero, IsInflated);
1784
1785    // Attempt stack-locking ...
1786    orptr (tmpReg, markOopDesc::unlocked_value);
1787    movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
1788    if (os::is_MP()) {
1789      lock();
1790    }
1791    cmpxchgptr(boxReg, Address(objReg, 0));      // Updates tmpReg
1792    if (counters != NULL) {
1793      cond_inc32(Assembler::equal,
1794                 ExternalAddress((address)counters->fast_path_entry_count_addr()));
1795    }
1796    jcc(Assembler::equal, DONE_LABEL);           // Success
1797
1798    // Recursive locking.
1799    // The object is stack-locked: markword contains stack pointer to BasicLock.
1800    // Locked by current thread if difference with current SP is less than one page.
1801    subptr(tmpReg, rsp);
1802    // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
1803    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
1804    movptr(Address(boxReg, 0), tmpReg);
1805    if (counters != NULL) {
1806      cond_inc32(Assembler::equal,
1807                 ExternalAddress((address)counters->fast_path_entry_count_addr()));
1808    }
1809    jmp(DONE_LABEL);
1810
1811    bind(IsInflated);
1812    // The object is inflated. tmpReg contains the ObjectMonitor* + markOopDesc::monitor_value
1813
1814#if INCLUDE_RTM_OPT
1815    // Use the same RTM locking code in 32- and 64-bit VM.
1816    if (use_rtm) {
1817      rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
1818                           rtm_counters, method_data, profile_rtm, DONE_LABEL);
1819    } else {
1820#endif // INCLUDE_RTM_OPT
1821
1822#ifndef _LP64
1823    // The object is inflated.
1824
1825    // boxReg refers to the on-stack BasicLock in the current frame.
1826    // We'd like to write:
1827    //   set box->_displaced_header = markOopDesc::unused_mark().  Any non-0 value suffices.
1828    // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
1829    // additional latency as we have another ST in the store buffer that must drain.
1830
1831    if (EmitSync & 8192) {
1832       movptr(Address(boxReg, 0), 3);            // results in ST-before-CAS penalty
1833       get_thread (scrReg);
1834       movptr(boxReg, tmpReg);                    // consider: LEA box, [tmp-2]
1835       movptr(tmpReg, NULL_WORD);                 // consider: xor vs mov
1836       if (os::is_MP()) {
1837         lock();
1838       }
1839       cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1840    } else
1841    if ((EmitSync & 128) == 0) {                      // avoid ST-before-CAS
1842       movptr(scrReg, boxReg);
1843       movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
1844
1845       // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
1846       if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
1847          // prefetchw [eax + Offset(_owner)-2]
1848          prefetchw(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1849       }
1850
1851       if ((EmitSync & 64) == 0) {
1852         // Optimistic form: consider XORL tmpReg,tmpReg
1853         movptr(tmpReg, NULL_WORD);
1854       } else {
1855         // Can suffer RTS->RTO upgrades on shared or cold $ lines
1856         // Test-And-CAS instead of CAS
1857         movptr(tmpReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));   // rax, = m->_owner
1858         testptr(tmpReg, tmpReg);                   // Locked ?
1859         jccb  (Assembler::notZero, DONE_LABEL);
1860       }
1861
1862       // Appears unlocked - try to swing _owner from null to non-null.
1863       // Ideally, I'd manifest "Self" with get_thread and then attempt
1864       // to CAS the register containing Self into m->Owner.
1865       // But we don't have enough registers, so instead we can either try to CAS
1866       // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
1867       // we later store "Self" into m->Owner.  Transiently storing a stack address
1868       // (rsp or the address of the box) into  m->owner is harmless.
1869       // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
1870       if (os::is_MP()) {
1871         lock();
1872       }
1873       cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1874       movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
1875       jccb  (Assembler::notZero, DONE_LABEL);
1876       get_thread (scrReg);                    // beware: clobbers ICCs
1877       movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
1878       xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
1879
1880       // If the CAS fails we can either retry or pass control to the slow-path.
1881       // We use the latter tactic.
1882       // Pass the CAS result in the icc.ZFlag into DONE_LABEL
1883       // If the CAS was successful ...
1884       //   Self has acquired the lock
1885       //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
1886       // Intentional fall-through into DONE_LABEL ...
1887    } else {
1888       movptr(Address(boxReg, 0), intptr_t(markOopDesc::unused_mark()));  // results in ST-before-CAS penalty
1889       movptr(boxReg, tmpReg);
1890
1891       // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
1892       if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
1893          // prefetchw [eax + Offset(_owner)-2]
1894          prefetchw(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1895       }
1896
1897       if ((EmitSync & 64) == 0) {
1898         // Optimistic form
1899         xorptr  (tmpReg, tmpReg);
1900       } else {
1901         // Can suffer RTS->RTO upgrades on shared or cold $ lines
1902         movptr(tmpReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));   // rax, = m->_owner
1903         testptr(tmpReg, tmpReg);                   // Locked ?
1904         jccb  (Assembler::notZero, DONE_LABEL);
1905       }
1906
1907       // Appears unlocked - try to swing _owner from null to non-null.
1908       // Use either "Self" (in scr) or rsp as thread identity in _owner.
1909       // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
1910       get_thread (scrReg);
1911       if (os::is_MP()) {
1912         lock();
1913       }
1914       cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1915
1916       // If the CAS fails we can either retry or pass control to the slow-path.
1917       // We use the latter tactic.
1918       // Pass the CAS result in the icc.ZFlag into DONE_LABEL
1919       // If the CAS was successful ...
1920       //   Self has acquired the lock
1921       //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
1922       // Intentional fall-through into DONE_LABEL ...
1923    }
1924#else // _LP64
1925    // It's inflated
1926
1927    // TODO: someday avoid the ST-before-CAS penalty by
1928    // relocating (deferring) the following ST.
1929    // We should also think about trying a CAS without having
1930    // fetched _owner.  If the CAS is successful we may
1931    // avoid an RTO->RTS upgrade on the $line.
1932
1933    // Without cast to int32_t a movptr will destroy r10 which is typically obj
1934    movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
1935
1936    movptr (boxReg, tmpReg);
1937    movptr(tmpReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1938    testptr(tmpReg, tmpReg);
1939    jccb   (Assembler::notZero, DONE_LABEL);
1940
1941    // It's inflated and appears unlocked
1942    if (os::is_MP()) {
1943      lock();
1944    }
1945    cmpxchgptr(r15_thread, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1946    // Intentional fall-through into DONE_LABEL ...
1947#endif // _LP64
1948
1949#if INCLUDE_RTM_OPT
1950    } // use_rtm()
1951#endif
1952    // DONE_LABEL is a hot target - we'd really like to place it at the
1953    // start of a cache line by padding with NOPs.
1954    // See the AMD and Intel software optimization manuals for the
1955    // most efficient "long" NOP encodings.
1956    // Unfortunately none of our alignment mechanisms suffice.
1957    bind(DONE_LABEL);
1958
1959    // At DONE_LABEL the icc ZFlag is set as follows ...
1960    // Fast_Unlock uses the same protocol.
1961    // ZFlag == 1 -> Success
1962    // ZFlag == 0 -> Failure - force control through the slow-path
1963  }
1964}
1965
1966// obj: object to unlock
1967// box: box address (displaced header location), killed.  Must be EAX.
1968// tmp: killed, cannot be obj nor box.
1969//
1970// Some commentary on balanced locking:
1971//
1972// Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
1973// Methods that don't have provably balanced locking are forced to run in the
1974// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
1975// The interpreter provides two properties:
1976// I1:  At return-time the interpreter automatically and quietly unlocks any
1977//      objects acquired by the current activation (frame).  Recall that the
1978//      interpreter maintains an on-stack list of locks currently held by
1979//      a frame.
1980// I2:  If a method attempts to unlock an object that is not held by
1981//      the frame, the interpreter throws IMSX.
1982//
1983// Let's say A(), which has provably balanced locking, acquires O and then calls B().
1984// B() doesn't have provably balanced locking so it runs in the interpreter.
1985// Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
1986// is still locked by A().
1987//
1988// The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
1989// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
1990// should not be unlocked by "normal" java-level locking and vice-versa.  The specification
1991// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
1992
1993void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
1994  assert(boxReg == rax, "");
1995  assert_different_registers(objReg, boxReg, tmpReg);
1996
1997  if (EmitSync & 4) {
1998    // Disable - inhibit all inlining.  Force control through the slow-path
1999    cmpptr (rsp, 0);
2000  } else
2001  if (EmitSync & 8) {
2002    Label DONE_LABEL;
2003    if (UseBiasedLocking) {
2004       biased_locking_exit(objReg, tmpReg, DONE_LABEL);
2005    }
2006    // Classic stack-locking code ...
2007    // Check whether the displaced header is 0
2008    //(=> recursive unlock)
2009    movptr(tmpReg, Address(boxReg, 0));
2010    testptr(tmpReg, tmpReg);
2011    jccb(Assembler::zero, DONE_LABEL);
2012    // If not recursive lock, reset the header to displaced header
2013    if (os::is_MP()) {
2014      lock();
2015    }
2016    cmpxchgptr(tmpReg, Address(objReg, 0));   // Uses RAX which is box
2017    bind(DONE_LABEL);
2018  } else {
2019    Label DONE_LABEL, Stacked, CheckSucc;
2020
2021    // Critically, the biased locking test must have precedence over
2022    // and appear before the (box->dhw == 0) recursive stack-lock test.
2023    if (UseBiasedLocking && !UseOptoBiasInlining) {
2024       biased_locking_exit(objReg, tmpReg, DONE_LABEL);
2025    }
2026
2027#if INCLUDE_RTM_OPT
2028    if (UseRTMForStackLocks && use_rtm) {
2029      assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2030      Label L_regular_unlock;
2031      movptr(tmpReg, Address(objReg, 0));           // fetch markword
2032      andptr(tmpReg, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2033      cmpptr(tmpReg, markOopDesc::unlocked_value);            // bits = 001 unlocked
2034      jccb(Assembler::notEqual, L_regular_unlock);  // if !HLE RegularLock
2035      xend();                                       // otherwise end...
2036      jmp(DONE_LABEL);                              // ... and we're done
2037      bind(L_regular_unlock);
2038    }
2039#endif
2040
2041    cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD); // Examine the displaced header
2042    jcc   (Assembler::zero, DONE_LABEL);            // 0 indicates recursive stack-lock
2043    movptr(tmpReg, Address(objReg, 0));             // Examine the object's markword
2044    testptr(tmpReg, markOopDesc::monitor_value);    // Inflated?
2045    jccb  (Assembler::zero, Stacked);
2046
2047    // It's inflated.
2048#if INCLUDE_RTM_OPT
2049    if (use_rtm) {
2050      Label L_regular_inflated_unlock;
2051      int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
2052      movptr(boxReg, Address(tmpReg, owner_offset));
2053      testptr(boxReg, boxReg);
2054      jccb(Assembler::notZero, L_regular_inflated_unlock);
2055      xend();
2056      jmpb(DONE_LABEL);
2057      bind(L_regular_inflated_unlock);
2058    }
2059#endif
2060
2061    // Despite our balanced locking property we still check that m->_owner == Self
2062    // as java routines or native JNI code called by this thread might
2063    // have released the lock.
2064    // Refer to the comments in synchronizer.cpp for how we might encode extra
2065    // state in _succ so we can avoid fetching EntryList|cxq.
2066    //
2067    // I'd like to add more cases in fast_lock() and fast_unlock() --
2068    // such as recursive enter and exit -- but we have to be wary of
2069    // I$ bloat, T$ effects and BP$ effects.
2070    //
2071    // If there's no contention try a 1-0 exit.  That is, exit without
2072    // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
2073    // we detect and recover from the race that the 1-0 exit admits.
2074    //
2075    // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier
2076    // before it STs null into _owner, releasing the lock.  Updates
2077    // to data protected by the critical section must be visible before
2078    // we drop the lock (and thus before any other thread could acquire
2079    // the lock and observe the fields protected by the lock).
2080    // IA32's memory model is TSO, so STs are ordered with respect to
2081    // each other and there's no need for an explicit barrier (fence).
2082    // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
2083#ifndef _LP64
2084    get_thread (boxReg);
2085    if ((EmitSync & 4096) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
2086      // prefetchw [ebx + Offset(_owner)-2]
2087      prefetchw(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2088    }
2089
2090    // Note that we could employ various encoding schemes to reduce
2091    // the number of loads below (currently 4) to just 2 or 3.
2092    // Refer to the comments in synchronizer.cpp.
2093    // In practice the chain of fetches doesn't seem to impact performance, however.
2094    if ((EmitSync & 65536) == 0 && (EmitSync & 256)) {
2095       // Attempt to reduce branch density - AMD's branch predictor.
2096       xorptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2097       orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
2098       orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
2099       orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
2100       jccb  (Assembler::notZero, DONE_LABEL);
2101       movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
2102       jmpb  (DONE_LABEL);
2103    } else {
2104       xorptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2105       orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
2106       jccb  (Assembler::notZero, DONE_LABEL);
2107       movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
2108       orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
2109       jccb  (Assembler::notZero, CheckSucc);
2110       movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
2111       jmpb  (DONE_LABEL);
2112    }
2113
2114    // The following code fragment (EmitSync & 65536) improves the performance of
2115    // contended applications and contended synchronization microbenchmarks.
2116    // Unfortunately the emission of the code - even though not executed - causes regressions
2117    // in scimark and jetstream, evidently because of $ effects.  Replacing the code
2118    // with an equal number of never-executed NOPs results in the same regression.
2119    // We leave it off by default.
2120
2121    if ((EmitSync & 65536) != 0) {
2122       Label LSuccess, LGoSlowPath ;
2123
2124       bind  (CheckSucc);
2125
2126       // Optional pre-test ... it's safe to elide this
2127       if ((EmitSync & 16) == 0) {
2128          cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2129          jccb  (Assembler::zero, LGoSlowPath);
2130       }
2131
2132       // We have a classic Dekker-style idiom:
2133       //    ST m->_owner = 0 ; MEMBAR; LD m->_succ
2134       // There are a number of ways to implement the barrier:
2135       // (1) lock:andl &m->_owner, 0
2136       //     is fast, but masm doesn't currently support the "ANDL M,IMM32" form.
2137       //     LOCK: ANDL [ebx+Offset(_Owner)-2], 0
2138       //     Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8
2139       // (2) If supported, an explicit MFENCE is appealing.
2140       //     In older IA32 processors MFENCE is slower than lock:add or xchg
2141       //     particularly if the write-buffer is full, as might be the case
2142       //     if stores closely precede the fence or fence-equivalent instruction.
2143       //     In more modern implementations MFENCE appears faster, however.
2144       // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack
2145       //     The $lines underlying the top-of-stack should be in M-state.
2146       //     The locked add instruction is serializing, of course.
2147       // (4) Use xchg, which is serializing
2148       //     mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works
2149       // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0.
2150       //     The integer condition codes will tell us if succ was 0.
2151       //     Since _succ and _owner should reside in the same $line and
2152       //     we just stored into _owner, it's likely that the $line
2153       //     remains in M-state for the lock:orl.
2154       //
2155       // We currently use (3), although it's likely that switching to (2)
2156       // is correct for the future.
2157
2158       movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
2159       if (os::is_MP()) {
2160          if (VM_Version::supports_sse2() && 1 == FenceInstruction) {
2161            mfence();
2162          } else {
2163            lock (); addptr(Address(rsp, 0), 0);
2164          }
2165       }
2166       // Ratify _succ remains non-null
2167       cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), 0);
2168       jccb  (Assembler::notZero, LSuccess);
2169
2170       xorptr(boxReg, boxReg);                  // box is really EAX
2171       if (os::is_MP()) { lock(); }
2172       cmpxchgptr(rsp, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2173       jccb  (Assembler::notEqual, LSuccess);
2174       // Since we're low on registers we installed rsp as a placeholder in _owner.
2175       // Now install Self over rsp.  This is safe as we're transitioning from
2176       // non-null to non-null.
2177       get_thread (boxReg);
2178       movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), boxReg);
2179       // Intentional fall-through into LGoSlowPath ...
2180
2181       bind  (LGoSlowPath);
2182       orptr(boxReg, 1);                      // set ICC.ZF=0 to indicate failure
2183       jmpb  (DONE_LABEL);
2184
2185       bind  (LSuccess);
2186       xorptr(boxReg, boxReg);                 // set ICC.ZF=1 to indicate success
2187       jmpb  (DONE_LABEL);
2188    }
2189
2190    bind (Stacked);
2191    // It's not inflated and it's not recursively stack-locked and it's not biased.
2192    // It must be stack-locked.
2193    // Try to reset the header to displaced header.
2194    // The "box" value on the stack is stable, so we can reload
2195    // and be assured we observe the same value as above.
2196    movptr(tmpReg, Address(boxReg, 0));
2197    if (os::is_MP()) {
2198      lock();
2199    }
2200    cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box
2201    // Intentional fall-through into DONE_LABEL
2202
2203    // DONE_LABEL is a hot target - we'd really like to place it at the
2204    // start of a cache line by padding with NOPs.
2205    // See the AMD and Intel software optimization manuals for the
2206    // most efficient "long" NOP encodings.
2207    // Unfortunately none of our alignment mechanisms suffice.
2208    if ((EmitSync & 65536) == 0) {
2209       bind (CheckSucc);
2210    }
2211#else // _LP64
2212    // It's inflated
2213    movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2214    xorptr(boxReg, r15_thread);
2215    orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
2216    jccb  (Assembler::notZero, DONE_LABEL);
2217    movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
2218    orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
2219    jccb  (Assembler::notZero, CheckSucc);
2220    movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
2221    jmpb  (DONE_LABEL);
2222
2223    if ((EmitSync & 65536) == 0) {
2224      Label LSuccess, LGoSlowPath ;
2225      bind  (CheckSucc);
2226      cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2227      jccb  (Assembler::zero, LGoSlowPath);
2228
2229      // I'd much rather use lock:andl m->_owner, 0 as it's faster than
2230      // the explicit ST;MEMBAR combination, but masm doesn't currently support
2231      // "ANDQ M,IMM".  Don't use MFENCE here.  lock:add to TOS, xchg, etc
2232      // are all faster when the write buffer is populated.
2233      movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
2234      if (os::is_MP()) {
2235         lock (); addl (Address(rsp, 0), 0);
2236      }
2237      cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2238      jccb  (Assembler::notZero, LSuccess);
2239
2240      movptr (boxReg, (int32_t)NULL_WORD);                   // box is really EAX
2241      if (os::is_MP()) { lock(); }
2242      cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2243      jccb  (Assembler::notEqual, LSuccess);
2244      // Intentional fall-through into slow-path
2245
2246      bind  (LGoSlowPath);
2247      orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
2248      jmpb  (DONE_LABEL);
2249
2250      bind  (LSuccess);
2251      testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
2252      jmpb  (DONE_LABEL);
2253    }
2254
2255    bind  (Stacked);
2256    movptr(tmpReg, Address (boxReg, 0));      // re-fetch
2257    if (os::is_MP()) { lock(); }
2258    cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box
2259
2260    if (EmitSync & 65536) {
2261       bind (CheckSucc);
2262    }
2263#endif
2264    bind(DONE_LABEL);
2265    // Avoid branch to branch on AMD processors
2266    if (EmitSync & 32768) {
2267       nop();
2268    }
2269  }
2270}
2271#endif // COMPILER2
2272
2273void MacroAssembler::c2bool(Register x) {
2274  // implements x == 0 ? 0 : 1
2275  // note: must only look at least-significant byte of x
2276  //       since C-style booleans are stored in one byte
2277  //       only! (was bug)
2278  andl(x, 0xFF);
2279  setb(Assembler::notZero, x);
2280}
2281
2282// Wouldn't need if AddressLiteral version had new name
2283void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
2284  Assembler::call(L, rtype);
2285}
2286
2287void MacroAssembler::call(Register entry) {
2288  Assembler::call(entry);
2289}
2290
2291void MacroAssembler::call(AddressLiteral entry) {
2292  if (reachable(entry)) {
2293    Assembler::call_literal(entry.target(), entry.rspec());
2294  } else {
2295    lea(rscratch1, entry);
2296    Assembler::call(rscratch1);
2297  }
2298}
2299
2300void MacroAssembler::ic_call(address entry) {
2301  RelocationHolder rh = virtual_call_Relocation::spec(pc());
2302  movptr(rax, (intptr_t)Universe::non_oop_word());
2303  call(AddressLiteral(entry, rh));
2304}
2305
2306// Implementation of call_VM versions
2307
2308void MacroAssembler::call_VM(Register oop_result,
2309                             address entry_point,
2310                             bool check_exceptions) {
2311  Label C, E;
2312  call(C, relocInfo::none);
2313  jmp(E);
2314
2315  bind(C);
2316  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
2317  ret(0);
2318
2319  bind(E);
2320}
2321
2322void MacroAssembler::call_VM(Register oop_result,
2323                             address entry_point,
2324                             Register arg_1,
2325                             bool check_exceptions) {
2326  Label C, E;
2327  call(C, relocInfo::none);
2328  jmp(E);
2329
2330  bind(C);
2331  pass_arg1(this, arg_1);
2332  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
2333  ret(0);
2334
2335  bind(E);
2336}
2337
2338void MacroAssembler::call_VM(Register oop_result,
2339                             address entry_point,
2340                             Register arg_1,
2341                             Register arg_2,
2342                             bool check_exceptions) {
2343  Label C, E;
2344  call(C, relocInfo::none);
2345  jmp(E);
2346
2347  bind(C);
2348
2349  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2350
2351  pass_arg2(this, arg_2);
2352  pass_arg1(this, arg_1);
2353  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
2354  ret(0);
2355
2356  bind(E);
2357}
2358
2359void MacroAssembler::call_VM(Register oop_result,
2360                             address entry_point,
2361                             Register arg_1,
2362                             Register arg_2,
2363                             Register arg_3,
2364                             bool check_exceptions) {
2365  Label C, E;
2366  call(C, relocInfo::none);
2367  jmp(E);
2368
2369  bind(C);
2370
2371  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2372  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2373  pass_arg3(this, arg_3);
2374
2375  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2376  pass_arg2(this, arg_2);
2377
2378  pass_arg1(this, arg_1);
2379  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
2380  ret(0);
2381
2382  bind(E);
2383}
2384
2385void MacroAssembler::call_VM(Register oop_result,
2386                             Register last_java_sp,
2387                             address entry_point,
2388                             int number_of_arguments,
2389                             bool check_exceptions) {
2390  Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
2391  call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
2392}
2393
2394void MacroAssembler::call_VM(Register oop_result,
2395                             Register last_java_sp,
2396                             address entry_point,
2397                             Register arg_1,
2398                             bool check_exceptions) {
2399  pass_arg1(this, arg_1);
2400  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
2401}
2402
2403void MacroAssembler::call_VM(Register oop_result,
2404                             Register last_java_sp,
2405                             address entry_point,
2406                             Register arg_1,
2407                             Register arg_2,
2408                             bool check_exceptions) {
2409
2410  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2411  pass_arg2(this, arg_2);
2412  pass_arg1(this, arg_1);
2413  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
2414}
2415
2416void MacroAssembler::call_VM(Register oop_result,
2417                             Register last_java_sp,
2418                             address entry_point,
2419                             Register arg_1,
2420                             Register arg_2,
2421                             Register arg_3,
2422                             bool check_exceptions) {
2423  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2424  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2425  pass_arg3(this, arg_3);
2426  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2427  pass_arg2(this, arg_2);
2428  pass_arg1(this, arg_1);
2429  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
2430}
2431
2432void MacroAssembler::super_call_VM(Register oop_result,
2433                                   Register last_java_sp,
2434                                   address entry_point,
2435                                   int number_of_arguments,
2436                                   bool check_exceptions) {
2437  Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
2438  MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
2439}
2440
2441void MacroAssembler::super_call_VM(Register oop_result,
2442                                   Register last_java_sp,
2443                                   address entry_point,
2444                                   Register arg_1,
2445                                   bool check_exceptions) {
2446  pass_arg1(this, arg_1);
2447  super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
2448}
2449
2450void MacroAssembler::super_call_VM(Register oop_result,
2451                                   Register last_java_sp,
2452                                   address entry_point,
2453                                   Register arg_1,
2454                                   Register arg_2,
2455                                   bool check_exceptions) {
2456
2457  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2458  pass_arg2(this, arg_2);
2459  pass_arg1(this, arg_1);
2460  super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
2461}
2462
2463void MacroAssembler::super_call_VM(Register oop_result,
2464                                   Register last_java_sp,
2465                                   address entry_point,
2466                                   Register arg_1,
2467                                   Register arg_2,
2468                                   Register arg_3,
2469                                   bool check_exceptions) {
2470  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2471  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2472  pass_arg3(this, arg_3);
2473  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2474  pass_arg2(this, arg_2);
2475  pass_arg1(this, arg_1);
2476  super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
2477}
2478
2479void MacroAssembler::call_VM_base(Register oop_result,
2480                                  Register java_thread,
2481                                  Register last_java_sp,
2482                                  address  entry_point,
2483                                  int      number_of_arguments,
2484                                  bool     check_exceptions) {
2485  // determine java_thread register
2486  if (!java_thread->is_valid()) {
2487#ifdef _LP64
2488    java_thread = r15_thread;
2489#else
2490    java_thread = rdi;
2491    get_thread(java_thread);
2492#endif // LP64
2493  }
2494  // determine last_java_sp register
2495  if (!last_java_sp->is_valid()) {
2496    last_java_sp = rsp;
2497  }
2498  // debugging support
2499  assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
2500  LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
2501#ifdef ASSERT
2502  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
2503  // r12 is the heapbase.
2504  LP64_ONLY(if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");)
2505#endif // ASSERT
2506
2507  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
2508  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
2509
2510  // push java thread (becomes first argument of C function)
2511
2512  NOT_LP64(push(java_thread); number_of_arguments++);
2513  LP64_ONLY(mov(c_rarg0, r15_thread));
2514
2515  // set last Java frame before call
2516  assert(last_java_sp != rbp, "can't use ebp/rbp");
2517
2518  // Only interpreter should have to set fp
2519  set_last_Java_frame(java_thread, last_java_sp, rbp, NULL);
2520
2521  // do the call, remove parameters
2522  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);
2523
2524  // restore the thread (cannot use the pushed argument since arguments
2525  // may be overwritten by C code generated by an optimizing compiler);
2526  // however we can use the register value directly if it is callee saved.
2527  if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
2528    // rdi & rsi (also r15) are callee saved -> nothing to do
2529#ifdef ASSERT
2530    guarantee(java_thread != rax, "change this code");
2531    push(rax);
2532    { Label L;
2533      get_thread(rax);
2534      cmpptr(java_thread, rax);
2535      jcc(Assembler::equal, L);
2536      STOP("MacroAssembler::call_VM_base: rdi not callee saved?");
2537      bind(L);
2538    }
2539    pop(rax);
2540#endif
2541  } else {
2542    get_thread(java_thread);
2543  }
2544  // reset last Java frame
2545  // Only interpreter should have to clear fp
2546  reset_last_Java_frame(java_thread, true, false);
2547
2548#ifndef CC_INTERP
2549   // C++ interp handles this in the interpreter
2550  check_and_handle_popframe(java_thread);
2551  check_and_handle_earlyret(java_thread);
2552#endif /* CC_INTERP */
2553
2554  if (check_exceptions) {
2555    // check for pending exceptions (java_thread is set upon return)
2556    cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t) NULL_WORD);
2557#ifndef _LP64
2558    jump_cc(Assembler::notEqual,
2559            RuntimeAddress(StubRoutines::forward_exception_entry()));
2560#else
2561    // This used to conditionally jump to forward_exception however it is
2562    // possible if we relocate that the branch will not reach. So we must jump
2563    // around so we can always reach
2564
2565    Label ok;
2566    jcc(Assembler::equal, ok);
2567    jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2568    bind(ok);
2569#endif // LP64
2570  }
2571
2572  // get oop result if there is one and reset the value in the thread
2573  if (oop_result->is_valid()) {
2574    get_vm_result(oop_result, java_thread);
2575  }
2576}
2577
2578void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
2579
2580  // Calculate the value for last_Java_sp
2581  // somewhat subtle. call_VM does an intermediate call
2582  // which places a return address on the stack just under the
2583  // stack pointer as the user finsihed with it. This allows
2584  // use to retrieve last_Java_pc from last_Java_sp[-1].
2585  // On 32bit we then have to push additional args on the stack to accomplish
2586  // the actual requested call. On 64bit call_VM only can use register args
2587  // so the only extra space is the return address that call_VM created.
2588  // This hopefully explains the calculations here.
2589
2590#ifdef _LP64
2591  // We've pushed one address, correct last_Java_sp
2592  lea(rax, Address(rsp, wordSize));
2593#else
2594  lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
2595#endif // LP64
2596
2597  call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);
2598
2599}
2600
2601void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
2602  call_VM_leaf_base(entry_point, number_of_arguments);
2603}
2604
2605void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
2606  pass_arg0(this, arg_0);
2607  call_VM_leaf(entry_point, 1);
2608}
2609
2610void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
2611
2612  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2613  pass_arg1(this, arg_1);
2614  pass_arg0(this, arg_0);
2615  call_VM_leaf(entry_point, 2);
2616}
2617
2618void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
2619  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2620  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2621  pass_arg2(this, arg_2);
2622  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2623  pass_arg1(this, arg_1);
2624  pass_arg0(this, arg_0);
2625  call_VM_leaf(entry_point, 3);
2626}
2627
2628void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
2629  pass_arg0(this, arg_0);
2630  MacroAssembler::call_VM_leaf_base(entry_point, 1);
2631}
2632
2633void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
2634
2635  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2636  pass_arg1(this, arg_1);
2637  pass_arg0(this, arg_0);
2638  MacroAssembler::call_VM_leaf_base(entry_point, 2);
2639}
2640
2641void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
2642  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2643  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2644  pass_arg2(this, arg_2);
2645  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2646  pass_arg1(this, arg_1);
2647  pass_arg0(this, arg_0);
2648  MacroAssembler::call_VM_leaf_base(entry_point, 3);
2649}
2650
2651void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
2652  LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
2653  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2654  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2655  pass_arg3(this, arg_3);
2656  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2657  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2658  pass_arg2(this, arg_2);
2659  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2660  pass_arg1(this, arg_1);
2661  pass_arg0(this, arg_0);
2662  MacroAssembler::call_VM_leaf_base(entry_point, 4);
2663}
2664
2665void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
2666  movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
2667  movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD);
2668  verify_oop(oop_result, "broken oop in call_VM_base");
2669}
2670
2671void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
2672  movptr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
2673  movptr(Address(java_thread, JavaThread::vm_result_2_offset()), NULL_WORD);
2674}
2675
2676void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
2677}
2678
2679void MacroAssembler::check_and_handle_popframe(Register java_thread) {
2680}
2681
2682void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm) {
2683  if (reachable(src1)) {
2684    cmpl(as_Address(src1), imm);
2685  } else {
2686    lea(rscratch1, src1);
2687    cmpl(Address(rscratch1, 0), imm);
2688  }
2689}
2690
2691void MacroAssembler::cmp32(Register src1, AddressLiteral src2) {
2692  assert(!src2.is_lval(), "use cmpptr");
2693  if (reachable(src2)) {
2694    cmpl(src1, as_Address(src2));
2695  } else {
2696    lea(rscratch1, src2);
2697    cmpl(src1, Address(rscratch1, 0));
2698  }
2699}
2700
2701void MacroAssembler::cmp32(Register src1, int32_t imm) {
2702  Assembler::cmpl(src1, imm);
2703}
2704
2705void MacroAssembler::cmp32(Register src1, Address src2) {
2706  Assembler::cmpl(src1, src2);
2707}
2708
2709void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
2710  ucomisd(opr1, opr2);
2711
2712  Label L;
2713  if (unordered_is_less) {
2714    movl(dst, -1);
2715    jcc(Assembler::parity, L);
2716    jcc(Assembler::below , L);
2717    movl(dst, 0);
2718    jcc(Assembler::equal , L);
2719    increment(dst);
2720  } else { // unordered is greater
2721    movl(dst, 1);
2722    jcc(Assembler::parity, L);
2723    jcc(Assembler::above , L);
2724    movl(dst, 0);
2725    jcc(Assembler::equal , L);
2726    decrementl(dst);
2727  }
2728  bind(L);
2729}
2730
2731void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
2732  ucomiss(opr1, opr2);
2733
2734  Label L;
2735  if (unordered_is_less) {
2736    movl(dst, -1);
2737    jcc(Assembler::parity, L);
2738    jcc(Assembler::below , L);
2739    movl(dst, 0);
2740    jcc(Assembler::equal , L);
2741    increment(dst);
2742  } else { // unordered is greater
2743    movl(dst, 1);
2744    jcc(Assembler::parity, L);
2745    jcc(Assembler::above , L);
2746    movl(dst, 0);
2747    jcc(Assembler::equal , L);
2748    decrementl(dst);
2749  }
2750  bind(L);
2751}
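// The two helpers above match the semantics of Java's three-way floating-point
// compares (fcmpl/fcmpg, dcmpl/dcmpg): dst becomes -1, 0 or +1, with NaN
// steered by unordered_is_less. For example (illustrative), comparing NaN with
// 1.0f leaves dst == -1 when unordered_is_less is true (the fcmpl flavor) and
// dst == +1 otherwise (the fcmpg flavor), because ucomiss/ucomisd set PF on an
// unordered compare and the first jcc(parity) keeps the preloaded value.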
2752
2753
2754void MacroAssembler::cmp8(AddressLiteral src1, int imm) {
2755  if (reachable(src1)) {
2756    cmpb(as_Address(src1), imm);
2757  } else {
2758    lea(rscratch1, src1);
2759    cmpb(Address(rscratch1, 0), imm);
2760  }
2761}
2762
2763void MacroAssembler::cmpptr(Register src1, AddressLiteral src2) {
2764#ifdef _LP64
2765  if (src2.is_lval()) {
2766    movptr(rscratch1, src2);
2767    Assembler::cmpq(src1, rscratch1);
2768  } else if (reachable(src2)) {
2769    cmpq(src1, as_Address(src2));
2770  } else {
2771    lea(rscratch1, src2);
2772    Assembler::cmpq(src1, Address(rscratch1, 0));
2773  }
2774#else
2775  if (src2.is_lval()) {
2776    cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
2777  } else {
2778    cmpl(src1, as_Address(src2));
2779  }
2780#endif // _LP64
2781}
2782
2783void MacroAssembler::cmpptr(Address src1, AddressLiteral src2) {
2784  assert(src2.is_lval(), "not a mem-mem compare");
2785#ifdef _LP64
2786  // moves src2's literal address
2787  movptr(rscratch1, src2);
2788  Assembler::cmpq(src1, rscratch1);
2789#else
2790  cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
2791#endif // _LP64
2792}
2793
2794void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr) {
2795  if (reachable(adr)) {
2796    if (os::is_MP())
2797      lock();
2798    cmpxchgptr(reg, as_Address(adr));
2799  } else {
2800    lea(rscratch1, adr);
2801    if (os::is_MP())
2802      lock();
2803    cmpxchgptr(reg, Address(rscratch1, 0));
2804  }
2805}
2806
2807void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
2808  LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr));
2809}
2810
2811void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) {
2812  if (reachable(src)) {
2813    Assembler::comisd(dst, as_Address(src));
2814  } else {
2815    lea(rscratch1, src);
2816    Assembler::comisd(dst, Address(rscratch1, 0));
2817  }
2818}
2819
2820void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) {
2821  if (reachable(src)) {
2822    Assembler::comiss(dst, as_Address(src));
2823  } else {
2824    lea(rscratch1, src);
2825    Assembler::comiss(dst, Address(rscratch1, 0));
2826  }
2827}
2828
2829
2830void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr) {
2831  Condition negated_cond = negate_condition(cond);
2832  Label L;
2833  jcc(negated_cond, L);
2834  pushf(); // Preserve flags
2835  atomic_incl(counter_addr);
2836  popf();
2837  bind(L);
2838}
2839
2840int MacroAssembler::corrected_idivl(Register reg) {
2841  // Full implementation of Java idiv and irem; checks for
2842  // special case as described in JVM spec., p.243 & p.271.
2843  // The function returns the (pc) offset of the idivl
2844  // instruction - may be needed for implicit exceptions.
2845  //
2846  //         normal case                           special case
2847  //
2848  // input : rax: dividend                          min_int
2849  //         reg: divisor   (may not be rax/rdx)    -1
2850  //
2851  // output: rax: quotient  (= rax idiv reg)        min_int
2852  //         rdx: remainder (= rax irem reg)        0
2853  assert(reg != rax && reg != rdx, "reg cannot be rax, or rdx register");
2854  const int min_int = 0x80000000;
2855  Label normal_case, special_case;
2856
2857  // check for special case
2858  cmpl(rax, min_int);
2859  jcc(Assembler::notEqual, normal_case);
2860  xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
2861  cmpl(reg, -1);
2862  jcc(Assembler::equal, special_case);
2863
2864  // handle normal case
2865  bind(normal_case);
2866  cdql();
2867  int idivl_offset = offset();
2868  idivl(reg);
2869
2870  // normal and special case exit
2871  bind(special_case);
2872
2873  return idivl_offset;
2874}
2875
2876
2877
2878void MacroAssembler::decrementl(Register reg, int value) {
2879  if (value == min_jint) {subl(reg, value) ; return; }
2880  if (value <  0) { incrementl(reg, -value); return; }
2881  if (value == 0) {                        ; return; }
2882  if (value == 1 && UseIncDec) { decl(reg) ; return; }
2883  /* else */      { subl(reg, value)       ; return; }
2884}
2885
2886void MacroAssembler::decrementl(Address dst, int value) {
2887  if (value == min_jint) {subl(dst, value) ; return; }
2888  if (value <  0) { incrementl(dst, -value); return; }
2889  if (value == 0) {                        ; return; }
2890  if (value == 1 && UseIncDec) { decl(dst) ; return; }
2891  /* else */      { subl(dst, value)       ; return; }
2892}
2893
2894void MacroAssembler::division_with_shift (Register reg, int shift_value) {
2895  assert (shift_value > 0, "illegal shift value");
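  // Signed division by 2^shift_value must round toward zero, but an arithmetic
  // shift alone rounds toward -infinity, so negative dividends are biased by
  // 2^shift - 1 first. Small example: -7 / 4 should be -1, yet -7 >> 2 == -2;
  // (-7 + 3) >> 2 == -1 as required.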
2896  Label _is_positive;
2897  testl (reg, reg);
2898  jcc (Assembler::positive, _is_positive);
2899  int offset = (1 << shift_value) - 1 ;
2900
2901  if (offset == 1) {
2902    incrementl(reg);
2903  } else {
2904    addl(reg, offset);
2905  }
2906
2907  bind (_is_positive);
2908  sarl(reg, shift_value);
2909}
2910
2911void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src) {
2912  if (reachable(src)) {
2913    Assembler::divsd(dst, as_Address(src));
2914  } else {
2915    lea(rscratch1, src);
2916    Assembler::divsd(dst, Address(rscratch1, 0));
2917  }
2918}
2919
2920void MacroAssembler::divss(XMMRegister dst, AddressLiteral src) {
2921  if (reachable(src)) {
2922    Assembler::divss(dst, as_Address(src));
2923  } else {
2924    lea(rscratch1, src);
2925    Assembler::divss(dst, Address(rscratch1, 0));
2926  }
2927}
2928
2929// !defined(COMPILER2) is because of stupid core builds
2930#if !defined(_LP64) || defined(COMPILER1) || !defined(COMPILER2)
2931void MacroAssembler::empty_FPU_stack() {
2932  if (VM_Version::supports_mmx()) {
2933    emms();
2934  } else {
2935    for (int i = 8; i-- > 0; ) ffree(i);
2936  }
2937}
2938#endif // !LP64 || C1 || !C2
2939
2940
2941// Defines obj, preserves var_size_in_bytes
2942void MacroAssembler::eden_allocate(Register obj,
2943                                   Register var_size_in_bytes,
2944                                   int con_size_in_bytes,
2945                                   Register t1,
2946                                   Label& slow_case) {
2947  assert(obj == rax, "obj must be in rax, for cmpxchg");
2948  assert_different_registers(obj, var_size_in_bytes, t1);
2949  if (!Universe::heap()->supports_inline_contig_alloc()) {
2950    jmp(slow_case);
2951  } else {
2952    Register end = t1;
2953    Label retry;
2954    bind(retry);
2955    ExternalAddress heap_top((address) Universe::heap()->top_addr());
2956    movptr(obj, heap_top);
2957    if (var_size_in_bytes == noreg) {
2958      lea(end, Address(obj, con_size_in_bytes));
2959    } else {
2960      lea(end, Address(obj, var_size_in_bytes, Address::times_1));
2961    }
2962    // if end < obj then we wrapped around => object too long => slow case
2963    cmpptr(end, obj);
2964    jcc(Assembler::below, slow_case);
2965    cmpptr(end, ExternalAddress((address) Universe::heap()->end_addr()));
2966    jcc(Assembler::above, slow_case);
2967    // Compare obj with the top addr, and if still equal, store the new top addr
2968    // (end) at the address of the top addr pointer. Sets ZF if it was equal, and
2969    // clears it otherwise. Use lock prefix for atomicity on MPs.
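    // Roughly equivalent pseudo-code for the retry loop (sketch only, not
    // generated code):
    //   do {
    //     obj = *top_addr;
    //     end = obj + size;                          // wrap-around => slow case
    //     if (end < obj || end > *end_addr) goto slow_case;
    //   } while (!CAS(top_addr, obj, end));          // lock cmpxchg sets ZF on success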
2970    locked_cmpxchgptr(end, heap_top);
2971    jcc(Assembler::notEqual, retry);
2972  }
2973}
2974
2975void MacroAssembler::enter() {
2976  push(rbp);
2977  mov(rbp, rsp);
2978}
2979
2980// A 5 byte nop that is safe for patching (see patch_verified_entry)
2981void MacroAssembler::fat_nop() {
2982  if (UseAddressNop) {
2983    addr_nop_5();
2984  } else {
2985    emit_int8(0x26); // es:
2986    emit_int8(0x2e); // cs:
2987    emit_int8(0x64); // fs:
2988    emit_int8(0x65); // gs:
2989    emit_int8((unsigned char)0x90);
2990  }
2991}
2992
2993void MacroAssembler::fcmp(Register tmp) {
2994  fcmp(tmp, 1, true, true);
2995}
2996
2997void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) {
2998  assert(!pop_right || pop_left, "usage error");
2999  if (VM_Version::supports_cmov()) {
3000    assert(tmp == noreg, "unneeded temp");
3001    if (pop_left) {
3002      fucomip(index);
3003    } else {
3004      fucomi(index);
3005    }
3006    if (pop_right) {
3007      fpop();
3008    }
3009  } else {
3010    assert(tmp != noreg, "need temp");
3011    if (pop_left) {
3012      if (pop_right) {
3013        fcompp();
3014      } else {
3015        fcomp(index);
3016      }
3017    } else {
3018      fcom(index);
3019    }
3020    // convert FPU condition into eflags condition via rax
3021    save_rax(tmp);
3022    fwait(); fnstsw_ax();
3023    sahf();
3024    restore_rax(tmp);
3025  }
3026  // condition codes set as follows:
3027  //
3028  // CF (corresponds to C0) if x < y
3029  // PF (corresponds to C2) if unordered
3030  // ZF (corresponds to C3) if x = y
3031}
3032
3033void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) {
3034  fcmp2int(dst, unordered_is_less, 1, true, true);
3035}
3036
3037void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) {
3038  fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right);
3039  Label L;
3040  if (unordered_is_less) {
3041    movl(dst, -1);
3042    jcc(Assembler::parity, L);
3043    jcc(Assembler::below , L);
3044    movl(dst, 0);
3045    jcc(Assembler::equal , L);
3046    increment(dst);
3047  } else { // unordered is greater
3048    movl(dst, 1);
3049    jcc(Assembler::parity, L);
3050    jcc(Assembler::above , L);
3051    movl(dst, 0);
3052    jcc(Assembler::equal , L);
3053    decrementl(dst);
3054  }
3055  bind(L);
3056}
3057
3058void MacroAssembler::fld_d(AddressLiteral src) {
3059  fld_d(as_Address(src));
3060}
3061
3062void MacroAssembler::fld_s(AddressLiteral src) {
3063  fld_s(as_Address(src));
3064}
3065
3066void MacroAssembler::fld_x(AddressLiteral src) {
3067  Assembler::fld_x(as_Address(src));
3068}
3069
3070void MacroAssembler::fldcw(AddressLiteral src) {
3071  Assembler::fldcw(as_Address(src));
3072}
3073
3074void MacroAssembler::pow_exp_core_encoding() {
3075  // kills rax, rcx, rdx
3076  subptr(rsp,sizeof(jdouble));
3077  // computes 2^X. Stack: X ...
3078  // f2xm1 computes 2^X-1 but only operates on -1<=X<=1. Get int(X) and
3079  // keep it on the thread's stack to compute 2^int(X) later
3080  // then compute 2^(X-int(X)) as (2^(X-int(X))-1)+1
3081  // final result is obtained with: 2^X = 2^int(X) * 2^(X-int(X))
3082  fld_s(0);                 // Stack: X X ...
3083  frndint();                // Stack: int(X) X ...
3084  fsuba(1);                 // Stack: int(X) X-int(X) ...
3085  fistp_s(Address(rsp,0));  // move int(X) as integer to thread's stack. Stack: X-int(X) ...
3086  f2xm1();                  // Stack: 2^(X-int(X))-1 ...
3087  fld1();                   // Stack: 1 2^(X-int(X))-1 ...
3088  faddp(1);                 // Stack: 2^(X-int(X))
3089  // computes 2^(int(X)): add exponent bias (1023) to int(X), then
3090  // shift int(X)+1023 to exponent position.
3091  // The exponent is limited to 11 bits: if int(X)+1023 does not fit in 11
3092  // bits, set the result to NaN. 0x000 and 0x7FF are reserved exponent
3093  // values, so detect them and set the result to NaN as well.
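  // Worked example (plain IEEE-754 arithmetic, for illustration): for
  // int(X) == 3 the biased exponent is 3+1023 == 1026 == 0x402; shifted left
  // by 20 it gives a high word of 0x40200000, and with a zero low word the
  // stored double is 0x4020000000000000 == 8.0 == 2^3, which the fmul_d below
  // then uses to scale 2^(X-int(X)).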
3094  movl(rax,Address(rsp,0));
3095  movl(rcx, -2048); // 11 bit mask and valid NaN binary encoding
3096  addl(rax, 1023);
3097  movl(rdx,rax);
3098  shll(rax,20);
3099  // Check that 0 < int(X)+1023 < 2047. Otherwise set rax to NaN.
3100  addl(rdx,1);
3101  // Check that 1 < int(X)+1023+1 < 2048
3102  // in 3 steps:
3103  // 1- (int(X)+1023+1)&-2048 == 0 => 0 <= int(X)+1023+1 < 2048
3104  // 2- (int(X)+1023+1) != 0
3105  // 3- (int(X)+1023+1) != 1
3106  // Do 2- first because addl just updated the flags.
3107  cmov32(Assembler::equal,rax,rcx);
3108  cmpl(rdx,1);
3109  cmov32(Assembler::equal,rax,rcx);
3110  testl(rdx,rcx);
3111  cmov32(Assembler::notEqual,rax,rcx);
3112  movl(Address(rsp,4),rax);
3113  movl(Address(rsp,0),0);
3114  fmul_d(Address(rsp,0));   // Stack: 2^X ...
3115  addptr(rsp,sizeof(jdouble));
3116}
3117
3118void MacroAssembler::increase_precision() {
3119  subptr(rsp, BytesPerWord);
3120  fnstcw(Address(rsp, 0));
3121  movl(rax, Address(rsp, 0));
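  // 0x300 sets the precision-control field (bits 8-9 of the x87 control word)
  // to 11b, i.e. a 64-bit significand (double-extended precision).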
3122  orl(rax, 0x300);
3123  push(rax);
3124  fldcw(Address(rsp, 0));
3125  pop(rax);
3126}
3127
3128void MacroAssembler::restore_precision() {
3129  fldcw(Address(rsp, 0));
3130  addptr(rsp, BytesPerWord);
3131}
3132
3133void MacroAssembler::fast_pow() {
3134  // computes X^Y = 2^(Y * log2(X))
3135  // if fast computation is not possible, result is NaN. Requires
3136  // fallback from user of this macro.
3137  // increase precision for intermediate steps of the computation
3138  BLOCK_COMMENT("fast_pow {");
3139  increase_precision();
3140  fyl2x();                 // Stack: (Y*log2(X)) ...
3141  pow_exp_core_encoding(); // Stack: 2^(Y*log2(X)) = X^Y ...
3142  restore_precision();
3143  BLOCK_COMMENT("} fast_pow");
3144}
3145
3146void MacroAssembler::fast_exp() {
3147  // computes exp(X) = 2^(X * log2(e))
3148  // if fast computation is not possible, result is NaN. Requires
3149  // fallback from user of this macro.
3150  // increase precision for intermediate steps of the computation
3151  increase_precision();
3152  fldl2e();                // Stack: log2(e) X ...
3153  fmulp(1);                // Stack: (X*log2(e)) ...
3154  pow_exp_core_encoding(); // Stack: exp(X) ...
3155  restore_precision();
3156}
3157
3158void MacroAssembler::pow_or_exp(bool is_exp, int num_fpu_regs_in_use) {
3159  // kills rax, rcx, rdx
3160  // pow and exp need 2 extra registers on the fpu stack.
3161  Label slow_case, done;
3162  Register tmp = noreg;
3163  if (!VM_Version::supports_cmov()) {
3164    // fcmp needs a temporary, so preserve rdx
3165    tmp = rdx;
3166  }
3167  Register tmp2 = rax;
3168  Register tmp3 = rcx;
3169
3170  if (is_exp) {
3171    // Stack: X
3172    fld_s(0);                   // duplicate argument for runtime call. Stack: X X
3173    fast_exp();                 // Stack: exp(X) X
3174    fcmp(tmp, 0, false, false); // Stack: exp(X) X
3175    // exp(X) not equal to itself: exp(X) is NaN, go to slow case.
3176    jcc(Assembler::parity, slow_case);
3177    // get rid of duplicate argument. Stack: exp(X)
3178    if (num_fpu_regs_in_use > 0) {
3179      fxch();
3180      fpop();
3181    } else {
3182      ffree(1);
3183    }
3184    jmp(done);
3185  } else {
3186    // Stack: X Y
3187    Label x_negative, y_odd;
3188
3189    fldz();                     // Stack: 0 X Y
3190    fcmp(tmp, 1, true, false);  // Stack: X Y
3191    jcc(Assembler::above, x_negative);
3192
3193    // X >= 0
3194
3195    fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
3196    fld_s(1);                   // Stack: X Y X Y
3197    fast_pow();                 // Stack: X^Y X Y
3198    fcmp(tmp, 0, false, false); // Stack: X^Y X Y
3199    // X^Y not equal to itself: X^Y is NaN, go to slow case.
3200    jcc(Assembler::parity, slow_case);
3201    // get rid of duplicate arguments. Stack: X^Y
3202    if (num_fpu_regs_in_use > 0) {
3203      fxch(); fpop();
3204      fxch(); fpop();
3205    } else {
3206      ffree(2);
3207      ffree(1);
3208    }
3209    jmp(done);
3210
3211    // X <= 0
3212    bind(x_negative);
3213
3214    fld_s(1);                   // Stack: Y X Y
3215    frndint();                  // Stack: int(Y) X Y
3216    fcmp(tmp, 2, false, false); // Stack: int(Y) X Y
3217    jcc(Assembler::notEqual, slow_case);
3218
3219    subptr(rsp, 8);
3220
3221    // For X^Y, when X < 0, Y has to be an integer and the final
3222    // result depends on whether it's odd or even. We just checked
3223    // that int(Y) == Y.  We move int(Y) to gp registers as a 64 bit
3224    // integer to test its parity. If int(Y) is huge and doesn't fit
3225    // in the 64 bit integer range, the integer indefinite value will
3226    // end up in the gp registers. Huge numbers are all even, the
3227    // integer indefinite number is even so it's fine.
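    // Small example of why the parity matters (basic arithmetic, for
    // illustration): (-2)^3 == -8 == -(2^3) because 3 is odd, while
    // (-2)^2 == 4 == 2^2 because 2 is even; the sign flip further below is
    // applied only in the odd case.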
3228
3229#ifdef ASSERT
3230    // Let's check we don't end up with an integer indefinite number
3231    // when not expected. First test for huge numbers: check whether
3232    // int(Y)+1 == int(Y) which is true for very large numbers and
3233    // those are all even. A 64 bit integer is guaranteed to not
3234    // overflow for numbers where y+1 != y (when precision is set to
3235    // double precision).
3236    Label y_not_huge;
3237
3238    fld1();                     // Stack: 1 int(Y) X Y
3239    fadd(1);                    // Stack: 1+int(Y) int(Y) X Y
3240
3241#ifdef _LP64
3242    // trip to memory to force the precision down from double extended
3243    // precision
3244    fstp_d(Address(rsp, 0));
3245    fld_d(Address(rsp, 0));
3246#endif
3247
3248    fcmp(tmp, 1, true, false);  // Stack: int(Y) X Y
3249#endif
3250
3251    // move int(Y) as 64 bit integer to thread's stack
3252    fistp_d(Address(rsp,0));    // Stack: X Y
3253
3254#ifdef ASSERT
3255    jcc(Assembler::notEqual, y_not_huge);
3256
3257    // Y is huge so we know it's even. It may not fit in a 64 bit
3258    // integer and we don't want the debug code below to see the
3259    // integer indefinite value so overwrite int(Y) on the thread's
3260    // stack with 0.
3261    movl(Address(rsp, 0), 0);
3262    movl(Address(rsp, 4), 0);
3263
3264    bind(y_not_huge);
3265#endif
3266
3267    fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
3268    fld_s(1);                   // Stack: X Y X Y
3269    fabs();                     // Stack: abs(X) Y X Y
3270    fast_pow();                 // Stack: abs(X)^Y X Y
3271    fcmp(tmp, 0, false, false); // Stack: abs(X)^Y X Y
3272    // abs(X)^Y not equal to itself: abs(X)^Y is NaN, go to slow case.
3273
3274    pop(tmp2);
3275    NOT_LP64(pop(tmp3));
3276    jcc(Assembler::parity, slow_case);
3277
3278#ifdef ASSERT
3279    // Check that int(Y) is not integer indefinite value (int
3280    // overflow). Shouldn't happen because for values that would
3281    // overflow, 1+int(Y)==Y which was tested earlier.
3282#ifndef _LP64
3283    {
3284      Label integer;
3285      testl(tmp2, tmp2);
3286      jcc(Assembler::notZero, integer);
3287      cmpl(tmp3, 0x80000000);
3288      jcc(Assembler::notZero, integer);
3289      STOP("integer indefinite value shouldn't be seen here");
3290      bind(integer);
3291    }
3292#else
3293    {
3294      Label integer;
3295      mov(tmp3, tmp2); // preserve tmp2 for parity check below
3296      shlq(tmp3, 1);
3297      jcc(Assembler::carryClear, integer);
3298      jcc(Assembler::notZero, integer);
3299      STOP("integer indefinite value shouldn't be seen here");
3300      bind(integer);
3301    }
3302#endif
3303#endif
3304
3305    // get rid of duplicate arguments. Stack: X^Y
3306    if (num_fpu_regs_in_use > 0) {
3307      fxch(); fpop();
3308      fxch(); fpop();
3309    } else {
3310      ffree(2);
3311      ffree(1);
3312    }
3313
3314    testl(tmp2, 1);
3315    jcc(Assembler::zero, done); // X <= 0, Y even: X^Y = abs(X)^Y
3316    // X <= 0, Y odd: X^Y = -abs(X)^Y
3317
3318    fchs();                     // Stack: -abs(X)^Y
3319    jmp(done);
3320  }
3321
3322  // slow case: runtime call
3323  bind(slow_case);
3324
3325  fpop();                       // pop incorrect result or int(Y)
3326
3327  fp_runtime_fallback(is_exp ? CAST_FROM_FN_PTR(address, SharedRuntime::dexp) : CAST_FROM_FN_PTR(address, SharedRuntime::dpow),
3328                      is_exp ? 1 : 2, num_fpu_regs_in_use);
3329
3330  // Come here with result in F-TOS
3331  bind(done);
3332}
3333
3334void MacroAssembler::fpop() {
3335  ffree();
3336  fincstp();
3337}
3338
3339void MacroAssembler::fremr(Register tmp) {
3340  save_rax(tmp);
3341  { Label L;
3342    bind(L);
3343    fprem();
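    // fprem produces only a partial remainder per iteration; the FPU sets the
    // C2 status bit (0x400) while the reduction is incomplete, so loop until
    // it clears. The 32-bit path below reads C2 via sahf as the parity flag.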
3344    fwait(); fnstsw_ax();
3345#ifdef _LP64
3346    testl(rax, 0x400);
3347    jcc(Assembler::notEqual, L);
3348#else
3349    sahf();
3350    jcc(Assembler::parity, L);
3351#endif // _LP64
3352  }
3353  restore_rax(tmp);
3354  // Result is in ST0.
3355  // Note: fxch & fpop to get rid of ST1
3356  // (otherwise FPU stack could overflow eventually)
3357  fxch(1);
3358  fpop();
3359}
3360
3361
3362void MacroAssembler::incrementl(AddressLiteral dst) {
3363  if (reachable(dst)) {
3364    incrementl(as_Address(dst));
3365  } else {
3366    lea(rscratch1, dst);
3367    incrementl(Address(rscratch1, 0));
3368  }
3369}
3370
3371void MacroAssembler::incrementl(ArrayAddress dst) {
3372  incrementl(as_Address(dst));
3373}
3374
3375void MacroAssembler::incrementl(Register reg, int value) {
3376  if (value == min_jint) {addl(reg, value) ; return; }
3377  if (value <  0) { decrementl(reg, -value); return; }
3378  if (value == 0) {                        ; return; }
3379  if (value == 1 && UseIncDec) { incl(reg) ; return; }
3380  /* else */      { addl(reg, value)       ; return; }
3381}
3382
3383void MacroAssembler::incrementl(Address dst, int value) {
3384  if (value == min_jint) {addl(dst, value) ; return; }
3385  if (value <  0) { decrementl(dst, -value); return; }
3386  if (value == 0) {                        ; return; }
3387  if (value == 1 && UseIncDec) { incl(dst) ; return; }
3388  /* else */      { addl(dst, value)       ; return; }
3389}
3390
3391void MacroAssembler::jump(AddressLiteral dst) {
3392  if (reachable(dst)) {
3393    jmp_literal(dst.target(), dst.rspec());
3394  } else {
3395    lea(rscratch1, dst);
3396    jmp(rscratch1);
3397  }
3398}
3399
3400void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst) {
3401  if (reachable(dst)) {
3402    InstructionMark im(this);
3403    relocate(dst.reloc());
3404    const int short_size = 2;
3405    const int long_size = 6;
3406    int offs = (intptr_t)dst.target() - ((intptr_t)pc());
3407    if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
3408      // 0111 tttn #8-bit disp
3409      emit_int8(0x70 | cc);
3410      emit_int8((offs - short_size) & 0xFF);
3411    } else {
3412      // 0000 1111 1000 tttn #32-bit disp
3413      emit_int8(0x0F);
3414      emit_int8((unsigned char)(0x80 | cc));
3415      emit_int32(offs - long_size);
3416    }
3417  } else {
3418#ifdef ASSERT
3419    warning("reversing conditional branch");
3420#endif /* ASSERT */
3421    Label skip;
3422    jccb(reverse[cc], skip);
3423    lea(rscratch1, dst);
3424    Assembler::jmp(rscratch1);
3425    bind(skip);
3426  }
3427}
3428
3429void MacroAssembler::ldmxcsr(AddressLiteral src) {
3430  if (reachable(src)) {
3431    Assembler::ldmxcsr(as_Address(src));
3432  } else {
3433    lea(rscratch1, src);
3434    Assembler::ldmxcsr(Address(rscratch1, 0));
3435  }
3436}
3437
3438int MacroAssembler::load_signed_byte(Register dst, Address src) {
3439  int off;
3440  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3441    off = offset();
3442    movsbl(dst, src); // movsxb
3443  } else {
3444    off = load_unsigned_byte(dst, src);
3445    shll(dst, 24);
3446    sarl(dst, 24);
3447  }
3448  return off;
3449}
3450
3451// Note: load_signed_short used to be called load_signed_word.
3452// Although the 'w' in x86 opcodes refers to the term "word" in the assembler
3453// manual, which means 16 bits, that usage is found nowhere in HotSpot code.
3454// The term "word" in HotSpot means a 32- or 64-bit machine word.
3455int MacroAssembler::load_signed_short(Register dst, Address src) {
3456  int off;
3457  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3458    // This is dubious to me since it seems safe to do a signed 16 => 64 bit
3459    // version, but this is what 64bit has always done. This seems to imply
3460    // that users are only using 32bits worth.
3461    off = offset();
3462    movswl(dst, src); // movsxw
3463  } else {
3464    off = load_unsigned_short(dst, src);
3465    shll(dst, 16);
3466    sarl(dst, 16);
3467  }
3468  return off;
3469}
3470
3471int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
3472  // According to Intel Doc. AP-526, "Zero-Extension of Short" (p. 16)
3473  // and "3.9 Partial Register Penalties" (p. 22).
3474  int off;
3475  if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) {
3476    off = offset();
3477    movzbl(dst, src); // movzxb
3478  } else {
3479    xorl(dst, dst);
3480    off = offset();
3481    movb(dst, src);
3482  }
3483  return off;
3484}
3485
3486// Note: load_unsigned_short used to be called load_unsigned_word.
3487int MacroAssembler::load_unsigned_short(Register dst, Address src) {
3488  // According to Intel Doc. AP-526, "Zero-Extension of Short" (p. 16)
3489  // and "3.9 Partial Register Penalties" (p. 22).
3490  int off;
3491  if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
3492    off = offset();
3493    movzwl(dst, src); // movzxw
3494  } else {
3495    xorl(dst, dst);
3496    off = offset();
3497    movw(dst, src);
3498  }
3499  return off;
3500}
3501
3502void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
3503  switch (size_in_bytes) {
3504#ifndef _LP64
3505  case  8:
3506    assert(dst2 != noreg, "second dest register required");
3507    movl(dst,  src);
3508    movl(dst2, src.plus_disp(BytesPerInt));
3509    break;
3510#else
3511  case  8:  movq(dst, src); break;
3512#endif
3513  case  4:  movl(dst, src); break;
3514  case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
3515  case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
3516  default:  ShouldNotReachHere();
3517  }
3518}
3519
3520void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
3521  switch (size_in_bytes) {
3522#ifndef _LP64
3523  case  8:
3524    assert(src2 != noreg, "second source register required");
3525    movl(dst,                        src);
3526    movl(dst.plus_disp(BytesPerInt), src2);
3527    break;
3528#else
3529  case  8:  movq(dst, src); break;
3530#endif
3531  case  4:  movl(dst, src); break;
3532  case  2:  movw(dst, src); break;
3533  case  1:  movb(dst, src); break;
3534  default:  ShouldNotReachHere();
3535  }
3536}
3537
3538void MacroAssembler::mov32(AddressLiteral dst, Register src) {
3539  if (reachable(dst)) {
3540    movl(as_Address(dst), src);
3541  } else {
3542    lea(rscratch1, dst);
3543    movl(Address(rscratch1, 0), src);
3544  }
3545}
3546
3547void MacroAssembler::mov32(Register dst, AddressLiteral src) {
3548  if (reachable(src)) {
3549    movl(dst, as_Address(src));
3550  } else {
3551    lea(rscratch1, src);
3552    movl(dst, Address(rscratch1, 0));
3553  }
3554}
3555
3556// C++ bool manipulation
3557
3558void MacroAssembler::movbool(Register dst, Address src) {
3559  if(sizeof(bool) == 1)
3560    movb(dst, src);
3561  else if(sizeof(bool) == 2)
3562    movw(dst, src);
3563  else if(sizeof(bool) == 4)
3564    movl(dst, src);
3565  else
3566    // unsupported
3567    ShouldNotReachHere();
3568}
3569
3570void MacroAssembler::movbool(Address dst, bool boolconst) {
3571  if(sizeof(bool) == 1)
3572    movb(dst, (int) boolconst);
3573  else if(sizeof(bool) == 2)
3574    movw(dst, (int) boolconst);
3575  else if(sizeof(bool) == 4)
3576    movl(dst, (int) boolconst);
3577  else
3578    // unsupported
3579    ShouldNotReachHere();
3580}
3581
3582void MacroAssembler::movbool(Address dst, Register src) {
3583  if(sizeof(bool) == 1)
3584    movb(dst, src);
3585  else if(sizeof(bool) == 2)
3586    movw(dst, src);
3587  else if(sizeof(bool) == 4)
3588    movl(dst, src);
3589  else
3590    // unsupported
3591    ShouldNotReachHere();
3592}
3593
3594void MacroAssembler::movbyte(ArrayAddress dst, int src) {
3595  movb(as_Address(dst), src);
3596}
3597
3598void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src) {
3599  if (reachable(src)) {
3600    movdl(dst, as_Address(src));
3601  } else {
3602    lea(rscratch1, src);
3603    movdl(dst, Address(rscratch1, 0));
3604  }
3605}
3606
3607void MacroAssembler::movq(XMMRegister dst, AddressLiteral src) {
3608  if (reachable(src)) {
3609    movq(dst, as_Address(src));
3610  } else {
3611    lea(rscratch1, src);
3612    movq(dst, Address(rscratch1, 0));
3613  }
3614}
3615
3616void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) {
3617  if (reachable(src)) {
3618    if (UseXmmLoadAndClearUpper) {
3619      movsd (dst, as_Address(src));
3620    } else {
3621      movlpd(dst, as_Address(src));
3622    }
3623  } else {
3624    lea(rscratch1, src);
3625    if (UseXmmLoadAndClearUpper) {
3626      movsd (dst, Address(rscratch1, 0));
3627    } else {
3628      movlpd(dst, Address(rscratch1, 0));
3629    }
3630  }
3631}
3632
3633void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src) {
3634  if (reachable(src)) {
3635    movss(dst, as_Address(src));
3636  } else {
3637    lea(rscratch1, src);
3638    movss(dst, Address(rscratch1, 0));
3639  }
3640}
3641
3642void MacroAssembler::movptr(Register dst, Register src) {
3643  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3644}
3645
3646void MacroAssembler::movptr(Register dst, Address src) {
3647  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3648}
3649
3650// src should NEVER be a real pointer. Use AddressLiteral for true pointers
3651void MacroAssembler::movptr(Register dst, intptr_t src) {
3652  LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
3653}
3654
3655void MacroAssembler::movptr(Address dst, Register src) {
3656  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3657}
3658
3659void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src) {
3660  if (reachable(src)) {
3661    Assembler::movdqu(dst, as_Address(src));
3662  } else {
3663    lea(rscratch1, src);
3664    Assembler::movdqu(dst, Address(rscratch1, 0));
3665  }
3666}
3667
3668void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) {
3669  if (reachable(src)) {
3670    Assembler::movdqa(dst, as_Address(src));
3671  } else {
3672    lea(rscratch1, src);
3673    Assembler::movdqa(dst, Address(rscratch1, 0));
3674  }
3675}
3676
3677void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
3678  if (reachable(src)) {
3679    Assembler::movsd(dst, as_Address(src));
3680  } else {
3681    lea(rscratch1, src);
3682    Assembler::movsd(dst, Address(rscratch1, 0));
3683  }
3684}
3685
3686void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
3687  if (reachable(src)) {
3688    Assembler::movss(dst, as_Address(src));
3689  } else {
3690    lea(rscratch1, src);
3691    Assembler::movss(dst, Address(rscratch1, 0));
3692  }
3693}
3694
3695void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) {
3696  if (reachable(src)) {
3697    Assembler::mulsd(dst, as_Address(src));
3698  } else {
3699    lea(rscratch1, src);
3700    Assembler::mulsd(dst, Address(rscratch1, 0));
3701  }
3702}
3703
3704void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src) {
3705  if (reachable(src)) {
3706    Assembler::mulss(dst, as_Address(src));
3707  } else {
3708    lea(rscratch1, src);
3709    Assembler::mulss(dst, Address(rscratch1, 0));
3710  }
3711}
3712
3713void MacroAssembler::null_check(Register reg, int offset) {
3714  if (needs_explicit_null_check(offset)) {
3715    // provoke OS NULL exception if reg = NULL by
3716    // accessing M[reg] w/o changing any (non-CC) registers
3717    // NOTE: cmpl is plenty here to provoke a segv
3718    cmpptr(rax, Address(reg, 0));
3719    // Note: should probably use testl(rax, Address(reg, 0));
3720    //       may be shorter code (however, this version of
3721    //       testl needs to be implemented first)
3722  } else {
3723    // nothing to do, (later) access of M[reg + offset]
3724    // will provoke OS NULL exception if reg = NULL
3725  }
3726}
3727
3728void MacroAssembler::os_breakpoint() {
3729  // instead of directly emitting a breakpoint, call os:breakpoint for better debugability
3730  // (e.g., MSVC can't call ps() otherwise)
3731  call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
3732}
3733
3734void MacroAssembler::pop_CPU_state() {
3735  pop_FPU_state();
3736  pop_IU_state();
3737}
3738
3739void MacroAssembler::pop_FPU_state() {
3740  NOT_LP64(frstor(Address(rsp, 0));)
3741  LP64_ONLY(fxrstor(Address(rsp, 0));)
3742  addptr(rsp, FPUStateSizeInWords * wordSize);
3743}
3744
3745void MacroAssembler::pop_IU_state() {
3746  popa();
3747  LP64_ONLY(addq(rsp, 8));
3748  popf();
3749}
3750
3751// Save Integer and Float state
3752// Warning: Stack must be 16 byte aligned (64bit)
3753void MacroAssembler::push_CPU_state() {
3754  push_IU_state();
3755  push_FPU_state();
3756}
3757
3758void MacroAssembler::push_FPU_state() {
3759  subptr(rsp, FPUStateSizeInWords * wordSize);
3760#ifndef _LP64
3761  fnsave(Address(rsp, 0));
3762  fwait();
3763#else
3764  fxsave(Address(rsp, 0));
3765#endif // LP64
3766}
3767
3768void MacroAssembler::push_IU_state() {
3769  // Push flags first because pusha kills them
3770  pushf();
3771  // Make sure rsp stays 16-byte aligned
3772  LP64_ONLY(subq(rsp, 8));
3773  pusha();
3774}
3775
3776void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp, bool clear_pc) {
3777  // determine java_thread register
3778  if (!java_thread->is_valid()) {
3779    java_thread = rdi;
3780    get_thread(java_thread);
3781  }
3782  // we must set sp to zero to clear frame
3783  movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
3784  if (clear_fp) {
3785    movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
3786  }
3787
3788  if (clear_pc)
3789    movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
3790
3791}
3792
3793void MacroAssembler::restore_rax(Register tmp) {
3794  if (tmp == noreg) pop(rax);
3795  else if (tmp != rax) mov(rax, tmp);
3796}
3797
3798void MacroAssembler::round_to(Register reg, int modulus) {
3799  addptr(reg, modulus - 1);
3800  andptr(reg, -modulus);
3801}
3802
3803void MacroAssembler::save_rax(Register tmp) {
3804  if (tmp == noreg) push(rax);
3805  else if (tmp != rax) mov(tmp, rax);
3806}
3807
3808// Write serialization page so VM thread can do a pseudo remote membar.
3809// We use the current thread pointer to calculate a thread specific
3810// offset to write to within the page. This minimizes bus traffic
3811// due to cache line collision.
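// For example, with a 4K serialization page and an int-sized store, two
// threads whose shifted thread pointers differ land on different words of the
// page, so their serializing stores typically do not share a cache line.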
3812void MacroAssembler::serialize_memory(Register thread, Register tmp) {
3813  movl(tmp, thread);
3814  shrl(tmp, os::get_serialize_page_shift_count());
3815  andl(tmp, (os::vm_page_size() - sizeof(int)));
3816
3817  Address index(noreg, tmp, Address::times_1);
3818  ExternalAddress page(os::get_memory_serialize_page());
3819
3820  // Size of store must match masking code above
3821  movl(as_Address(ArrayAddress(page, index)), tmp);
3822}
3823
3824// Calls to C land
3825//
3826// When entering C land, the rbp & rsp of the last Java frame have to be recorded
3827// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
3828// has to be reset to 0. This is required to allow proper stack traversal.
3829void MacroAssembler::set_last_Java_frame(Register java_thread,
3830                                         Register last_java_sp,
3831                                         Register last_java_fp,
3832                                         address  last_java_pc) {
3833  // determine java_thread register
3834  if (!java_thread->is_valid()) {
3835    java_thread = rdi;
3836    get_thread(java_thread);
3837  }
3838  // determine last_java_sp register
3839  if (!last_java_sp->is_valid()) {
3840    last_java_sp = rsp;
3841  }
3842
3843  // last_java_fp is optional
3844
3845  if (last_java_fp->is_valid()) {
3846    movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
3847  }
3848
3849  // last_java_pc is optional
3850
3851  if (last_java_pc != NULL) {
3852    lea(Address(java_thread,
3853                 JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()),
3854        InternalAddress(last_java_pc));
3855
3856  }
3857  movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
3858}
3859
3860void MacroAssembler::shlptr(Register dst, int imm8) {
3861  LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8));
3862}
3863
3864void MacroAssembler::shrptr(Register dst, int imm8) {
3865  LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8));
3866}
3867
3868void MacroAssembler::sign_extend_byte(Register reg) {
3869  if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) {
3870    movsbl(reg, reg); // movsxb
3871  } else {
3872    shll(reg, 24);
3873    sarl(reg, 24);
3874  }
3875}
3876
3877void MacroAssembler::sign_extend_short(Register reg) {
3878  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3879    movswl(reg, reg); // movsxw
3880  } else {
3881    shll(reg, 16);
3882    sarl(reg, 16);
3883  }
3884}
3885
3886void MacroAssembler::testl(Register dst, AddressLiteral src) {
3887  assert(reachable(src), "Address should be reachable");
3888  testl(dst, as_Address(src));
3889}
3890
3891void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
3892  if (reachable(src)) {
3893    Assembler::sqrtsd(dst, as_Address(src));
3894  } else {
3895    lea(rscratch1, src);
3896    Assembler::sqrtsd(dst, Address(rscratch1, 0));
3897  }
3898}
3899
3900void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
3901  if (reachable(src)) {
3902    Assembler::sqrtss(dst, as_Address(src));
3903  } else {
3904    lea(rscratch1, src);
3905    Assembler::sqrtss(dst, Address(rscratch1, 0));
3906  }
3907}
3908
3909void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src) {
3910  if (reachable(src)) {
3911    Assembler::subsd(dst, as_Address(src));
3912  } else {
3913    lea(rscratch1, src);
3914    Assembler::subsd(dst, Address(rscratch1, 0));
3915  }
3916}
3917
3918void MacroAssembler::subss(XMMRegister dst, AddressLiteral src) {
3919  if (reachable(src)) {
3920    Assembler::subss(dst, as_Address(src));
3921  } else {
3922    lea(rscratch1, src);
3923    Assembler::subss(dst, Address(rscratch1, 0));
3924  }
3925}
3926
3927void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
3928  if (reachable(src)) {
3929    Assembler::ucomisd(dst, as_Address(src));
3930  } else {
3931    lea(rscratch1, src);
3932    Assembler::ucomisd(dst, Address(rscratch1, 0));
3933  }
3934}
3935
3936void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
3937  if (reachable(src)) {
3938    Assembler::ucomiss(dst, as_Address(src));
3939  } else {
3940    lea(rscratch1, src);
3941    Assembler::ucomiss(dst, Address(rscratch1, 0));
3942  }
3943}
3944
3945void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src) {
3946  // Used in sign-bit flipping with aligned address.
3947  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3948  if (reachable(src)) {
3949    Assembler::xorpd(dst, as_Address(src));
3950  } else {
3951    lea(rscratch1, src);
3952    Assembler::xorpd(dst, Address(rscratch1, 0));
3953  }
3954}
3955
3956void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src) {
3957  // Used in sign-bit flipping with aligned address.
3958  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3959  if (reachable(src)) {
3960    Assembler::xorps(dst, as_Address(src));
3961  } else {
3962    lea(rscratch1, src);
3963    Assembler::xorps(dst, Address(rscratch1, 0));
3964  }
3965}
3966
3967void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) {
3968  // Used in sign-bit flipping with aligned address.
3969  bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
3970  assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
3971  if (reachable(src)) {
3972    Assembler::pshufb(dst, as_Address(src));
3973  } else {
3974    lea(rscratch1, src);
3975    Assembler::pshufb(dst, Address(rscratch1, 0));
3976  }
3977}
3978
3979// AVX 3-operands instructions
3980
3981void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3982  if (reachable(src)) {
3983    vaddsd(dst, nds, as_Address(src));
3984  } else {
3985    lea(rscratch1, src);
3986    vaddsd(dst, nds, Address(rscratch1, 0));
3987  }
3988}
3989
3990void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3991  if (reachable(src)) {
3992    vaddss(dst, nds, as_Address(src));
3993  } else {
3994    lea(rscratch1, src);
3995    vaddss(dst, nds, Address(rscratch1, 0));
3996  }
3997}
3998
3999void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
4000  if (reachable(src)) {
4001    vandpd(dst, nds, as_Address(src), vector256);
4002  } else {
4003    lea(rscratch1, src);
4004    vandpd(dst, nds, Address(rscratch1, 0), vector256);
4005  }
4006}
4007
4008void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
4009  if (reachable(src)) {
4010    vandps(dst, nds, as_Address(src), vector256);
4011  } else {
4012    lea(rscratch1, src);
4013    vandps(dst, nds, Address(rscratch1, 0), vector256);
4014  }
4015}
4016
4017void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4018  if (reachable(src)) {
4019    vdivsd(dst, nds, as_Address(src));
4020  } else {
4021    lea(rscratch1, src);
4022    vdivsd(dst, nds, Address(rscratch1, 0));
4023  }
4024}
4025
4026void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4027  if (reachable(src)) {
4028    vdivss(dst, nds, as_Address(src));
4029  } else {
4030    lea(rscratch1, src);
4031    vdivss(dst, nds, Address(rscratch1, 0));
4032  }
4033}
4034
4035void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4036  if (reachable(src)) {
4037    vmulsd(dst, nds, as_Address(src));
4038  } else {
4039    lea(rscratch1, src);
4040    vmulsd(dst, nds, Address(rscratch1, 0));
4041  }
4042}
4043
4044void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4045  if (reachable(src)) {
4046    vmulss(dst, nds, as_Address(src));
4047  } else {
4048    lea(rscratch1, src);
4049    vmulss(dst, nds, Address(rscratch1, 0));
4050  }
4051}
4052
4053void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4054  if (reachable(src)) {
4055    vsubsd(dst, nds, as_Address(src));
4056  } else {
4057    lea(rscratch1, src);
4058    vsubsd(dst, nds, Address(rscratch1, 0));
4059  }
4060}
4061
4062void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4063  if (reachable(src)) {
4064    vsubss(dst, nds, as_Address(src));
4065  } else {
4066    lea(rscratch1, src);
4067    vsubss(dst, nds, Address(rscratch1, 0));
4068  }
4069}
4070
4071void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
4072  if (reachable(src)) {
4073    vxorpd(dst, nds, as_Address(src), vector256);
4074  } else {
4075    lea(rscratch1, src);
4076    vxorpd(dst, nds, Address(rscratch1, 0), vector256);
4077  }
4078}
4079
4080void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
4081  if (reachable(src)) {
4082    vxorps(dst, nds, as_Address(src), vector256);
4083  } else {
4084    lea(rscratch1, src);
4085    vxorps(dst, nds, Address(rscratch1, 0), vector256);
4086  }
4087}
4088
4089
4090//////////////////////////////////////////////////////////////////////////////////
4091#if INCLUDE_ALL_GCS
4092
4093void MacroAssembler::g1_write_barrier_pre(Register obj,
4094                                          Register pre_val,
4095                                          Register thread,
4096                                          Register tmp,
4097                                          bool tosca_live,
4098                                          bool expand_call) {
4099
4100  // If expand_call is true then we expand the call_VM_leaf macro
4101  // directly to skip generating the check by
4102  // InterpreterMacroAssembler::call_VM_leaf_base that checks _last_sp.
4103
4104#ifdef _LP64
4105  assert(thread == r15_thread, "must be");
4106#endif // _LP64
4107
4108  Label done;
4109  Label runtime;
4110
4111  assert(pre_val != noreg, "check this code");
4112
4113  if (obj != noreg) {
4114    assert_different_registers(obj, pre_val, tmp);
4115    assert(pre_val != rax, "check this code");
4116  }
4117
4118  Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
4119                                       PtrQueue::byte_offset_of_active()));
4120  Address index(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
4121                                       PtrQueue::byte_offset_of_index()));
4122  Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
4123                                       PtrQueue::byte_offset_of_buf()));
4124
4125
4126  // Is marking active?
4127  if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
4128    cmpl(in_progress, 0);
4129  } else {
4130    assert(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
4131    cmpb(in_progress, 0);
4132  }
4133  jcc(Assembler::equal, done);
4134
4135  // Do we need to load the previous value?
4136  if (obj != noreg) {
4137    load_heap_oop(pre_val, Address(obj, 0));
4138  }
4139
4140  // Is the previous value null?
4141  cmpptr(pre_val, (int32_t) NULL_WORD);
4142  jcc(Assembler::equal, done);
4143
4144  // Can we store original value in the thread's buffer?
4145  // Is index == 0?
4146  // (The index field is typed as size_t.)
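  // Roughly, the enqueue below is (sketch only):
  //   if (index == 0) goto runtime;            // buffer is full
  //   index -= wordSize;
  //   *(buf + index) = pre_val;                // record the previous value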
4147
4148  movptr(tmp, index);                   // tmp := *index_adr
4149  cmpptr(tmp, 0);                       // tmp == 0?
4150  jcc(Assembler::equal, runtime);       // If yes, goto runtime
4151
4152  subptr(tmp, wordSize);                // tmp := tmp - wordSize
4153  movptr(index, tmp);                   // *index_adr := tmp
4154  addptr(tmp, buffer);                  // tmp := tmp + *buffer_adr
4155
4156  // Record the previous value
4157  movptr(Address(tmp, 0), pre_val);
4158  jmp(done);
4159
4160  bind(runtime);
4161  // save the live input values
4162  if(tosca_live) push(rax);
4163
4164  if (obj != noreg && obj != rax)
4165    push(obj);
4166
4167  if (pre_val != rax)
4168    push(pre_val);
4169
4170  // Calling the runtime using the regular call_VM_leaf mechanism generates
4171  // code (generated by InterpreterMacroAssembler::call_VM_leaf_base)
4172  // that checks that the *(ebp+frame::interpreter_frame_last_sp) == NULL.
4173  //
4174  // If we are generating the pre-barrier without a frame (e.g. in the
4175  // intrinsified Reference.get() routine) then ebp might be pointing to
4176  // the caller frame and so this check will most likely fail at runtime.
4177  //
4178  // Expanding the call directly bypasses the generation of the check.
4179  // So when we do not have a full interpreter frame on the stack
4180  // expand_call should be passed true.
4181
4182  NOT_LP64( push(thread); )
4183
4184  if (expand_call) {
4185    LP64_ONLY( assert(pre_val != c_rarg1, "smashed arg"); )
4186    pass_arg1(this, thread);
4187    pass_arg0(this, pre_val);
4188    MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), 2);
4189  } else {
4190    call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), pre_val, thread);
4191  }
4192
4193  NOT_LP64( pop(thread); )
4194
4195  // save the live input values
4196  if (pre_val != rax)
4197    pop(pre_val);
4198
4199  if (obj != noreg && obj != rax)
4200    pop(obj);
4201
4202  if(tosca_live) pop(rax);
4203
4204  bind(done);
4205}
4206
4207void MacroAssembler::g1_write_barrier_post(Register store_addr,
4208                                           Register new_val,
4209                                           Register thread,
4210                                           Register tmp,
4211                                           Register tmp2) {
4212#ifdef _LP64
4213  assert(thread == r15_thread, "must be");
4214#endif // _LP64
4215
4216  Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
4217                                       PtrQueue::byte_offset_of_index()));
4218  Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
4219                                       PtrQueue::byte_offset_of_buf()));
4220
4221  BarrierSet* bs = Universe::heap()->barrier_set();
4222  CardTableModRefBS* ct = (CardTableModRefBS*)bs;
4223  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
4224
4225  Label done;
4226  Label runtime;
4227
4228  // Does store cross heap regions?
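  // (store_addr XOR new_val) keeps only the bits in which the two addresses
  // differ; if both lie in the same HeapRegion those differences are confined
  // to the low LogOfHRGrainBytes bits, so the shift below leaves zero.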
4229
4230  movptr(tmp, store_addr);
4231  xorptr(tmp, new_val);
4232  shrptr(tmp, HeapRegion::LogOfHRGrainBytes);
4233  jcc(Assembler::equal, done);
4234
4235  // crosses regions, storing NULL?
4236
4237  cmpptr(new_val, (int32_t) NULL_WORD);
4238  jcc(Assembler::equal, done);
4239
4240  // storing region crossing non-NULL, is card already dirty?
4241
4242  const Register card_addr = tmp;
4243  const Register cardtable = tmp2;
4244
4245  movptr(card_addr, store_addr);
4246  shrptr(card_addr, CardTableModRefBS::card_shift);
4247  // Do not use ExternalAddress to load 'byte_map_base', since 'byte_map_base' is NOT
4248  // a valid address and therefore is not properly handled by the relocation code.
4249  movptr(cardtable, (intptr_t)ct->byte_map_base);
4250  addptr(card_addr, cardtable);
4251
4252  cmpb(Address(card_addr, 0), (int)G1SATBCardTableModRefBS::g1_young_card_val());
4253  jcc(Assembler::equal, done);
4254
4255  membar(Assembler::Membar_mask_bits(Assembler::StoreLoad));
4256  cmpb(Address(card_addr, 0), (int)CardTableModRefBS::dirty_card_val());
4257  jcc(Assembler::equal, done);
4258
4259
4260  // storing a region crossing, non-NULL oop, card is clean.
4261  // dirty card and log.
4262
4263  movb(Address(card_addr, 0), (int)CardTableModRefBS::dirty_card_val());
4264
4265  cmpl(queue_index, 0);
4266  jcc(Assembler::equal, runtime);
4267  subl(queue_index, wordSize);
4268  movptr(tmp2, buffer);
4269#ifdef _LP64
4270  movslq(rscratch1, queue_index);
4271  addq(tmp2, rscratch1);
4272  movq(Address(tmp2, 0), card_addr);
4273#else
4274  addl(tmp2, queue_index);
4275  movl(Address(tmp2, 0), card_addr);
4276#endif
4277  jmp(done);
4278
4279  bind(runtime);
4280  // save the live input values
4281  push(store_addr);
4282  push(new_val);
4283#ifdef _LP64
4284  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, r15_thread);
4285#else
4286  push(thread);
4287  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread);
4288  pop(thread);
4289#endif
4290  pop(new_val);
4291  pop(store_addr);
4292
4293  bind(done);
4294}
4295
4296#endif // INCLUDE_ALL_GCS
4297//////////////////////////////////////////////////////////////////////////////////
4298
4299
4300void MacroAssembler::store_check(Register obj) {
4301  // Does a store check for the oop in register obj. The content of
4302  // register obj is destroyed afterwards.
4303  store_check_part_1(obj);
4304  store_check_part_2(obj);
4305}
4306
4307void MacroAssembler::store_check(Register obj, Address dst) {
4308  store_check(obj);
4309}
4310
4311
4312// split the store check operation so that other instructions can be scheduled in between
4313void MacroAssembler::store_check_part_1(Register obj) {
4314  BarrierSet* bs = Universe::heap()->barrier_set();
4315  assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
4316  shrptr(obj, CardTableModRefBS::card_shift);
4317}
4318
4319void MacroAssembler::store_check_part_2(Register obj) {
4320  BarrierSet* bs = Universe::heap()->barrier_set();
4321  assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
4322  CardTableModRefBS* ct = (CardTableModRefBS*)bs;
4323  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
4324
4325  // The calculation for byte_map_base is as follows:
4326  // byte_map_base = _byte_map - (uintptr_t(low_bound) >> card_shift);
4327  // So this essentially converts an address to a displacement and it will
4328  // never need to be relocated. On 64bit however the value may be too
4329  // large for a 32bit displacement.
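  // Illustration (assuming the usual 512-byte cards, card_shift == 9): a store
  // into address A dirties the byte at byte_map_base + (A >> 9); folding
  // low_bound into byte_map_base is what lets (obj >> card_shift) be used
  // directly as the index below.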
4330  intptr_t disp = (intptr_t) ct->byte_map_base;
4331  if (is_simm32(disp)) {
4332    Address cardtable(noreg, obj, Address::times_1, disp);
4333    movb(cardtable, 0);
4334  } else {
4335    // By doing it as an ExternalAddress 'disp' could be converted to a rip-relative
4336    // displacement and done in a single instruction given favorable mapping and a
4337    // smarter version of as_Address. However, 'ExternalAddress' generates a relocation
4338    // entry and that entry is not properly handled by the relocation code.
4339    AddressLiteral cardtable((address)ct->byte_map_base, relocInfo::none);
4340    Address index(noreg, obj, Address::times_1);
4341    movb(as_Address(ArrayAddress(cardtable, index)), 0);
4342  }
4343}
4344
4345void MacroAssembler::subptr(Register dst, int32_t imm32) {
4346  LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
4347}
4348
4349// Force generation of a 4 byte immediate value even if it fits into 8bit
4350void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
4351  LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32));
4352}
4353
4354void MacroAssembler::subptr(Register dst, Register src) {
4355  LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
4356}
4357
4358// C++ bool manipulation
4359void MacroAssembler::testbool(Register dst) {
4360  if(sizeof(bool) == 1)
4361    testb(dst, 0xff);
4362  else if(sizeof(bool) == 2) {
4363    // testw implementation needed for two byte bools
4364    ShouldNotReachHere();
4365  } else if(sizeof(bool) == 4)
4366    testl(dst, dst);
4367  else
4368    // unsupported
4369    ShouldNotReachHere();
4370}
4371
4372void MacroAssembler::testptr(Register dst, Register src) {
4373  LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
4374}
4375
4376// Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4377void MacroAssembler::tlab_allocate(Register obj,
4378                                   Register var_size_in_bytes,
4379                                   int con_size_in_bytes,
4380                                   Register t1,
4381                                   Register t2,
4382                                   Label& slow_case) {
4383  assert_different_registers(obj, t1, t2);
4384  assert_different_registers(obj, var_size_in_bytes, t1);
4385  Register end = t2;
4386  Register thread = NOT_LP64(t1) LP64_ONLY(r15_thread);
4387
4388  verify_tlab();
4389
4390  NOT_LP64(get_thread(thread));
4391
4392  movptr(obj, Address(thread, JavaThread::tlab_top_offset()));
4393  if (var_size_in_bytes == noreg) {
4394    lea(end, Address(obj, con_size_in_bytes));
4395  } else {
4396    lea(end, Address(obj, var_size_in_bytes, Address::times_1));
4397  }
4398  cmpptr(end, Address(thread, JavaThread::tlab_end_offset()));
4399  jcc(Assembler::above, slow_case);
4400
4401  // update the tlab top pointer
4402  movptr(Address(thread, JavaThread::tlab_top_offset()), end);
4403
4404  // recover var_size_in_bytes if necessary
4405  if (var_size_in_bytes == end) {
4406    subptr(var_size_in_bytes, obj);
4407  }
4408  verify_tlab();
4409}
4410
4411// Preserves rbx, and rdx.
4412Register MacroAssembler::tlab_refill(Label& retry,
4413                                     Label& try_eden,
4414                                     Label& slow_case) {
4415  Register top = rax;
4416  Register t1  = rcx;
4417  Register t2  = rsi;
4418  Register thread_reg = NOT_LP64(rdi) LP64_ONLY(r15_thread);
4419  assert_different_registers(top, thread_reg, t1, t2, /* preserve: */ rbx, rdx);
4420  Label do_refill, discard_tlab;
4421
4422  if (!Universe::heap()->supports_inline_contig_alloc()) {
4423    // No allocation in the shared eden.
4424    jmp(slow_case);
4425  }
4426
4427  NOT_LP64(get_thread(thread_reg));
4428
4429  movptr(top, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
4430  movptr(t1,  Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
4431
4432  // calculate amount of free space
4433  subptr(t1, top);
4434  shrptr(t1, LogHeapWordSize);
4435
4436  // Retain tlab and allocate object in shared space if
4437  // the amount free in the tlab is too large to discard.
4438  cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
4439  jcc(Assembler::lessEqual, discard_tlab);
4440
4441  // Retain
4442  // %%% yuck as movptr...
4443  movptr(t2, (int32_t) ThreadLocalAllocBuffer::refill_waste_limit_increment());
4444  addptr(Address(thread_reg, in_bytes(JavaThread::tlab_refill_waste_limit_offset())), t2);
4445  if (TLABStats) {
4446    // increment number of slow_allocations
4447    addl(Address(thread_reg, in_bytes(JavaThread::tlab_slow_allocations_offset())), 1);
4448  }
4449  jmp(try_eden);
4450
4451  bind(discard_tlab);
4452  if (TLABStats) {
4453    // increment number of refills
4454    addl(Address(thread_reg, in_bytes(JavaThread::tlab_number_of_refills_offset())), 1);
4455    // accumulate wastage -- t1 is amount free in tlab
4456    addl(Address(thread_reg, in_bytes(JavaThread::tlab_fast_refill_waste_offset())), t1);
4457  }
4458
4459  // if tlab is currently allocated (top or end != null) then
4460  // fill [top, end + alignment_reserve) with array object
4461  testptr(top, top);
4462  jcc(Assembler::zero, do_refill);
4463
4464  // set up the mark word
4465  movptr(Address(top, oopDesc::mark_offset_in_bytes()), (intptr_t)markOopDesc::prototype()->copy_set_hash(0x2));
4466  // set the length to the remaining space
4467  subptr(t1, typeArrayOopDesc::header_size(T_INT));
4468  addptr(t1, (int32_t)ThreadLocalAllocBuffer::alignment_reserve());
4469  shlptr(t1, log2_intptr(HeapWordSize/sizeof(jint)));
4470  movl(Address(top, arrayOopDesc::length_offset_in_bytes()), t1);
4471  // set klass to intArrayKlass
4472  // dubious reloc why not an oop reloc?
4473  movptr(t1, ExternalAddress((address)Universe::intArrayKlassObj_addr()));
4474  // store klass last.  Concurrent GCs assume the length is valid if the
4475  // klass field is not null.
4476  store_klass(top, t1);
4477
4478  movptr(t1, top);
4479  subptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
4480  incr_allocated_bytes(thread_reg, t1, 0);
4481
4482  // refill the tlab with an eden allocation
4483  bind(do_refill);
4484  movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_size_offset())));
4485  shlptr(t1, LogHeapWordSize);
4486  // allocate new tlab, address returned in top
4487  eden_allocate(top, t1, 0, t2, slow_case);
4488
4489  // Check that t1 was preserved in eden_allocate.
4490#ifdef ASSERT
4491  if (UseTLAB) {
4492    Label ok;
4493    Register tsize = rsi;
4494    assert_different_registers(tsize, thread_reg, t1);
4495    push(tsize);
4496    movptr(tsize, Address(thread_reg, in_bytes(JavaThread::tlab_size_offset())));
4497    shlptr(tsize, LogHeapWordSize);
4498    cmpptr(t1, tsize);
4499    jcc(Assembler::equal, ok);
4500    STOP("assert(t1 != tlab size)");
4501    should_not_reach_here();
4502
4503    bind(ok);
4504    pop(tsize);
4505  }
4506#endif
4507  movptr(Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())), top);
4508  movptr(Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())), top);
4509  addptr(top, t1);
4510  subptr(top, (int32_t)ThreadLocalAllocBuffer::alignment_reserve_in_bytes());
4511  movptr(Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())), top);
4512  verify_tlab();
4513  jmp(retry);
4514
4515  return thread_reg; // for use by caller
4516}
4517
4518void MacroAssembler::incr_allocated_bytes(Register thread,
4519                                          Register var_size_in_bytes,
4520                                          int con_size_in_bytes,
4521                                          Register t1) {
4522  if (!thread->is_valid()) {
4523#ifdef _LP64
4524    thread = r15_thread;
4525#else
4526    assert(t1->is_valid(), "need temp reg");
4527    thread = t1;
4528    get_thread(thread);
4529#endif
4530  }
4531
4532#ifdef _LP64
4533  if (var_size_in_bytes->is_valid()) {
4534    addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
4535  } else {
4536    addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
4537  }
4538#else
4539  if (var_size_in_bytes->is_valid()) {
4540    addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
4541  } else {
4542    addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
4543  }
4544  adcl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())+4), 0);
4545#endif
4546}
4547
4548void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int num_fpu_regs_in_use) {
4549  pusha();
4550
4551  // If we are coming from C1, XMM registers may be live.
4552  int off = 0;
4553  if (UseSSE == 1)  {
4554    subptr(rsp, sizeof(jdouble)*8);
4555    movflt(Address(rsp,off++*sizeof(jdouble)),xmm0);
4556    movflt(Address(rsp,off++*sizeof(jdouble)),xmm1);
4557    movflt(Address(rsp,off++*sizeof(jdouble)),xmm2);
4558    movflt(Address(rsp,off++*sizeof(jdouble)),xmm3);
4559    movflt(Address(rsp,off++*sizeof(jdouble)),xmm4);
4560    movflt(Address(rsp,off++*sizeof(jdouble)),xmm5);
4561    movflt(Address(rsp,off++*sizeof(jdouble)),xmm6);
4562    movflt(Address(rsp,off++*sizeof(jdouble)),xmm7);
4563  } else if (UseSSE >= 2)  {
4564#ifdef COMPILER2
4565    if (MaxVectorSize > 16) {
4566      assert(UseAVX > 0, "256bit vectors are supported only with AVX");
4567      // Save the upper halves of the YMM registers
4568      subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
4569      vextractf128h(Address(rsp,  0),xmm0);
4570      vextractf128h(Address(rsp, 16),xmm1);
4571      vextractf128h(Address(rsp, 32),xmm2);
4572      vextractf128h(Address(rsp, 48),xmm3);
4573      vextractf128h(Address(rsp, 64),xmm4);
4574      vextractf128h(Address(rsp, 80),xmm5);
4575      vextractf128h(Address(rsp, 96),xmm6);
4576      vextractf128h(Address(rsp,112),xmm7);
4577#ifdef _LP64
4578      vextractf128h(Address(rsp,128),xmm8);
4579      vextractf128h(Address(rsp,144),xmm9);
4580      vextractf128h(Address(rsp,160),xmm10);
4581      vextractf128h(Address(rsp,176),xmm11);
4582      vextractf128h(Address(rsp,192),xmm12);
4583      vextractf128h(Address(rsp,208),xmm13);
4584      vextractf128h(Address(rsp,224),xmm14);
4585      vextractf128h(Address(rsp,240),xmm15);
4586#endif
4587    }
4588#endif
4589    // Save the whole 128-bit (16-byte) XMM registers
4590    subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
4591    movdqu(Address(rsp,off++*16),xmm0);
4592    movdqu(Address(rsp,off++*16),xmm1);
4593    movdqu(Address(rsp,off++*16),xmm2);
4594    movdqu(Address(rsp,off++*16),xmm3);
4595    movdqu(Address(rsp,off++*16),xmm4);
4596    movdqu(Address(rsp,off++*16),xmm5);
4597    movdqu(Address(rsp,off++*16),xmm6);
4598    movdqu(Address(rsp,off++*16),xmm7);
4599#ifdef _LP64
4600    movdqu(Address(rsp,off++*16),xmm8);
4601    movdqu(Address(rsp,off++*16),xmm9);
4602    movdqu(Address(rsp,off++*16),xmm10);
4603    movdqu(Address(rsp,off++*16),xmm11);
4604    movdqu(Address(rsp,off++*16),xmm12);
4605    movdqu(Address(rsp,off++*16),xmm13);
4606    movdqu(Address(rsp,off++*16),xmm14);
4607    movdqu(Address(rsp,off++*16),xmm15);
4608#endif
4609  }
4610
4611  // Preserve registers across runtime call
4612  int incoming_argument_and_return_value_offset = -1;
4613  if (num_fpu_regs_in_use > 1) {
4614    // Must preserve all other FPU regs (could alternatively convert
4615    // SharedRuntime::dsin, dcos etc. into assembly routines known not to trash
4616    // FPU state, but we cannot trust the C compiler)
4617    NEEDS_CLEANUP;
4618    // NOTE that in this case we also push the incoming argument(s) to
4619    // the stack and restore it later; we also use this stack slot to
4620    // hold the return value from dsin, dcos etc.
4621    for (int i = 0; i < num_fpu_regs_in_use; i++) {
4622      subptr(rsp, sizeof(jdouble));
4623      fstp_d(Address(rsp, 0));
4624    }
4625    incoming_argument_and_return_value_offset = sizeof(jdouble)*(num_fpu_regs_in_use-1);
4626    for (int i = nb_args-1; i >= 0; i--) {
4627      fld_d(Address(rsp, incoming_argument_and_return_value_offset-i*sizeof(jdouble)));
4628    }
4629  }
4630
4631  subptr(rsp, nb_args*sizeof(jdouble));
4632  for (int i = 0; i < nb_args; i++) {
4633    fstp_d(Address(rsp, i*sizeof(jdouble)));
4634  }
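  // Rough stack layout built above (a sketch; lowest address first, i.e. from rsp upward):
  //   [rsp + 0, rsp + nb_args*sizeof(jdouble))  copies of the x87 arguments for the callee
  //   above that (if num_fpu_regs_in_use > 1)   the spilled x87 register stack
  //   above that (if UseSSE > 0)                the saved XMM (and YMM upper half) state
  //   above that                                the pusha() frame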
4635
4636#ifdef _LP64
4637  if (nb_args > 0) {
4638    movdbl(xmm0, Address(rsp, 0));
4639  }
4640  if (nb_args > 1) {
4641    movdbl(xmm1, Address(rsp, sizeof(jdouble)));
4642  }
4643  assert(nb_args <= 2, "unsupported number of args");
4644#endif // _LP64
4645
4646  // NOTE: we must not use call_VM_leaf here because that requires a
4647  // complete interpreter frame in debug mode -- same bug as 4387334
4648  // MacroAssembler::call_VM_leaf_base is perfectly safe and will
4649  // follow the proper 64-bit ABI
4650
4651  NEEDS_CLEANUP;
4652  // Need to add stack banging before this runtime call if it needs to
4653  // be taken; however, there is no generic stack banging routine at
4654  // the MacroAssembler level
4655
4656  MacroAssembler::call_VM_leaf_base(runtime_entry, 0);
4657
4658#ifdef _LP64
4659  movsd(Address(rsp, 0), xmm0);
4660  fld_d(Address(rsp, 0));
4661#endif // _LP64
4662  addptr(rsp, sizeof(jdouble) * nb_args);
4663  if (num_fpu_regs_in_use > 1) {
4664    // Must save return value to stack and then restore entire FPU
4665    // stack except incoming arguments
4666    fstp_d(Address(rsp, incoming_argument_and_return_value_offset));
4667    for (int i = 0; i < num_fpu_regs_in_use - nb_args; i++) {
4668      fld_d(Address(rsp, 0));
4669      addptr(rsp, sizeof(jdouble));
4670    }
4671    fld_d(Address(rsp, (nb_args-1)*sizeof(jdouble)));
4672    addptr(rsp, sizeof(jdouble) * nb_args);
4673  }
4674
4675  off = 0;
4676  if (UseSSE == 1)  {
4677    movflt(xmm0, Address(rsp,off++*sizeof(jdouble)));
4678    movflt(xmm1, Address(rsp,off++*sizeof(jdouble)));
4679    movflt(xmm2, Address(rsp,off++*sizeof(jdouble)));
4680    movflt(xmm3, Address(rsp,off++*sizeof(jdouble)));
4681    movflt(xmm4, Address(rsp,off++*sizeof(jdouble)));
4682    movflt(xmm5, Address(rsp,off++*sizeof(jdouble)));
4683    movflt(xmm6, Address(rsp,off++*sizeof(jdouble)));
4684    movflt(xmm7, Address(rsp,off++*sizeof(jdouble)));
4685    addptr(rsp, sizeof(jdouble)*8);
4686  } else if (UseSSE >= 2)  {
4687    // Restore the whole 128-bit (16-byte) XMM registers
4688    movdqu(xmm0, Address(rsp,off++*16));
4689    movdqu(xmm1, Address(rsp,off++*16));
4690    movdqu(xmm2, Address(rsp,off++*16));
4691    movdqu(xmm3, Address(rsp,off++*16));
4692    movdqu(xmm4, Address(rsp,off++*16));
4693    movdqu(xmm5, Address(rsp,off++*16));
4694    movdqu(xmm6, Address(rsp,off++*16));
4695    movdqu(xmm7, Address(rsp,off++*16));
4696#ifdef _LP64
4697    movdqu(xmm8, Address(rsp,off++*16));
4698    movdqu(xmm9, Address(rsp,off++*16));
4699    movdqu(xmm10, Address(rsp,off++*16));
4700    movdqu(xmm11, Address(rsp,off++*16));
4701    movdqu(xmm12, Address(rsp,off++*16));
4702    movdqu(xmm13, Address(rsp,off++*16));
4703    movdqu(xmm14, Address(rsp,off++*16));
4704    movdqu(xmm15, Address(rsp,off++*16));
4705#endif
4706    addptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
4707#ifdef COMPILER2
4708    if (MaxVectorSize > 16) {
4709      // Restore the upper halves of the YMM registers.
4710      vinsertf128h(xmm0, Address(rsp,  0));
4711      vinsertf128h(xmm1, Address(rsp, 16));
4712      vinsertf128h(xmm2, Address(rsp, 32));
4713      vinsertf128h(xmm3, Address(rsp, 48));
4714      vinsertf128h(xmm4, Address(rsp, 64));
4715      vinsertf128h(xmm5, Address(rsp, 80));
4716      vinsertf128h(xmm6, Address(rsp, 96));
4717      vinsertf128h(xmm7, Address(rsp,112));
4718#ifdef _LP64
4719      vinsertf128h(xmm8, Address(rsp,128));
4720      vinsertf128h(xmm9, Address(rsp,144));
4721      vinsertf128h(xmm10, Address(rsp,160));
4722      vinsertf128h(xmm11, Address(rsp,176));
4723      vinsertf128h(xmm12, Address(rsp,192));
4724      vinsertf128h(xmm13, Address(rsp,208));
4725      vinsertf128h(xmm14, Address(rsp,224));
4726      vinsertf128h(xmm15, Address(rsp,240));
4727#endif
4728      addptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
4729    }
4730#endif
4731  }
4732  popa();
4733}
4734
4735static const double     pi_4 =  0.7853981633974483;
4736
4737void MacroAssembler::trigfunc(char trig, int num_fpu_regs_in_use) {
4738  // A hand-coded argument reduction for values whose absolute value lies in (pi/4, pi/2)
4739  // was attempted in this code; unfortunately it appears that the
4740  // switch to 80-bit precision and back causes this to be
4741  // unprofitable compared with simply performing a runtime call if
4742  // the argument is out of the (-pi/4, pi/4) range.
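  // Overall dispatch (a sketch of the intent, not the emitted code):
  //   if (fabs(x) <= pi/4)  use the hardware fsin/fcos/ftan below;
  //   else                  fall back to SharedRuntime::dsin/dcos/dtan via fp_runtime_fallback.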
4743
4744  Register tmp = noreg;
4745  if (!VM_Version::supports_cmov()) {
4746    // fcmp needs a temporary register, so preserve rbx
4747    tmp = rbx;
4748    push(tmp);
4749  }
4750
4751  Label slow_case, done;
4752
4753  ExternalAddress pi4_adr = (address)&pi_4;
4754  if (reachable(pi4_adr)) {
4755    // x ?<= pi/4
4756    fld_d(pi4_adr);
4757    fld_s(1);                // Stack:  X  PI/4  X
4758    fabs();                  // Stack: |X| PI/4  X
4759    fcmp(tmp);
4760    jcc(Assembler::above, slow_case);
4761
4762    // fastest case: -pi/4 <= x <= pi/4
4763    switch(trig) {
4764    case 's':
4765      fsin();
4766      break;
4767    case 'c':
4768      fcos();
4769      break;
4770    case 't':
4771      ftan();
4772      break;
4773    default:
4774      assert(false, "bad intrinsic");
4775      break;
4776    }
4777    jmp(done);
4778  }
4779
4780  // slow case: runtime call
4781  bind(slow_case);
4782
4783  switch(trig) {
4784  case 's':
4785    {
4786      fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dsin), 1, num_fpu_regs_in_use);
4787    }
4788    break;
4789  case 'c':
4790    {
4791      fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dcos), 1, num_fpu_regs_in_use);
4792    }
4793    break;
4794  case 't':
4795    {
4796      fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dtan), 1, num_fpu_regs_in_use);
4797    }
4798    break;
4799  default:
4800    assert(false, "bad intrinsic");
4801    break;
4802  }
4803
4804  // Come here with result in F-TOS
4805  bind(done);
4806
4807  if (tmp != noreg) {
4808    pop(tmp);
4809  }
4810}
4811
4812
4813// Look up the method for a megamorphic invokeinterface call.
4814// The target method is determined by <intf_klass, itable_index>.
4815// The receiver klass is in recv_klass.
4816// On success, the result will be in method_result, and execution falls through.
4817// On failure, execution transfers to the given label.
4818void MacroAssembler::lookup_interface_method(Register recv_klass,
4819                                             Register intf_klass,
4820                                             RegisterOrConstant itable_index,
4821                                             Register method_result,
4822                                             Register scan_temp,
4823                                             Label& L_no_such_interface) {
4824  assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
4825  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
4826         "caller must use same register for non-constant itable index as for method");
4827
4828  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
4829  int vtable_base = InstanceKlass::vtable_start_offset() * wordSize;
4830  int itentry_off = itableMethodEntry::method_offset_in_bytes();
4831  int scan_step   = itableOffsetEntry::size() * wordSize;
4832  int vte_size    = vtableEntry::size() * wordSize;
4833  Address::ScaleFactor times_vte_scale = Address::times_ptr;
4834  assert(vte_size == wordSize, "else adjust times_vte_scale");
4835
4836  movl(scan_temp, Address(recv_klass, InstanceKlass::vtable_length_offset() * wordSize));
4837
4838  // %%% Could store the aligned, prescaled offset in the klassoop.
4839  lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
4840  if (HeapWordsPerLong > 1) {
4841    // Round up to align_object_offset boundary
4842    // see code for InstanceKlass::start_of_itable!
4843    round_to(scan_temp, BytesPerLong);
4844  }
4845
4846  // Adjust recv_klass by scaled itable_index, so we can free itable_index.
4847  assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
4848  lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
4849
4850  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
4851  //   if (scan->interface() == intf) {
4852  //     result = (klass + scan->offset() + itable_index);
4853  //   }
4854  // }
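  // The loop below is peeled once: the pass with peel == 1 tests the first itable entry
  // inline and jumps to found_method on a hit; the pass with peel == 0 emits the scan loop
  // proper, with the test inverted so that a hit falls through to found_method.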
4855  Label search, found_method;
4856
4857  for (int peel = 1; peel >= 0; peel--) {
4858    movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
4859    cmpptr(intf_klass, method_result);
4860
4861    if (peel) {
4862      jccb(Assembler::equal, found_method);
4863    } else {
4864      jccb(Assembler::notEqual, search);
4865      // (invert the test to fall through to found_method...)
4866    }
4867
4868    if (!peel)  break;
4869
4870    bind(search);
4871
4872    // Check that the previous entry is non-null.  A null entry means that
4873    // the receiver class doesn't implement the interface, and wasn't the
4874    // same as when the caller was compiled.
4875    testptr(method_result, method_result);
4876    jcc(Assembler::zero, L_no_such_interface);
4877    addptr(scan_temp, scan_step);
4878  }
4879
4880  bind(found_method);
4881
4882  // Got a hit.
4883  movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
4884  movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
4885}
4886
4887
4888// virtual method calling
4889void MacroAssembler::lookup_virtual_method(Register recv_klass,
4890                                           RegisterOrConstant vtable_index,
4891                                           Register method_result) {
4892  const int base = InstanceKlass::vtable_start_offset() * wordSize;
4893  assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
4894  Address vtable_entry_addr(recv_klass,
4895                            vtable_index, Address::times_ptr,
4896                            base + vtableEntry::method_offset_in_bytes());
4897  movptr(method_result, vtable_entry_addr);
4898}
4899
4900
4901void MacroAssembler::check_klass_subtype(Register sub_klass,
4902                           Register super_klass,
4903                           Register temp_reg,
4904                           Label& L_success) {
4905  Label L_failure;
4906  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
4907  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
4908  bind(L_failure);
4909}
4910
4911
4912void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
4913                                                   Register super_klass,
4914                                                   Register temp_reg,
4915                                                   Label* L_success,
4916                                                   Label* L_failure,
4917                                                   Label* L_slow_path,
4918                                        RegisterOrConstant super_check_offset) {
4919  assert_different_registers(sub_klass, super_klass, temp_reg);
4920  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
4921  if (super_check_offset.is_register()) {
4922    assert_different_registers(sub_klass, super_klass,
4923                               super_check_offset.as_register());
4924  } else if (must_load_sco) {
4925    assert(temp_reg != noreg, "supply either a temp or a register offset");
4926  }
4927
4928  Label L_fallthrough;
4929  int label_nulls = 0;
4930  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
4931  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
4932  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
4933  assert(label_nulls <= 1, "at most one NULL in the batch");
4934
4935  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4936  int sco_offset = in_bytes(Klass::super_check_offset_offset());
4937  Address super_check_offset_addr(super_klass, sco_offset);
4938
4939  // Hacked jcc, which "knows" that L_fallthrough, at least, is in
4940  // range of a jccb.  If this routine grows larger, reconsider at
4941  // least some of these.
4942#define local_jcc(assembler_cond, label)                                \
4943  if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);         \
4944  else                             jcc( assembler_cond, label) /*omit semi*/
4945
4946  // Hacked jmp, which may only be used just before L_fallthrough.
4947#define final_jmp(label)                                                \
4948  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
4949  else                            jmp(label)                /*omit semi*/
4950
4951  // If the pointers are equal, we are done (e.g., String[] elements).
4952  // This self-check enables sharing of secondary supertype arrays among
4953  // non-primary types such as array-of-interface.  Otherwise, each such
4954  // type would need its own customized SSA.
4955  // We move this check to the front of the fast path because many
4956  // type checks are in fact trivially successful in this manner,
4957  // so we get a nicely predicted branch right at the start of the check.
4958  cmpptr(sub_klass, super_klass);
4959  local_jcc(Assembler::equal, *L_success);
4960
4961  // Check the supertype display:
4962  if (must_load_sco) {
4963    // Positive movl does the right thing on LP64.
4964    movl(temp_reg, super_check_offset_addr);
4965    super_check_offset = RegisterOrConstant(temp_reg);
4966  }
4967  Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
4968  cmpptr(super_klass, super_check_addr); // load displayed supertype
4969
4970  // This check has worked decisively for primary supers.
4971  // Secondary supers are sought in the super_cache ('super_cache_addr').
4972  // (Secondary supers are interfaces and very deeply nested subtypes.)
4973  // This works with the same check above because of a tricky aliasing
4974  // between the super_cache and the primary super display elements.
4975  // (The 'super_check_addr' can address either, as the case requires.)
4976  // Note that the cache is updated below if it does not help us find
4977  // what we need immediately.
4978  // So if it was a primary super, we can just fail immediately.
4979  // Otherwise, it's the slow path for us (no success at this point).
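  // Decision summary (a sketch of the three-way outcome tested below):
  //   if (*(sub_klass + super_check_offset) == super_klass)         -> success
  //   else if (super_check_offset != secondary_super_cache_offset)  -> failure (a primary miss is decisive)
  //   else                                                          -> slow path (scan the secondary supers)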
4980
4981  if (super_check_offset.is_register()) {
4982    local_jcc(Assembler::equal, *L_success);
4983    cmpl(super_check_offset.as_register(), sc_offset);
4984    if (L_failure == &L_fallthrough) {
4985      local_jcc(Assembler::equal, *L_slow_path);
4986    } else {
4987      local_jcc(Assembler::notEqual, *L_failure);
4988      final_jmp(*L_slow_path);
4989    }
4990  } else if (super_check_offset.as_constant() == sc_offset) {
4991    // Need a slow path; fast failure is impossible.
4992    if (L_slow_path == &L_fallthrough) {
4993      local_jcc(Assembler::equal, *L_success);
4994    } else {
4995      local_jcc(Assembler::notEqual, *L_slow_path);
4996      final_jmp(*L_success);
4997    }
4998  } else {
4999    // No slow path; it's a fast decision.
5000    if (L_failure == &L_fallthrough) {
5001      local_jcc(Assembler::equal, *L_success);
5002    } else {
5003      local_jcc(Assembler::notEqual, *L_failure);
5004      final_jmp(*L_success);
5005    }
5006  }
5007
5008  bind(L_fallthrough);
5009
5010#undef local_jcc
5011#undef final_jmp
5012}
5013
5014
5015void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
5016                                                   Register super_klass,
5017                                                   Register temp_reg,
5018                                                   Register temp2_reg,
5019                                                   Label* L_success,
5020                                                   Label* L_failure,
5021                                                   bool set_cond_codes) {
5022  assert_different_registers(sub_klass, super_klass, temp_reg);
5023  if (temp2_reg != noreg)
5024    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
5025#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
5026
5027  Label L_fallthrough;
5028  int label_nulls = 0;
5029  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
5030  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
5031  assert(label_nulls <= 1, "at most one NULL in the batch");
5032
5033  // a couple of useful fields in sub_klass:
5034  int ss_offset = in_bytes(Klass::secondary_supers_offset());
5035  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
5036  Address secondary_supers_addr(sub_klass, ss_offset);
5037  Address super_cache_addr(     sub_klass, sc_offset);
5038
5039  // Do a linear scan of the secondary super-klass chain.
5040  // This code is rarely used, so simplicity is a virtue here.
5041  // The repne_scan instruction uses fixed registers, which we must spill.
5042  // Don't worry too much about pre-existing connections with the input regs.
5043
5044  assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
5045  assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)
5046
5047  // Get super_klass value into rax (even if it was in rdi or rcx).
5048  bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
5049  if (super_klass != rax || UseCompressedOops) {
5050    if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
5051    mov(rax, super_klass);
5052  }
5053  if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
5054  if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }
5055
5056#ifndef PRODUCT
5057  int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
5058  ExternalAddress pst_counter_addr((address) pst_counter);
5059  NOT_LP64(  incrementl(pst_counter_addr) );
5060  LP64_ONLY( lea(rcx, pst_counter_addr) );
5061  LP64_ONLY( incrementl(Address(rcx, 0)) );
5062#endif //PRODUCT
5063
5064  // We will consult the secondary-super array.
5065  movptr(rdi, secondary_supers_addr);
5066  // Load the array length.  (Positive movl does the right thing on LP64.)
5067  movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
5068  // Skip to start of data.
5069  addptr(rdi, Array<Klass*>::base_offset_in_bytes());
5070
5071  // Scan RCX words at [RDI] for an occurrence of RAX.
5072  // Set NZ/Z based on last compare.
5073  // The Z flag will not be set by 'repne' if RCX == 0, since 'repne' itself does
5074  // not change flags; only the repeated scas instruction sets them.
5075  // Set Z = 0 (not equal) before 'repne' to indicate that the class was not found.
5076
5077  testptr(rax, rax); // Set Z = 0
5078  repne_scan();
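  // Roughly what the two instructions above do (a C-like sketch of repne scas):
  //   ZF = 0;
  //   while (rcx != 0) { rcx--; ZF = (*rdi == rax); rdi++; if (ZF) break; }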
5079
5080  // Unspill the temp. registers:
5081  if (pushed_rdi)  pop(rdi);
5082  if (pushed_rcx)  pop(rcx);
5083  if (pushed_rax)  pop(rax);
5084
5085  if (set_cond_codes) {
5086    // Special hack for the AD files:  rdi is guaranteed non-zero.
5087    assert(!pushed_rdi, "rdi must be left non-NULL");
5088    // Also, the condition codes are properly set Z/NZ on succeed/failure.
5089  }
5090
5091  if (L_failure == &L_fallthrough)
5092        jccb(Assembler::notEqual, *L_failure);
5093  else  jcc(Assembler::notEqual, *L_failure);
5094
5095  // Success.  Cache the super we found and proceed in triumph.
5096  movptr(super_cache_addr, super_klass);
5097
5098  if (L_success != &L_fallthrough) {
5099    jmp(*L_success);
5100  }
5101
5102#undef IS_A_TEMP
5103
5104  bind(L_fallthrough);
5105}
5106
5107
5108void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
5109  if (VM_Version::supports_cmov()) {
5110    cmovl(cc, dst, src);
5111  } else {
5112    Label L;
5113    jccb(negate_condition(cc), L);
5114    movl(dst, src);
5115    bind(L);
5116  }
5117}
5118
5119void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
5120  if (VM_Version::supports_cmov()) {
5121    cmovl(cc, dst, src);
5122  } else {
5123    Label L;
5124    jccb(negate_condition(cc), L);
5125    movl(dst, src);
5126    bind(L);
5127  }
5128}
5129
5130void MacroAssembler::verify_oop(Register reg, const char* s) {
5131  if (!VerifyOops) return;
5132
5133  // Pass register number to verify_oop_subroutine
5134  const char* b = NULL;
5135  {
5136    ResourceMark rm;
5137    stringStream ss;
5138    ss.print("verify_oop: %s: %s", reg->name(), s);
5139    b = code_string(ss.as_string());
5140  }
5141  BLOCK_COMMENT("verify_oop {");
5142#ifdef _LP64
5143  push(rscratch1);                    // save r10, trashed by movptr()
5144#endif
5145  push(rax);                          // save rax,
5146  push(reg);                          // pass register argument
5147  ExternalAddress buffer((address) b);
5148  // avoid using pushptr, as it modifies scratch registers
5149  // and our contract is not to modify anything
5150  movptr(rax, buffer.addr());
5151  push(rax);
5152  // call indirectly to solve generation ordering problem
5153  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
5154  call(rax);
5155  // Caller pops the arguments (oop, message) and restores rax, r10
5156  BLOCK_COMMENT("} verify_oop");
5157}
5158
5159
5160RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
5161                                                      Register tmp,
5162                                                      int offset) {
5163  intptr_t value = *delayed_value_addr;
5164  if (value != 0)
5165    return RegisterOrConstant(value + offset);
5166
5167  // load indirectly to solve generation ordering problem
5168  movptr(tmp, ExternalAddress((address) delayed_value_addr));
5169
5170#ifdef ASSERT
5171  { Label L;
5172    testptr(tmp, tmp);
5173    if (WizardMode) {
5174      const char* buf = NULL;
5175      {
5176        ResourceMark rm;
5177        stringStream ss;
5178        ss.print("DelayedValue="INTPTR_FORMAT, delayed_value_addr[1]);
5179        buf = code_string(ss.as_string());
5180      }
5181      jcc(Assembler::notZero, L);
5182      STOP(buf);
5183    } else {
5184      jccb(Assembler::notZero, L);
5185      hlt();
5186    }
5187    bind(L);
5188  }
5189#endif
5190
5191  if (offset != 0)
5192    addptr(tmp, offset);
5193
5194  return RegisterOrConstant(tmp);
5195}
5196
5197
5198Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
5199                                         int extra_slot_offset) {
5200  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
5201  int stackElementSize = Interpreter::stackElementSize;
5202  int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
5203#ifdef ASSERT
5204  int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
5205  assert(offset1 - offset == stackElementSize, "correct arithmetic");
5206#endif
5207  Register             scale_reg    = noreg;
5208  Address::ScaleFactor scale_factor = Address::no_scale;
5209  if (arg_slot.is_constant()) {
5210    offset += arg_slot.as_constant() * stackElementSize;
5211  } else {
5212    scale_reg    = arg_slot.as_register();
5213    scale_factor = Address::times(stackElementSize);
5214  }
5215  offset += wordSize;           // return PC is on stack
5216  return Address(rsp, scale_reg, scale_factor, offset);
5217}
5218
5219
5220void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
5221  if (!VerifyOops) return;
5222
5223  // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord);
5224  // Pass register number to verify_oop_subroutine
5225  const char* b = NULL;
5226  {
5227    ResourceMark rm;
5228    stringStream ss;
5229    ss.print("verify_oop_addr: %s", s);
5230    b = code_string(ss.as_string());
5231  }
5232#ifdef _LP64
5233  push(rscratch1);                    // save r10, trashed by movptr()
5234#endif
5235  push(rax);                          // save rax,
5236  // addr may contain rsp so we will have to adjust it based on the push
5237  // we just did (and on 64 bit we do two pushes)
5238  // NOTE: the 64-bit code seems to have had a bug in that it did movq(addr, rax), which
5239  // stores rax into addr, the reverse of what was intended.
5240  if (addr.uses(rsp)) {
5241    lea(rax, addr);
5242    pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
5243  } else {
5244    pushptr(addr);
5245  }
5246
5247  ExternalAddress buffer((address) b);
5248  // pass msg argument
5249  // avoid using pushptr, as it modifies scratch registers
5250  // and our contract is not to modify anything
5251  movptr(rax, buffer.addr());
5252  push(rax);
5253
5254  // call indirectly to solve generation ordering problem
5255  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
5256  call(rax);
5257  // Caller pops the arguments (addr, message) and restores rax, r10.
5258}
5259
5260void MacroAssembler::verify_tlab() {
5261#ifdef ASSERT
5262  if (UseTLAB && VerifyOops) {
5263    Label next, ok;
5264    Register t1 = rsi;
5265    Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);
5266
5267    push(t1);
5268    NOT_LP64(push(thread_reg));
5269    NOT_LP64(get_thread(thread_reg));
5270
5271    movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
5272    cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
5273    jcc(Assembler::aboveEqual, next);
5274    STOP("assert(top >= start)");
5275    should_not_reach_here();
5276
5277    bind(next);
5278    movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
5279    cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
5280    jcc(Assembler::aboveEqual, ok);
5281    STOP("assert(top <= end)");
5282    should_not_reach_here();
5283
5284    bind(ok);
5285    NOT_LP64(pop(thread_reg));
5286    pop(t1);
5287  }
5288#endif
5289}
5290
5291class ControlWord {
5292 public:
5293  int32_t _value;
5294
5295  int  rounding_control() const        { return  (_value >> 10) & 3      ; }
5296  int  precision_control() const       { return  (_value >>  8) & 3      ; }
5297  bool precision() const               { return ((_value >>  5) & 1) != 0; }
5298  bool underflow() const               { return ((_value >>  4) & 1) != 0; }
5299  bool overflow() const                { return ((_value >>  3) & 1) != 0; }
5300  bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
5301  bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
5302  bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
5303
5304  void print() const {
5305    // rounding control
5306    const char* rc;
5307    switch (rounding_control()) {
5308      case 0: rc = "round near"; break;
5309      case 1: rc = "round down"; break;
5310      case 2: rc = "round up  "; break;
5311      case 3: rc = "chop      "; break;
5312    };
5313    // precision control
5314    const char* pc;
5315    switch (precision_control()) {
5316      case 0: pc = "24 bits "; break;
5317      case 1: pc = "reserved"; break;
5318      case 2: pc = "53 bits "; break;
5319      case 3: pc = "64 bits "; break;
5320    };
5321    // flags
5322    char f[9];
5323    f[0] = ' ';
5324    f[1] = ' ';
5325    f[2] = (precision   ()) ? 'P' : 'p';
5326    f[3] = (underflow   ()) ? 'U' : 'u';
5327    f[4] = (overflow    ()) ? 'O' : 'o';
5328    f[5] = (zero_divide ()) ? 'Z' : 'z';
5329    f[6] = (denormalized()) ? 'D' : 'd';
5330    f[7] = (invalid     ()) ? 'I' : 'i';
5331    f[8] = '\x0';
5332    // output
5333    printf("%04x  masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
5334  }
5335
5336};
5337
5338class StatusWord {
5339 public:
5340  int32_t _value;
5341
5342  bool busy() const                    { return ((_value >> 15) & 1) != 0; }
5343  bool C3() const                      { return ((_value >> 14) & 1) != 0; }
5344  bool C2() const                      { return ((_value >> 10) & 1) != 0; }
5345  bool C1() const                      { return ((_value >>  9) & 1) != 0; }
5346  bool C0() const                      { return ((_value >>  8) & 1) != 0; }
5347  int  top() const                     { return  (_value >> 11) & 7      ; }
5348  bool error_status() const            { return ((_value >>  7) & 1) != 0; }
5349  bool stack_fault() const             { return ((_value >>  6) & 1) != 0; }
5350  bool precision() const               { return ((_value >>  5) & 1) != 0; }
5351  bool underflow() const               { return ((_value >>  4) & 1) != 0; }
5352  bool overflow() const                { return ((_value >>  3) & 1) != 0; }
5353  bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
5354  bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
5355  bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
5356
5357  void print() const {
5358    // condition codes
5359    char c[5];
5360    c[0] = (C3()) ? '3' : '-';
5361    c[1] = (C2()) ? '2' : '-';
5362    c[2] = (C1()) ? '1' : '-';
5363    c[3] = (C0()) ? '0' : '-';
5364    c[4] = '\x0';
5365    // flags
5366    char f[9];
5367    f[0] = (error_status()) ? 'E' : '-';
5368    f[1] = (stack_fault ()) ? 'S' : '-';
5369    f[2] = (precision   ()) ? 'P' : '-';
5370    f[3] = (underflow   ()) ? 'U' : '-';
5371    f[4] = (overflow    ()) ? 'O' : '-';
5372    f[5] = (zero_divide ()) ? 'Z' : '-';
5373    f[6] = (denormalized()) ? 'D' : '-';
5374    f[7] = (invalid     ()) ? 'I' : '-';
5375    f[8] = '\x0';
5376    // output
5377    printf("%04x  flags = %s, cc =  %s, top = %d", _value & 0xFFFF, f, c, top());
5378  }
5379
5380};
5381
5382class TagWord {
5383 public:
5384  int32_t _value;
5385
5386  int tag_at(int i) const              { return (_value >> (i*2)) & 3; }
5387
5388  void print() const {
5389    printf("%04x", _value & 0xFFFF);
5390  }
5391
5392};
5393
5394class FPU_Register {
5395 public:
5396  int32_t _m0;
5397  int32_t _m1;
5398  int16_t _ex;
5399
5400  bool is_indefinite() const           {
5401    return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0;
5402  }
5403
5404  void print() const {
5405    char  sign = (_ex < 0) ? '-' : '+';
5406    const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : "   ";
5407    printf("%c%04hx.%08x%08x  %s", sign, _ex, _m1, _m0, kind);
5408  };
5409
5410};
5411
5412class FPU_State {
5413 public:
5414  enum {
5415    register_size       = 10,
5416    number_of_registers =  8,
5417    register_mask       =  7
5418  };
5419
5420  ControlWord  _control_word;
5421  StatusWord   _status_word;
5422  TagWord      _tag_word;
5423  int32_t      _error_offset;
5424  int32_t      _error_selector;
5425  int32_t      _data_offset;
5426  int32_t      _data_selector;
5427  int8_t       _register[register_size * number_of_registers];
5428
5429  int tag_for_st(int i) const          { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
5430  FPU_Register* st(int i) const        { return (FPU_Register*)&_register[register_size * i]; }
5431
5432  const char* tag_as_string(int tag) const {
5433    switch (tag) {
5434      case 0: return "valid";
5435      case 1: return "zero";
5436      case 2: return "special";
5437      case 3: return "empty";
5438    }
5439    ShouldNotReachHere();
5440    return NULL;
5441  }
5442
5443  void print() const {
5444    // print computation registers
5445    { int t = _status_word.top();
5446      for (int i = 0; i < number_of_registers; i++) {
5447        int j = (i - t) & register_mask;
5448        printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
5449        st(j)->print();
5450        printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
5451      }
5452    }
5453    printf("\n");
5454    // print control registers
5455    printf("ctrl = "); _control_word.print(); printf("\n");
5456    printf("stat = "); _status_word .print(); printf("\n");
5457    printf("tags = "); _tag_word    .print(); printf("\n");
5458  }
5459
5460};
5461
5462class Flag_Register {
5463 public:
5464  int32_t _value;
5465
5466  bool overflow() const                { return ((_value >> 11) & 1) != 0; }
5467  bool direction() const               { return ((_value >> 10) & 1) != 0; }
5468  bool sign() const                    { return ((_value >>  7) & 1) != 0; }
5469  bool zero() const                    { return ((_value >>  6) & 1) != 0; }
5470  bool auxiliary_carry() const         { return ((_value >>  4) & 1) != 0; }
5471  bool parity() const                  { return ((_value >>  2) & 1) != 0; }
5472  bool carry() const                   { return ((_value >>  0) & 1) != 0; }
5473
5474  void print() const {
5475    // flags
5476    char f[8];
5477    f[0] = (overflow       ()) ? 'O' : '-';
5478    f[1] = (direction      ()) ? 'D' : '-';
5479    f[2] = (sign           ()) ? 'S' : '-';
5480    f[3] = (zero           ()) ? 'Z' : '-';
5481    f[4] = (auxiliary_carry()) ? 'A' : '-';
5482    f[5] = (parity         ()) ? 'P' : '-';
5483    f[6] = (carry          ()) ? 'C' : '-';
5484    f[7] = '\x0';
5485    // output
5486    printf("%08x  flags = %s", _value, f);
5487  }
5488
5489};
5490
5491class IU_Register {
5492 public:
5493  int32_t _value;
5494
5495  void print() const {
5496    printf("%08x  %11d", _value, _value);
5497  }
5498
5499};
5500
5501class IU_State {
5502 public:
5503  Flag_Register _eflags;
5504  IU_Register   _rdi;
5505  IU_Register   _rsi;
5506  IU_Register   _rbp;
5507  IU_Register   _rsp;
5508  IU_Register   _rbx;
5509  IU_Register   _rdx;
5510  IU_Register   _rcx;
5511  IU_Register   _rax;
5512
5513  void print() const {
5514    // computation registers
5515    printf("rax,  = "); _rax.print(); printf("\n");
5516    printf("rbx,  = "); _rbx.print(); printf("\n");
5517    printf("rcx  = "); _rcx.print(); printf("\n");
5518    printf("rdx  = "); _rdx.print(); printf("\n");
5519    printf("rdi  = "); _rdi.print(); printf("\n");
5520    printf("rsi  = "); _rsi.print(); printf("\n");
5521    printf("rbp,  = "); _rbp.print(); printf("\n");
5522    printf("rsp  = "); _rsp.print(); printf("\n");
5523    printf("\n");
5524    // control registers
5525    printf("flgs = "); _eflags.print(); printf("\n");
5526  }
5527};
5528
5529
5530class CPU_State {
5531 public:
5532  FPU_State _fpu_state;
5533  IU_State  _iu_state;
5534
5535  void print() const {
5536    printf("--------------------------------------------------\n");
5537    _iu_state .print();
5538    printf("\n");
5539    _fpu_state.print();
5540    printf("--------------------------------------------------\n");
5541  }
5542
5543};
5544
5545
5546static void _print_CPU_state(CPU_State* state) {
5547  state->print();
5548};
5549
5550
5551void MacroAssembler::print_CPU_state() {
5552  push_CPU_state();
5553  push(rsp);                // pass CPU state
5554  call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
5555  addptr(rsp, wordSize);       // discard argument
5556  pop_CPU_state();
5557}
5558
5559
5560static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
5561  static int counter = 0;
5562  FPU_State* fs = &state->_fpu_state;
5563  counter++;
5564  // For leaf calls, only verify that the top few elements remain empty.
5565  // We only need 1 empty at the top for C2 code.
5566  if( stack_depth < 0 ) {
5567    if( fs->tag_for_st(7) != 3 ) {
5568      printf("FPR7 not empty\n");
5569      state->print();
5570      assert(false, "error");
5571      return false;
5572    }
5573    return true;                // All other stack states do not matter
5574  }
5575
5576  assert((fs->_control_word._value & 0xffff) == StubRoutines::_fpu_cntrl_wrd_std,
5577         "bad FPU control word");
5578
5579  // compute stack depth
5580  int i = 0;
5581  while (i < FPU_State::number_of_registers && fs->tag_for_st(i)  < 3) i++;
5582  int d = i;
5583  while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
5584  // verify findings
5585  if (i != FPU_State::number_of_registers) {
5586    // stack not contiguous
5587    printf("%s: stack not contiguous at ST%d\n", s, i);
5588    state->print();
5589    assert(false, "error");
5590    return false;
5591  }
5592  // check if computed stack depth corresponds to expected stack depth
5593  if (stack_depth < 0) {
5594    // expected stack depth is -stack_depth or less
5595    if (d > -stack_depth) {
5596      // too many elements on the stack
5597      printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
5598      state->print();
5599      assert(false, "error");
5600      return false;
5601    }
5602  } else {
5603    // expected stack depth is stack_depth
5604    if (d != stack_depth) {
5605      // wrong stack depth
5606      printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
5607      state->print();
5608      assert(false, "error");
5609      return false;
5610    }
5611  }
5612  // everything is cool
5613  return true;
5614}
5615
5616
5617void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
5618  if (!VerifyFPU) return;
5619  push_CPU_state();
5620  push(rsp);                // pass CPU state
5621  ExternalAddress msg((address) s);
5622  // pass message string s
5623  pushptr(msg.addr());
5624  push(stack_depth);        // pass stack depth
5625  call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
5626  addptr(rsp, 3 * wordSize);   // discard arguments
5627  // check for error
5628  { Label L;
5629    testl(rax, rax);
5630    jcc(Assembler::notZero, L);
5631    int3();                  // break if error condition
5632    bind(L);
5633  }
5634  pop_CPU_state();
5635}
5636
5637void MacroAssembler::restore_cpu_control_state_after_jni() {
5638  // Either restore the MXCSR register after returning from the JNI Call
5639  // or verify that it wasn't changed (with -Xcheck:jni flag).
5640  if (VM_Version::supports_sse()) {
5641    if (RestoreMXCSROnJNICalls) {
5642      ldmxcsr(ExternalAddress(StubRoutines::addr_mxcsr_std()));
5643    } else if (CheckJNICalls) {
5644      call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
5645    }
5646  }
5647  if (VM_Version::supports_avx()) {
5648    // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
5649    vzeroupper();
5650  }
5651
5652#ifndef _LP64
5653  // Either restore the x87 floating-point control word after returning
5654  // from the JNI call or verify that it wasn't changed.
5655  if (CheckJNICalls) {
5656    call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
5657  }
5658#endif // _LP64
5659}
5660
5661
5662void MacroAssembler::load_klass(Register dst, Register src) {
5663#ifdef _LP64
5664  if (UseCompressedClassPointers) {
5665    movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
5666    decode_klass_not_null(dst);
5667  } else
5668#endif
5669    movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
5670}
5671
5672void MacroAssembler::load_prototype_header(Register dst, Register src) {
5673  load_klass(dst, src);
5674  movptr(dst, Address(dst, Klass::prototype_header_offset()));
5675}
5676
5677void MacroAssembler::store_klass(Register dst, Register src) {
5678#ifdef _LP64
5679  if (UseCompressedClassPointers) {
5680    encode_klass_not_null(src);
5681    movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
5682  } else
5683#endif
5684    movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
5685}
5686
5687void MacroAssembler::load_heap_oop(Register dst, Address src) {
5688#ifdef _LP64
5689  // FIXME: Must change all places where we try to load the klass.
5690  if (UseCompressedOops) {
5691    movl(dst, src);
5692    decode_heap_oop(dst);
5693  } else
5694#endif
5695    movptr(dst, src);
5696}
5697
5698// Doesn't do verification; generates fixed-size code
5699void MacroAssembler::load_heap_oop_not_null(Register dst, Address src) {
5700#ifdef _LP64
5701  if (UseCompressedOops) {
5702    movl(dst, src);
5703    decode_heap_oop_not_null(dst);
5704  } else
5705#endif
5706    movptr(dst, src);
5707}
5708
5709void MacroAssembler::store_heap_oop(Address dst, Register src) {
5710#ifdef _LP64
5711  if (UseCompressedOops) {
5712    assert(!dst.uses(src), "not enough registers");
5713    encode_heap_oop(src);
5714    movl(dst, src);
5715  } else
5716#endif
5717    movptr(dst, src);
5718}
5719
5720void MacroAssembler::cmp_heap_oop(Register src1, Address src2, Register tmp) {
5721  assert_different_registers(src1, tmp);
5722#ifdef _LP64
5723  if (UseCompressedOops) {
5724    bool did_push = false;
5725    if (tmp == noreg) {
5726      tmp = rax;
5727      push(tmp);
5728      did_push = true;
5729      assert(!src2.uses(rsp), "can't push");
5730    }
5731    load_heap_oop(tmp, src2);
5732    cmpptr(src1, tmp);
5733    if (did_push)  pop(tmp);
5734  } else
5735#endif
5736    cmpptr(src1, src2);
5737}
5738
5739// Used for storing NULLs.
5740void MacroAssembler::store_heap_oop_null(Address dst) {
5741#ifdef _LP64
5742  if (UseCompressedOops) {
5743    movl(dst, (int32_t)NULL_WORD);
5744  } else {
5745    movslq(dst, (int32_t)NULL_WORD);
5746  }
5747#else
5748  movl(dst, (int32_t)NULL_WORD);
5749#endif
5750}
5751
5752#ifdef _LP64
5753void MacroAssembler::store_klass_gap(Register dst, Register src) {
5754  if (UseCompressedClassPointers) {
5755    // Store to klass gap in destination
5756    movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
5757  }
5758}
5759
5760#ifdef ASSERT
5761void MacroAssembler::verify_heapbase(const char* msg) {
5762  assert (UseCompressedOops, "should be compressed");
5763  assert (Universe::heap() != NULL, "java heap should be initialized");
5764  if (CheckCompressedOops) {
5765    Label ok;
5766    push(rscratch1); // cmpptr trashes rscratch1
5767    cmpptr(r12_heapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
5768    jcc(Assembler::equal, ok);
5769    STOP(msg);
5770    bind(ok);
5771    pop(rscratch1);
5772  }
5773}
5774#endif
5775
5776// Algorithm must match oop.inline.hpp encode_heap_oop.
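// Encoding (a sketch): narrow = (oop == NULL) ? 0 : (oop - narrow_oop_base) >> narrow_oop_shift;
// when the base is NULL only the shift is applied, and a zero shift drops out as well.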
5777void MacroAssembler::encode_heap_oop(Register r) {
5778#ifdef ASSERT
5779  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
5780#endif
5781  verify_oop(r, "broken oop in encode_heap_oop");
5782  if (Universe::narrow_oop_base() == NULL) {
5783    if (Universe::narrow_oop_shift() != 0) {
5784      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
5785      shrq(r, LogMinObjAlignmentInBytes);
5786    }
5787    return;
5788  }
5789  testq(r, r);
5790  cmovq(Assembler::equal, r, r12_heapbase);
5791  subq(r, r12_heapbase);
5792  shrq(r, LogMinObjAlignmentInBytes);
5793}
5794
5795void MacroAssembler::encode_heap_oop_not_null(Register r) {
5796#ifdef ASSERT
5797  verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
5798  if (CheckCompressedOops) {
5799    Label ok;
5800    testq(r, r);
5801    jcc(Assembler::notEqual, ok);
5802    STOP("null oop passed to encode_heap_oop_not_null");
5803    bind(ok);
5804  }
5805#endif
5806  verify_oop(r, "broken oop in encode_heap_oop_not_null");
5807  if (Universe::narrow_oop_base() != NULL) {
5808    subq(r, r12_heapbase);
5809  }
5810  if (Universe::narrow_oop_shift() != 0) {
5811    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
5812    shrq(r, LogMinObjAlignmentInBytes);
5813  }
5814}
5815
5816void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
5817#ifdef ASSERT
5818  verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
5819  if (CheckCompressedOops) {
5820    Label ok;
5821    testq(src, src);
5822    jcc(Assembler::notEqual, ok);
5823    STOP("null oop passed to encode_heap_oop_not_null2");
5824    bind(ok);
5825  }
5826#endif
5827  verify_oop(src, "broken oop in encode_heap_oop_not_null2");
5828  if (dst != src) {
5829    movq(dst, src);
5830  }
5831  if (Universe::narrow_oop_base() != NULL) {
5832    subq(dst, r12_heapbase);
5833  }
5834  if (Universe::narrow_oop_shift() != 0) {
5835    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
5836    shrq(dst, LogMinObjAlignmentInBytes);
5837  }
5838}
5839
5840void  MacroAssembler::decode_heap_oop(Register r) {
5841#ifdef ASSERT
5842  verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
5843#endif
5844  if (Universe::narrow_oop_base() == NULL) {
5845    if (Universe::narrow_oop_shift() != 0) {
5846      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
5847      shlq(r, LogMinObjAlignmentInBytes);
5848    }
5849  } else {
5850    Label done;
5851    shlq(r, LogMinObjAlignmentInBytes);
5852    jccb(Assembler::equal, done);
5853    addq(r, r12_heapbase);
5854    bind(done);
5855  }
5856  verify_oop(r, "broken oop in decode_heap_oop");
5857}
5858
5859void  MacroAssembler::decode_heap_oop_not_null(Register r) {
5860  // Note: it will change flags
5861  assert (UseCompressedOops, "should only be used for compressed headers");
5862  assert (Universe::heap() != NULL, "java heap should be initialized");
5863  // Cannot assert, unverified entry point counts instructions (see .ad file)
5864  // vtableStubs also counts instructions in pd_code_size_limit.
5865  // Also do not verify_oop as this is called by verify_oop.
5866  if (Universe::narrow_oop_shift() != 0) {
5867    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
5868    shlq(r, LogMinObjAlignmentInBytes);
5869    if (Universe::narrow_oop_base() != NULL) {
5870      addq(r, r12_heapbase);
5871    }
5872  } else {
5873    assert (Universe::narrow_oop_base() == NULL, "sanity");
5874  }
5875}
5876
5877void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
5878  // Note: it will change flags
5879  assert (UseCompressedOops, "should only be used for compressed headers");
5880  assert (Universe::heap() != NULL, "java heap should be initialized");
5881  // Cannot assert, unverified entry point counts instructions (see .ad file)
5882  // vtableStubs also counts instructions in pd_code_size_limit.
5883  // Also do not verify_oop as this is called by verify_oop.
5884  if (Universe::narrow_oop_shift() != 0) {
5885    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
5886    if (LogMinObjAlignmentInBytes == Address::times_8) {
5887      leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
5888    } else {
5889      if (dst != src) {
5890        movq(dst, src);
5891      }
5892      shlq(dst, LogMinObjAlignmentInBytes);
5893      if (Universe::narrow_oop_base() != NULL) {
5894        addq(dst, r12_heapbase);
5895      }
5896    }
5897  } else {
5898    assert (Universe::narrow_oop_base() == NULL, "sanity");
5899    if (dst != src) {
5900      movq(dst, src);
5901    }
5902  }
5903}
5904
5905void MacroAssembler::encode_klass_not_null(Register r) {
5906  if (Universe::narrow_klass_base() != NULL) {
5907    // Use r12 as a scratch register in which to temporarily load the narrow_klass_base.
5908    assert(r != r12_heapbase, "Encoding a klass in r12");
5909    mov64(r12_heapbase, (int64_t)Universe::narrow_klass_base());
5910    subq(r, r12_heapbase);
5911  }
5912  if (Universe::narrow_klass_shift() != 0) {
5913    assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
5914    shrq(r, LogKlassAlignmentInBytes);
5915  }
5916  if (Universe::narrow_klass_base() != NULL) {
5917    reinit_heapbase();
5918  }
5919}
5920
5921void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
5922  if (dst == src) {
5923    encode_klass_not_null(src);
5924  } else {
5925    if (Universe::narrow_klass_base() != NULL) {
5926      mov64(dst, (int64_t)Universe::narrow_klass_base());
5927      negq(dst);
5928      addq(dst, src);
5929    } else {
5930      movptr(dst, src);
5931    }
5932    if (Universe::narrow_klass_shift() != 0) {
5933      assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
5934      shrq(dst, LogKlassAlignmentInBytes);
5935    }
5936  }
5937}
5938
5939// Function instr_size_for_decode_klass_not_null() counts the instructions
5940// generated by decode_klass_not_null(register r) and reinit_heapbase(),
5941// when (Universe::heap() != NULL).  Hence, if the instructions they
5942// generate change, then this method needs to be updated.
5943int MacroAssembler::instr_size_for_decode_klass_not_null() {
5944  assert (UseCompressedClassPointers, "only for compressed klass ptrs");
5945  if (Universe::narrow_klass_base() != NULL) {
5946    // mov64 + addq + shlq? + mov64  (for reinit_heapbase()).
5947    return (Universe::narrow_klass_shift() == 0 ? 20 : 24);
5948  } else {
5949    // longest load decode klass function, mov64, leaq
5950    return 16;
5951  }
5952}
5953
5954// !!! If the instructions that get generated here change then function
5955// instr_size_for_decode_klass_not_null() needs to get updated.
5956void  MacroAssembler::decode_klass_not_null(Register r) {
5957  // Note: it will change flags
5958  assert (UseCompressedClassPointers, "should only be used for compressed headers");
5959  assert(r != r12_heapbase, "Decoding a klass in r12");
5960  // Cannot assert, unverified entry point counts instructions (see .ad file)
5961  // vtableStubs also counts instructions in pd_code_size_limit.
5962  // Also do not verify_oop as this is called by verify_oop.
5963  if (Universe::narrow_klass_shift() != 0) {
5964    assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
5965    shlq(r, LogKlassAlignmentInBytes);
5966  }
5967  // Use r12 as a scratch register in which to temporarily load the narrow_klass_base.
5968  if (Universe::narrow_klass_base() != NULL) {
5969    mov64(r12_heapbase, (int64_t)Universe::narrow_klass_base());
5970    addq(r, r12_heapbase);
5971    reinit_heapbase();
5972  }
5973}
5974
5975void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
5976  // Note: it will change flags
5977  assert (UseCompressedClassPointers, "should only be used for compressed headers");
5978  if (dst == src) {
5979    decode_klass_not_null(dst);
5980  } else {
5981    // Cannot assert, unverified entry point counts instructions (see .ad file)
5982    // vtableStubs also counts instructions in pd_code_size_limit.
5983    // Also do not verify_oop as this is called by verify_oop.
5984    mov64(dst, (int64_t)Universe::narrow_klass_base());
5985    if (Universe::narrow_klass_shift() != 0) {
5986      assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
5987      assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?");
5988      leaq(dst, Address(dst, src, Address::times_8, 0));
5989    } else {
5990      addq(dst, src);
5991    }
5992  }
5993}
5994
5995void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
5996  assert (UseCompressedOops, "should only be used for compressed headers");
5997  assert (Universe::heap() != NULL, "java heap should be initialized");
5998  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5999  int oop_index = oop_recorder()->find_index(obj);
6000  RelocationHolder rspec = oop_Relocation::spec(oop_index);
6001  mov_narrow_oop(dst, oop_index, rspec);
6002}
6003
6004void  MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
6005  assert (UseCompressedOops, "should only be used for compressed headers");
6006  assert (Universe::heap() != NULL, "java heap should be initialized");
6007  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
6008  int oop_index = oop_recorder()->find_index(obj);
6009  RelocationHolder rspec = oop_Relocation::spec(oop_index);
6010  mov_narrow_oop(dst, oop_index, rspec);
6011}
6012
6013void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
6014  assert (UseCompressedClassPointers, "should only be used for compressed headers");
6015  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
6016  int klass_index = oop_recorder()->find_index(k);
6017  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
6018  mov_narrow_oop(dst, Klass::encode_klass(k), rspec);
6019}
6020
6021void  MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
6022  assert (UseCompressedClassPointers, "should only be used for compressed headers");
6023  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
6024  int klass_index = oop_recorder()->find_index(k);
6025  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
6026  mov_narrow_oop(dst, Klass::encode_klass(k), rspec);
6027}
6028
6029void  MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
6030  assert (UseCompressedOops, "should only be used for compressed headers");
6031  assert (Universe::heap() != NULL, "java heap should be initialized");
6032  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
6033  int oop_index = oop_recorder()->find_index(obj);
6034  RelocationHolder rspec = oop_Relocation::spec(oop_index);
6035  Assembler::cmp_narrow_oop(dst, oop_index, rspec);
6036}
6037
6038void  MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
6039  assert (UseCompressedOops, "should only be used for compressed headers");
6040  assert (Universe::heap() != NULL, "java heap should be initialized");
6041  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
6042  int oop_index = oop_recorder()->find_index(obj);
6043  RelocationHolder rspec = oop_Relocation::spec(oop_index);
6044  Assembler::cmp_narrow_oop(dst, oop_index, rspec);
6045}
6046
6047void  MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
6048  assert (UseCompressedClassPointers, "should only be used for compressed headers");
6049  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
6050  int klass_index = oop_recorder()->find_index(k);
6051  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
6052  Assembler::cmp_narrow_oop(dst, Klass::encode_klass(k), rspec);
6053}
6054
6055void  MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
6056  assert (UseCompressedClassPointers, "should only be used for compressed headers");
6057  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
6058  int klass_index = oop_recorder()->find_index(k);
6059  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
6060  Assembler::cmp_narrow_oop(dst, Klass::encode_klass(k), rspec);
6061}
6062
6063void MacroAssembler::reinit_heapbase() {
6064  if (UseCompressedOops || UseCompressedClassPointers) {
6065    if (Universe::heap() != NULL) {
6066      if (Universe::narrow_oop_base() == NULL) {
6067        MacroAssembler::xorptr(r12_heapbase, r12_heapbase);
6068      } else {
6069        mov64(r12_heapbase, (int64_t)Universe::narrow_ptrs_base());
6070      }
6071    } else {
6072      movptr(r12_heapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
6073    }
6074  }
6075}
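// Note: after reinit_heapbase(), r12_heapbase holds Universe::narrow_ptrs_base()
// (zero when the narrow oop base is NULL), so compressed oop/klass code can use
// it as the decoding base register.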
6076
6077#endif // _LP64
6078
6079
6080// C2 compiled method's prolog code.
6081void MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b) {
6082
6083  // WARNING: Initial instruction MUST be 5 bytes or longer so that
6084  // NativeJump::patch_verified_entry will be able to patch out the entry
6085  // code safely. The push to verify stack depth is ok at 5 bytes,
6086  // the frame allocation can be either 3 or 6 bytes. So if we don't do
6087  // stack bang then we must use the 6 byte frame allocation even if
6088  // we have no frame. :-(
6089  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
6090
6091  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
6092  // Remove word for return addr
6093  framesize -= wordSize;
6094  stack_bang_size -= wordSize;
6095
6096  // Calls to C2R adapters often do not accept exceptional returns.
6097  // We require that their callers bang for them.  But be careful, because
6098  // some VM calls (such as call site linkage) can use several kilobytes of
6099  // stack.  But the stack safety zone should account for that.
6100  // See bugs 4446381, 4468289, 4497237.
6101  if (stack_bang_size > 0) {
6102    generate_stack_overflow_check(stack_bang_size);
6103
6104    // We always push rbp so that on return to the interpreter rbp will be
6105    // restored correctly and we can correct the stack.
6106    push(rbp);
6107    // Remove word for ebp
6108    framesize -= wordSize;
6109
6110    // Create frame
6111    if (framesize) {
6112      subptr(rsp, framesize);
6113    }
6114  } else {
6115    // Create frame (force generation of a 4 byte immediate value)
6116    subptr_imm32(rsp, framesize);
6117
6118    // Save RBP register now.
6119    framesize -= wordSize;
6120    movptr(Address(rsp, framesize), rbp);
6121  }
6122
6123  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
6124    framesize -= wordSize;
6125    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
6126  }
6127
6128#ifndef _LP64
6129  // If method sets FPU control word do it now
6130  if (fp_mode_24b) {
6131    fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
6132  }
6133  if (UseSSE >= 2 && VerifyFPU) {
6134    verify_FPU(0, "FPU stack must be clean on entry");
6135  }
6136#endif
6137
6138#ifdef ASSERT
6139  if (VerifyStackAtCalls) {
6140    Label L;
6141    push(rax);
6142    mov(rax, rsp);
6143    andptr(rax, StackAlignmentInBytes-1);
6144    cmpptr(rax, StackAlignmentInBytes-wordSize);
6145    pop(rax);
6146    jcc(Assembler::equal, L);
6147    STOP("Stack is not properly aligned!");
6148    bind(L);
6149  }
6150#endif
6151
6152}
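// Roughly, letting fs = framesize - wordSize (the return address is already on
// the stack), the emitted prolog is (a sketch, not a literal listing):
//
//   with a stack bang:                    without a stack bang:
//     <stack overflow check>                sub  rsp, fs                    ; forced 32-bit imm
//     push rbp                              mov  [rsp + fs - wordSize], rbp
//     sub  rsp, fs - wordSize
//
// followed by the optional 0xbadb100d stack-depth cookie and the FPU/alignment
// checks handled above.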
6153
6154void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp) {
6155  // cnt - number of qwords (8-byte words).
6156  // base - start address, qword aligned.
6157  assert(base==rdi, "base register must be edi for rep stos");
6158  assert(tmp==rax,   "tmp register must be eax for rep stos");
6159  assert(cnt==rcx,   "cnt register must be ecx for rep stos");
6160
6161  xorptr(tmp, tmp);
6162  if (UseFastStosb) {
6163    shlptr(cnt,3); // convert to number of bytes
6164    rep_stosb();
6165  } else {
6166    NOT_LP64(shlptr(cnt,1);) // convert to number of dwords for 32-bit VM
6167    rep_stos();
6168  }
6169}
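// Conceptually, clear_mem() performs (sketch only):
//
//   for (size_t i = 0; i < cnt; i++) base[i] = 0;   // base treated as a qword array
//
// using a single "rep stosb" (after converting cnt to a byte count) when
// UseFastStosb is set, and "rep stos" of qwords (dwords on 32-bit, after
// doubling the count) otherwise.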
6170
6171// IndexOf for constant substrings with size >= 8 chars
6172// which don't need to be loaded through the stack.
6173void MacroAssembler::string_indexofC8(Register str1, Register str2,
6174                                      Register cnt1, Register cnt2,
6175                                      int int_cnt2,  Register result,
6176                                      XMMRegister vec, Register tmp) {
6177  ShortBranchVerifier sbv(this);
6178  assert(UseSSE42Intrinsics, "SSE4.2 is required");
6179
6180  // This method uses the pcmpestri instruction with bound registers
6181  //   inputs:
6182  //     xmm - substring
6183  //     rax - substring length (elements count)
6184  //     mem - scanned string
6185  //     rdx - string length (elements count)
6186  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
6187  //   outputs:
6188  //     rcx - matched index in string
6189  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
6190
6191  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
6192        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
6193        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
6194
6195  // Note, inline_string_indexOf() generates checks:
6196  // if (substr.count > string.count) return -1;
6197  // if (substr.count == 0) return 0;
6198  assert(int_cnt2 >= 8, "this code is used only for cnt2 >= 8 chars");
6199
6200  // Load substring.
6201  movdqu(vec, Address(str2, 0));
6202  movl(cnt2, int_cnt2);
6203  movptr(result, str1); // string addr
6204
6205  if (int_cnt2 > 8) {
6206    jmpb(SCAN_TO_SUBSTR);
6207
6208    // Reload substr for rescan; this code
6209    // is executed only for large substrings (> 8 chars)
6210    bind(RELOAD_SUBSTR);
6211    movdqu(vec, Address(str2, 0));
6212    negptr(cnt2); // Jumped here with negative cnt2, convert to positive
6213
6214    bind(RELOAD_STR);
6215    // We came here after the beginning of the substring was
6216    // matched but the rest of it was not so we need to search
6217    // again. Start from the next element after the previous match.
6218
6219    // cnt2 is the number of remaining substring elements and
6220    // cnt1 is the number of remaining string elements when the cmp failed.
6221    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
6222    subl(cnt1, cnt2);
6223    addl(cnt1, int_cnt2);
6224    movl(cnt2, int_cnt2); // Now restore cnt2
6225
6226    decrementl(cnt1);     // Shift to next element
6227    cmpl(cnt1, cnt2);
6228    jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer chars left than substring
6229
6230    addptr(result, 2);
6231
6232  } // (int_cnt2 > 8)
6233
6234  // Scan string for start of substr in 16-byte vectors
6235  bind(SCAN_TO_SUBSTR);
6236  pcmpestri(vec, Address(result, 0), 0x0d);
6237  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
6238  subl(cnt1, 8);
6239  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
6240  cmpl(cnt1, cnt2);
6241    jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer chars left than substring
6242  addptr(result, 16);
6243  jmpb(SCAN_TO_SUBSTR);
6244
6245  // Found a potential substr
6246  bind(FOUND_CANDIDATE);
6247  // Matched whole vector if first element matched (tmp(rcx) == 0).
6248  if (int_cnt2 == 8) {
6249    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
6250  } else { // int_cnt2 > 8
6251    jccb(Assembler::overflow, FOUND_SUBSTR);
6252  }
6253  // After pcmpestri tmp(rcx) contains matched element index
6254  // Compute start addr of substr
6255  lea(result, Address(result, tmp, Address::times_2));
6256
6257  // Make sure string is still long enough
6258  subl(cnt1, tmp);
6259  cmpl(cnt1, cnt2);
6260  if (int_cnt2 == 8) {
6261    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
6262  } else { // int_cnt2 > 8
6263    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
6264  }
6265  // Fewer chars left than the substring.
6266
6267  bind(RET_NOT_FOUND);
6268  movl(result, -1);
6269  jmpb(EXIT);
6270
6271  if (int_cnt2 > 8) {
6272    // This code is optimized for the case when whole substring
6273    // is matched if its head is matched.
6274    bind(MATCH_SUBSTR_HEAD);
6275    pcmpestri(vec, Address(result, 0), 0x0d);
6276    // Reload only the string if it does not match
6277    jccb(Assembler::noOverflow, RELOAD_STR); // OF == 0
6278
6279    Label CONT_SCAN_SUBSTR;
6280    // Compare the rest of substring (> 8 chars).
6281    bind(FOUND_SUBSTR);
6282    // First 8 chars are already matched.
6283    negptr(cnt2);
6284    addptr(cnt2, 8);
6285
6286    bind(SCAN_SUBSTR);
6287    subl(cnt1, 8);
6288    cmpl(cnt2, -8); // Do not read beyond substring
6289    jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
6290    // Back-up strings to avoid reading beyond substring:
6291    // cnt1 = cnt1 - cnt2 + 8
6292    addl(cnt1, cnt2); // cnt2 is negative
6293    addl(cnt1, 8);
6294    movl(cnt2, 8); negptr(cnt2);
6295    bind(CONT_SCAN_SUBSTR);
6296    if (int_cnt2 < (int)G) {
6297      movdqu(vec, Address(str2, cnt2, Address::times_2, int_cnt2*2));
6298      pcmpestri(vec, Address(result, cnt2, Address::times_2, int_cnt2*2), 0x0d);
6299    } else {
6300      // calculate index in register to avoid integer overflow (int_cnt2*2)
6301      movl(tmp, int_cnt2);
6302      addptr(tmp, cnt2);
6303      movdqu(vec, Address(str2, tmp, Address::times_2, 0));
6304      pcmpestri(vec, Address(result, tmp, Address::times_2, 0), 0x0d);
6305    }
6306    // Need to reload the string pointers if we did not match the whole vector
6307    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
6308    addptr(cnt2, 8);
6309    jcc(Assembler::negative, SCAN_SUBSTR);
6310    // Fall through if found full substring
6311
6312  } // (int_cnt2 > 8)
6313
6314  bind(RET_FOUND);
6315  // Found result if we matched full small substring.
6316  // Compute substr offset
6317  subptr(result, str1);
6318  shrl(result, 1); // index
6319  bind(EXIT);
6320
6321} // string_indexofC8
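// At the Java level, string_indexofC8() computes roughly the following (a
// sketch; "matches" stands for an element-by-element comparison of int_cnt2
// chars, vectorized here with pcmpestri):
//
//   for (int i = 0; i + int_cnt2 <= cnt1; i++) {
//     if (matches(str1, i, str2, 0, int_cnt2)) return i;
//   }
//   return -1;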
6322
6323// Small strings are loaded through the stack if they cross a page boundary.
6324void MacroAssembler::string_indexof(Register str1, Register str2,
6325                                    Register cnt1, Register cnt2,
6326                                    int int_cnt2,  Register result,
6327                                    XMMRegister vec, Register tmp) {
6328  ShortBranchVerifier sbv(this);
6329  assert(UseSSE42Intrinsics, "SSE4.2 is required");
6330  //
6331  // int_cnt2 is length of small (< 8 chars) constant substring
6332  // or (-1) for non constant substring in which case its length
6333  // is in cnt2 register.
6334  //
6335  // Note, inline_string_indexOf() generates checks:
6336  // if (substr.count > string.count) return -1;
6337  // if (substr.count == 0) return 0;
6338  //
6339  assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < 8), "should be != 0");
6340
6341  // This method uses the pcmpestri instruction with bound registers
6342  //   inputs:
6343  //     xmm - substring
6344  //     rax - substring length (elements count)
6345  //     mem - scanned string
6346  //     rdx - string length (elements count)
6347  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
6348  //   outputs:
6349  //     rcx - matched index in string
6350  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
6351
6352  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
6353        RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
6354        FOUND_CANDIDATE;
6355
6356  { //========================================================
6357    // We don't know where these strings are located
6358    // and we can't read beyond them. Load them through the stack.
6359    Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
6360
6361    movptr(tmp, rsp); // save old SP
6362
6363    if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
6364      if (int_cnt2 == 1) {  // One char
6365        load_unsigned_short(result, Address(str2, 0));
6366        movdl(vec, result); // move 32 bits
6367      } else if (int_cnt2 == 2) { // Two chars
6368        movdl(vec, Address(str2, 0)); // move 32 bits
6369      } else if (int_cnt2 == 4) { // Four chars
6370        movq(vec, Address(str2, 0));  // move 64 bits
6371      } else { // cnt2 = { 3, 5, 6, 7 }
6372        // Array header size is 12 bytes in 32-bit VM
6373        // + 6 bytes for 3 chars == 18 bytes,
6374        // enough space to load vec and shift.
6375        assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
6376        movdqu(vec, Address(str2, (int_cnt2*2)-16));
6377        psrldq(vec, 16-(int_cnt2*2));
6378      }
6379    } else { // not constant substring
6380      cmpl(cnt2, 8);
6381      jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
6382
6383      // We can read beyond the string if str+16 does not cross a page boundary
6384      // since heaps are aligned and mapped by pages.
6385      assert(os::vm_page_size() < (int)G, "default page should be small");
6386      movl(result, str2); // We need only low 32 bits
6387      andl(result, (os::vm_page_size()-1));
6388      cmpl(result, (os::vm_page_size()-16));
6389      jccb(Assembler::belowEqual, CHECK_STR);
6390
6391      // Move small strings to the stack so we can load 16 bytes into vec.
6392      subptr(rsp, 16);
6393      int stk_offset = wordSize-2;
6394      push(cnt2);
6395
6396      bind(COPY_SUBSTR);
6397      load_unsigned_short(result, Address(str2, cnt2, Address::times_2, -2));
6398      movw(Address(rsp, cnt2, Address::times_2, stk_offset), result);
6399      decrement(cnt2);
6400      jccb(Assembler::notZero, COPY_SUBSTR);
6401
6402      pop(cnt2);
6403      movptr(str2, rsp);  // New substring address
6404    } // non constant
6405
6406    bind(CHECK_STR);
6407    cmpl(cnt1, 8);
6408    jccb(Assembler::aboveEqual, BIG_STRINGS);
6409
6410    // Check cross page boundary.
6411    movl(result, str1); // We need only low 32 bits
6412    andl(result, (os::vm_page_size()-1));
6413    cmpl(result, (os::vm_page_size()-16));
6414    jccb(Assembler::belowEqual, BIG_STRINGS);
6415
6416    subptr(rsp, 16);
6417    int stk_offset = -2;
6418    if (int_cnt2 < 0) { // not constant
6419      push(cnt2);
6420      stk_offset += wordSize;
6421    }
6422    movl(cnt2, cnt1);
6423
6424    bind(COPY_STR);
6425    load_unsigned_short(result, Address(str1, cnt2, Address::times_2, -2));
6426    movw(Address(rsp, cnt2, Address::times_2, stk_offset), result);
6427    decrement(cnt2);
6428    jccb(Assembler::notZero, COPY_STR);
6429
6430    if (int_cnt2 < 0) { // not constant
6431      pop(cnt2);
6432    }
6433    movptr(str1, rsp);  // New string address
6434
6435    bind(BIG_STRINGS);
6436    // Load substring.
6437    if (int_cnt2 < 0) { // -1
6438      movdqu(vec, Address(str2, 0));
6439      push(cnt2);       // substr count
6440      push(str2);       // substr addr
6441      push(str1);       // string addr
6442    } else {
6443      // Small (< 8 chars) constant substrings are loaded already.
6444      movl(cnt2, int_cnt2);
6445    }
6446    push(tmp);  // original SP
6447
6448  } // Finished loading
6449
6450  //========================================================
6451  // Start search
6452  //
6453
6454  movptr(result, str1); // string addr
6455
6456  if (int_cnt2  < 0) {  // Only for non constant substring
6457    jmpb(SCAN_TO_SUBSTR);
6458
6459    // SP saved at sp+0
6460    // String saved at sp+1*wordSize
6461    // Substr saved at sp+2*wordSize
6462    // Substr count saved at sp+3*wordSize
6463
6464    // Reload substr for rescan; this code
6465    // is executed only for large substrings (> 8 chars)
6466    bind(RELOAD_SUBSTR);
6467    movptr(str2, Address(rsp, 2*wordSize));
6468    movl(cnt2, Address(rsp, 3*wordSize));
6469    movdqu(vec, Address(str2, 0));
6470    // We came here after the beginning of the substring was
6471    // matched but the rest of it was not so we need to search
6472    // again. Start from the next element after the previous match.
6473    subptr(str1, result); // Restore counter
6474    shrl(str1, 1);
6475    addl(cnt1, str1);
6476    decrementl(cnt1);   // Shift to next element
6477    cmpl(cnt1, cnt2);
6478    jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer chars left than substring
6479
6480    addptr(result, 2);
6481  } // non constant
6482
6483  // Scan string for start of substr in 16-byte vectors
6484  bind(SCAN_TO_SUBSTR);
6485  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
6486  pcmpestri(vec, Address(result, 0), 0x0d);
6487  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
6488  subl(cnt1, 8);
6489  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
6490  cmpl(cnt1, cnt2);
6491    jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer chars left than substring
6492  addptr(result, 16);
6493
6494  bind(ADJUST_STR);
6495  cmpl(cnt1, 8); // Do not read beyond string
6496  jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
6497  // Back-up string to avoid reading beyond string.
6498  lea(result, Address(result, cnt1, Address::times_2, -16));
6499  movl(cnt1, 8);
6500  jmpb(SCAN_TO_SUBSTR);
6501
6502  // Found a potential substr
6503  bind(FOUND_CANDIDATE);
6504  // After pcmpestri tmp(rcx) contains matched element index
6505
6506  // Make sure string is still long enough
6507  subl(cnt1, tmp);
6508  cmpl(cnt1, cnt2);
6509  jccb(Assembler::greaterEqual, FOUND_SUBSTR);
6510  // Fewer chars left than the substring.
6511
6512  bind(RET_NOT_FOUND);
6513  movl(result, -1);
6514  jmpb(CLEANUP);
6515
6516  bind(FOUND_SUBSTR);
6517  // Compute start addr of substr
6518  lea(result, Address(result, tmp, Address::times_2));
6519
6520  if (int_cnt2 > 0) { // Constant substring
6521    // Repeat search for small substring (< 8 chars)
6522    // from new point without reloading substring.
6523    // Have to check that we don't read beyond string.
6524    cmpl(tmp, 8-int_cnt2);
6525    jccb(Assembler::greater, ADJUST_STR);
6526    // Fall through if matched whole substring.
6527  } else { // non constant
6528    assert(int_cnt2 == -1, "should be != 0");
6529
6530    addl(tmp, cnt2);
6531    // Found result if we matched whole substring.
6532    cmpl(tmp, 8);
6533    jccb(Assembler::lessEqual, RET_FOUND);
6534
6535    // Repeat search for small substring (<= 8 chars)
6536    // from new point 'str1' without reloading substring.
6537    cmpl(cnt2, 8);
6538    // Have to check that we don't read beyond string.
6539    jccb(Assembler::lessEqual, ADJUST_STR);
6540
6541    Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
6542    // Compare the rest of substring (> 8 chars).
6543    movptr(str1, result);
6544
6545    cmpl(tmp, cnt2);
6546    // First 8 chars are already matched.
6547    jccb(Assembler::equal, CHECK_NEXT);
6548
6549    bind(SCAN_SUBSTR);
6550    pcmpestri(vec, Address(str1, 0), 0x0d);
6551    // Need to reload the string pointers if we did not match the whole vector
6552    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
6553
6554    bind(CHECK_NEXT);
6555    subl(cnt2, 8);
6556    jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
6557    addptr(str1, 16);
6558    addptr(str2, 16);
6559    subl(cnt1, 8);
6560    cmpl(cnt2, 8); // Do not read beyond substring
6561    jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
6562    // Back-up strings to avoid reading beyond substring.
6563    lea(str2, Address(str2, cnt2, Address::times_2, -16));
6564    lea(str1, Address(str1, cnt2, Address::times_2, -16));
6565    subl(cnt1, cnt2);
6566    movl(cnt2, 8);
6567    addl(cnt1, 8);
6568    bind(CONT_SCAN_SUBSTR);
6569    movdqu(vec, Address(str2, 0));
6570    jmpb(SCAN_SUBSTR);
6571
6572    bind(RET_FOUND_LONG);
6573    movptr(str1, Address(rsp, wordSize));
6574  } // non constant
6575
6576  bind(RET_FOUND);
6577  // Compute substr offset
6578  subptr(result, str1);
6579  shrl(result, 1); // index
6580
6581  bind(CLEANUP);
6582  pop(rsp); // restore SP
6583
6584} // string_indexof
6585
6586// Compare strings.
6587void MacroAssembler::string_compare(Register str1, Register str2,
6588                                    Register cnt1, Register cnt2, Register result,
6589                                    XMMRegister vec1) {
6590  ShortBranchVerifier sbv(this);
6591  Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
6592
6593  // Compute the minimum of the string lengths and the
6594  // difference of the string lengths (pushed on the stack).
6595  // Do the conditional move stuff
6596  movl(result, cnt1);
6597  subl(cnt1, cnt2);
6598  push(cnt1);
6599  cmov32(Assembler::lessEqual, cnt2, result);
6600
6601  // Is the minimum length zero?
6602  testl(cnt2, cnt2);
6603  jcc(Assembler::zero, LENGTH_DIFF_LABEL);
6604
6605  // Compare first characters
6606  load_unsigned_short(result, Address(str1, 0));
6607  load_unsigned_short(cnt1, Address(str2, 0));
6608  subl(result, cnt1);
6609  jcc(Assembler::notZero,  POP_LABEL);
6610  cmpl(cnt2, 1);
6611  jcc(Assembler::equal, LENGTH_DIFF_LABEL);
6612
6613  // Check if the strings start at the same location.
6614  cmpptr(str1, str2);
6615  jcc(Assembler::equal, LENGTH_DIFF_LABEL);
6616
6617  Address::ScaleFactor scale = Address::times_2;
6618  int stride = 8;
6619
6620  if (UseAVX >= 2 && UseSSE42Intrinsics) {
6621    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
6622    Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
6623    Label COMPARE_TAIL_LONG;
6624    int pcmpmask = 0x19;
6625
6626    // Set up to compare 16-char (32-byte) vectors,
6627    // starting from the first character again because it has an aligned address.
6628    int stride2 = 16;
6629    int adr_stride  = stride  << scale;
6630    int adr_stride2 = stride2 << scale;
6631
6632    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
6633    // rax and rdx are used by pcmpestri as elements counters
6634    movl(result, cnt2);
6635    andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
6636    jcc(Assembler::zero, COMPARE_TAIL_LONG);
6637
6638    // fast path : compare first 2 8-char vectors.
6639    bind(COMPARE_16_CHARS);
6640    movdqu(vec1, Address(str1, 0));
6641    pcmpestri(vec1, Address(str2, 0), pcmpmask);
6642    jccb(Assembler::below, COMPARE_INDEX_CHAR);
6643
6644    movdqu(vec1, Address(str1, adr_stride));
6645    pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
6646    jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
6647    addl(cnt1, stride);
6648
6649    // Compare the characters at index in cnt1
6650    bind(COMPARE_INDEX_CHAR); //cnt1 has the offset of the mismatching character
6651    load_unsigned_short(result, Address(str1, cnt1, scale));
6652    load_unsigned_short(cnt2, Address(str2, cnt1, scale));
6653    subl(result, cnt2);
6654    jmp(POP_LABEL);
6655
6656    // Setup the registers to start vector comparison loop
6657    bind(COMPARE_WIDE_VECTORS);
6658    lea(str1, Address(str1, result, scale));
6659    lea(str2, Address(str2, result, scale));
6660    subl(result, stride2);
6661    subl(cnt2, stride2);
6662    jccb(Assembler::zero, COMPARE_WIDE_TAIL);
6663    negptr(result);
6664
6665    //  In a loop, compare 16 chars (32 bytes) at a time using (vpxor+vptest)
6666    bind(COMPARE_WIDE_VECTORS_LOOP);
6667    vmovdqu(vec1, Address(str1, result, scale));
6668    vpxor(vec1, Address(str2, result, scale));
6669    vptest(vec1, vec1);
6670    jccb(Assembler::notZero, VECTOR_NOT_EQUAL);
6671    addptr(result, stride2);
6672    subl(cnt2, stride2);
6673    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
6674    // clean upper bits of YMM registers
6675    vzeroupper();
6676
6677    // compare wide vectors tail
6678    bind(COMPARE_WIDE_TAIL);
6679    testptr(result, result);
6680    jccb(Assembler::zero, LENGTH_DIFF_LABEL);
6681
6682    movl(result, stride2);
6683    movl(cnt2, result);
6684    negptr(result);
6685    jmpb(COMPARE_WIDE_VECTORS_LOOP);
6686
6687    // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
6688    bind(VECTOR_NOT_EQUAL);
6689    // clean upper bits of YMM registers
6690    vzeroupper();
6691    lea(str1, Address(str1, result, scale));
6692    lea(str2, Address(str2, result, scale));
6693    jmp(COMPARE_16_CHARS);
6694
6695    // Compare tail chars, length between 1 and 15 chars
6696    bind(COMPARE_TAIL_LONG);
6697    movl(cnt2, result);
6698    cmpl(cnt2, stride);
6699    jccb(Assembler::less, COMPARE_SMALL_STR);
6700
6701    movdqu(vec1, Address(str1, 0));
6702    pcmpestri(vec1, Address(str2, 0), pcmpmask);
6703    jcc(Assembler::below, COMPARE_INDEX_CHAR);
6704    subptr(cnt2, stride);
6705    jccb(Assembler::zero, LENGTH_DIFF_LABEL);
6706    lea(str1, Address(str1, result, scale));
6707    lea(str2, Address(str2, result, scale));
6708    negptr(cnt2);
6709    jmpb(WHILE_HEAD_LABEL);
6710
6711    bind(COMPARE_SMALL_STR);
6712  } else if (UseSSE42Intrinsics) {
6713    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
6714    int pcmpmask = 0x19;
6715    // Set up to compare 8-char (16-byte) vectors,
6716    // starting from the first character again because it has an aligned address.
6717    movl(result, cnt2);
6718    andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
6719    jccb(Assembler::zero, COMPARE_TAIL);
6720
6721    lea(str1, Address(str1, result, scale));
6722    lea(str2, Address(str2, result, scale));
6723    negptr(result);
6724
6725    // pcmpestri
6726    //   inputs:
6727    //     vec1- substring
6728    //     rax - negative string length (elements count)
6729    //     mem - scanned string
6730    //     rdx - string length (elements count)
6731    //     pcmpmask - cmp mode: 11000 (string compare with negated result)
6732    //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
6733    //   outputs:
6734    //     rcx - first mismatched element index
6735    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
6736
6737    bind(COMPARE_WIDE_VECTORS);
6738    movdqu(vec1, Address(str1, result, scale));
6739    pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
6740    // After pcmpestri cnt1(rcx) contains mismatched element index
6741
6742    jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
6743    addptr(result, stride);
6744    subptr(cnt2, stride);
6745    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
6746
6747    // compare wide vectors tail
6748    testptr(result, result);
6749    jccb(Assembler::zero, LENGTH_DIFF_LABEL);
6750
6751    movl(cnt2, stride);
6752    movl(result, stride);
6753    negptr(result);
6754    movdqu(vec1, Address(str1, result, scale));
6755    pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
6756    jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
6757
6758    // Mismatched characters in the vectors
6759    bind(VECTOR_NOT_EQUAL);
6760    addptr(cnt1, result);
6761    load_unsigned_short(result, Address(str1, cnt1, scale));
6762    load_unsigned_short(cnt2, Address(str2, cnt1, scale));
6763    subl(result, cnt2);
6764    jmpb(POP_LABEL);
6765
6766    bind(COMPARE_TAIL); // limit is zero
6767    movl(cnt2, result);
6768    // Fallthru to tail compare
6769  }
6770  // Shift str2 and str1 to the end of the arrays, negate min
6771  lea(str1, Address(str1, cnt2, scale));
6772  lea(str2, Address(str2, cnt2, scale));
6773  decrementl(cnt2);  // first character was compared already
6774  negptr(cnt2);
6775
6776  // Compare the rest of the elements
6777  bind(WHILE_HEAD_LABEL);
6778  load_unsigned_short(result, Address(str1, cnt2, scale, 0));
6779  load_unsigned_short(cnt1, Address(str2, cnt2, scale, 0));
6780  subl(result, cnt1);
6781  jccb(Assembler::notZero, POP_LABEL);
6782  increment(cnt2);
6783  jccb(Assembler::notZero, WHILE_HEAD_LABEL);
6784
6785  // Strings are equal up to min length.  Return the length difference.
6786  bind(LENGTH_DIFF_LABEL);
6787  pop(result);
6788  jmpb(DONE_LABEL);
6789
6790  // Discard the stored length difference
6791  bind(POP_LABEL);
6792  pop(cnt1);
6793
6794  // That's it
6795  bind(DONE_LABEL);
6796}
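// At the Java level, string_compare() computes roughly (sketch of the intended
// result, matching String.compareTo semantics):
//
//   int min = Math.min(cnt1, cnt2);
//   for (int i = 0; i < min; i++) {
//     if (str1[i] != str2[i]) return str1[i] - str2[i];
//   }
//   return cnt1 - cnt2;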
6797
6798// Compare char[] arrays aligned to 4 bytes or substrings.
6799void MacroAssembler::char_arrays_equals(bool is_array_equ, Register ary1, Register ary2,
6800                                        Register limit, Register result, Register chr,
6801                                        XMMRegister vec1, XMMRegister vec2) {
6802  ShortBranchVerifier sbv(this);
6803  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR;
6804
6805  int length_offset  = arrayOopDesc::length_offset_in_bytes();
6806  int base_offset    = arrayOopDesc::base_offset_in_bytes(T_CHAR);
6807
6808  // Check the input args
6809  cmpptr(ary1, ary2);
6810  jcc(Assembler::equal, TRUE_LABEL);
6811
6812  if (is_array_equ) {
6813    // Need additional checks for arrays_equals.
6814    testptr(ary1, ary1);
6815    jcc(Assembler::zero, FALSE_LABEL);
6816    testptr(ary2, ary2);
6817    jcc(Assembler::zero, FALSE_LABEL);
6818
6819    // Check the lengths
6820    movl(limit, Address(ary1, length_offset));
6821    cmpl(limit, Address(ary2, length_offset));
6822    jcc(Assembler::notEqual, FALSE_LABEL);
6823  }
6824
6825  // count == 0
6826  testl(limit, limit);
6827  jcc(Assembler::zero, TRUE_LABEL);
6828
6829  if (is_array_equ) {
6830    // Load array address
6831    lea(ary1, Address(ary1, base_offset));
6832    lea(ary2, Address(ary2, base_offset));
6833  }
6834
6835  shll(limit, 1);      // byte count != 0
6836  movl(result, limit); // copy
6837
6838  if (UseAVX >= 2) {
6839    // With AVX2, use 32-byte vector compare
6840    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
6841
6842    // Compare 32-byte vectors
6843    andl(result, 0x0000001e);  //   tail count (in bytes)
6844    andl(limit, 0xffffffe0);   // vector count (in bytes)
6845    jccb(Assembler::zero, COMPARE_TAIL);
6846
6847    lea(ary1, Address(ary1, limit, Address::times_1));
6848    lea(ary2, Address(ary2, limit, Address::times_1));
6849    negptr(limit);
6850
6851    bind(COMPARE_WIDE_VECTORS);
6852    vmovdqu(vec1, Address(ary1, limit, Address::times_1));
6853    vmovdqu(vec2, Address(ary2, limit, Address::times_1));
6854    vpxor(vec1, vec2);
6855
6856    vptest(vec1, vec1);
6857    jccb(Assembler::notZero, FALSE_LABEL);
6858    addptr(limit, 32);
6859    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
6860
6861    testl(result, result);
6862    jccb(Assembler::zero, TRUE_LABEL);
6863
6864    vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
6865    vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
6866    vpxor(vec1, vec2);
6867
6868    vptest(vec1, vec1);
6869    jccb(Assembler::notZero, FALSE_LABEL);
6870    jmpb(TRUE_LABEL);
6871
6872    bind(COMPARE_TAIL); // limit is zero
6873    movl(limit, result);
6874    // Fallthru to tail compare
6875  } else if (UseSSE42Intrinsics) {
6876    // With SSE4.2, use double quad vector compare
6877    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
6878
6879    // Compare 16-byte vectors
6880    andl(result, 0x0000000e);  //   tail count (in bytes)
6881    andl(limit, 0xfffffff0);   // vector count (in bytes)
6882    jccb(Assembler::zero, COMPARE_TAIL);
6883
6884    lea(ary1, Address(ary1, limit, Address::times_1));
6885    lea(ary2, Address(ary2, limit, Address::times_1));
6886    negptr(limit);
6887
6888    bind(COMPARE_WIDE_VECTORS);
6889    movdqu(vec1, Address(ary1, limit, Address::times_1));
6890    movdqu(vec2, Address(ary2, limit, Address::times_1));
6891    pxor(vec1, vec2);
6892
6893    ptest(vec1, vec1);
6894    jccb(Assembler::notZero, FALSE_LABEL);
6895    addptr(limit, 16);
6896    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
6897
6898    testl(result, result);
6899    jccb(Assembler::zero, TRUE_LABEL);
6900
6901    movdqu(vec1, Address(ary1, result, Address::times_1, -16));
6902    movdqu(vec2, Address(ary2, result, Address::times_1, -16));
6903    pxor(vec1, vec2);
6904
6905    ptest(vec1, vec1);
6906    jccb(Assembler::notZero, FALSE_LABEL);
6907    jmpb(TRUE_LABEL);
6908
6909    bind(COMPARE_TAIL); // limit is zero
6910    movl(limit, result);
6911    // Fallthru to tail compare
6912  }
6913
6914  // Compare 4-byte vectors
6915  andl(limit, 0xfffffffc); // vector count (in bytes)
6916  jccb(Assembler::zero, COMPARE_CHAR);
6917
6918  lea(ary1, Address(ary1, limit, Address::times_1));
6919  lea(ary2, Address(ary2, limit, Address::times_1));
6920  negptr(limit);
6921
6922  bind(COMPARE_VECTORS);
6923  movl(chr, Address(ary1, limit, Address::times_1));
6924  cmpl(chr, Address(ary2, limit, Address::times_1));
6925  jccb(Assembler::notEqual, FALSE_LABEL);
6926  addptr(limit, 4);
6927  jcc(Assembler::notZero, COMPARE_VECTORS);
6928
6929  // Compare trailing char (final 2 bytes), if any
6930  bind(COMPARE_CHAR);
6931  testl(result, 0x2);   // tail  char
6932  jccb(Assembler::zero, TRUE_LABEL);
6933  load_unsigned_short(chr, Address(ary1, 0));
6934  load_unsigned_short(limit, Address(ary2, 0));
6935  cmpl(chr, limit);
6936  jccb(Assembler::notEqual, FALSE_LABEL);
6937
6938  bind(TRUE_LABEL);
6939  movl(result, 1);   // return true
6940  jmpb(DONE);
6941
6942  bind(FALSE_LABEL);
6943  xorl(result, result); // return false
6944
6945  // That's it
6946  bind(DONE);
6947  if (UseAVX >= 2) {
6948    // clean upper bits of YMM registers
6949    vzeroupper();
6950  }
6951}
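// At the Java level, char_arrays_equals() computes roughly (sketch; 'limit' is
// the char count for the substring case):
//
//   if (ary1 == ary2) return true;
//   if (is_array_equ && (ary1 == null || ary2 == null || ary1.length != ary2.length))
//     return false;
//   for (int i = 0; i < limit; i++) {
//     if (ary1[i] != ary2[i]) return false;
//   }
//   return true;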
6952
6953void MacroAssembler::generate_fill(BasicType t, bool aligned,
6954                                   Register to, Register value, Register count,
6955                                   Register rtmp, XMMRegister xtmp) {
6956  ShortBranchVerifier sbv(this);
6957  assert_different_registers(to, value, count, rtmp);
6958  Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
6959  Label L_fill_2_bytes, L_fill_4_bytes;
6960
6961  int shift = -1;
6962  switch (t) {
6963    case T_BYTE:
6964      shift = 2;
6965      break;
6966    case T_SHORT:
6967      shift = 1;
6968      break;
6969    case T_INT:
6970      shift = 0;
6971      break;
6972    default: ShouldNotReachHere();
6973  }
6974
6975  if (t == T_BYTE) {
6976    andl(value, 0xff);
6977    movl(rtmp, value);
6978    shll(rtmp, 8);
6979    orl(value, rtmp);
6980  }
6981  if (t == T_SHORT) {
6982    andl(value, 0xffff);
6983  }
6984  if (t == T_BYTE || t == T_SHORT) {
6985    movl(rtmp, value);
6986    shll(rtmp, 16);
6987    orl(value, rtmp);
6988  }
6989
6990  cmpl(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
6991  jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
6992  if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
6993    // align source address at 4 bytes address boundary
6994    if (t == T_BYTE) {
6995      // One-byte misalignment happens only for byte arrays
6996      testptr(to, 1);
6997      jccb(Assembler::zero, L_skip_align1);
6998      movb(Address(to, 0), value);
6999      increment(to);
7000      decrement(count);
7001      BIND(L_skip_align1);
7002    }
7003    // Two-byte misalignment happens only for byte and short (char) arrays
7004    testptr(to, 2);
7005    jccb(Assembler::zero, L_skip_align2);
7006    movw(Address(to, 0), value);
7007    addptr(to, 2);
7008    subl(count, 1<<(shift-1));
7009    BIND(L_skip_align2);
7010  }
7011  if (UseSSE < 2) {
7012    Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
7013    // Fill 32-byte chunks
7014    subl(count, 8 << shift);
7015    jcc(Assembler::less, L_check_fill_8_bytes);
7016    align(16);
7017
7018    BIND(L_fill_32_bytes_loop);
7019
7020    for (int i = 0; i < 32; i += 4) {
7021      movl(Address(to, i), value);
7022    }
7023
7024    addptr(to, 32);
7025    subl(count, 8 << shift);
7026    jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
7027    BIND(L_check_fill_8_bytes);
7028    addl(count, 8 << shift);
7029    jccb(Assembler::zero, L_exit);
7030    jmpb(L_fill_8_bytes);
7031
7032    //
7033    // length is too short, just fill qwords
7034    //
7035    BIND(L_fill_8_bytes_loop);
7036    movl(Address(to, 0), value);
7037    movl(Address(to, 4), value);
7038    addptr(to, 8);
7039    BIND(L_fill_8_bytes);
7040    subl(count, 1 << (shift + 1));
7041    jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
7042    // fall through to fill 4 bytes
7043  } else {
7044    Label L_fill_32_bytes;
7045    if (!UseUnalignedLoadStores) {
7046      // align to 8 bytes, we know we are 4 byte aligned to start
7047      testptr(to, 4);
7048      jccb(Assembler::zero, L_fill_32_bytes);
7049      movl(Address(to, 0), value);
7050      addptr(to, 4);
7051      subl(count, 1<<shift);
7052    }
7053    BIND(L_fill_32_bytes);
7054    {
7055      assert( UseSSE >= 2, "supported cpu only" );
7056      Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
7057      movdl(xtmp, value);
7058      if (UseAVX >= 2 && UseUnalignedLoadStores) {
7059        // Fill 64-byte chunks
7060        Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
7061        vpbroadcastd(xtmp, xtmp);
7062
7063        subl(count, 16 << shift);
7064        jcc(Assembler::less, L_check_fill_32_bytes);
7065        align(16);
7066
7067        BIND(L_fill_64_bytes_loop);
7068        vmovdqu(Address(to, 0), xtmp);
7069        vmovdqu(Address(to, 32), xtmp);
7070        addptr(to, 64);
7071        subl(count, 16 << shift);
7072        jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
7073
7074        BIND(L_check_fill_32_bytes);
7075        addl(count, 8 << shift);
7076        jccb(Assembler::less, L_check_fill_8_bytes);
7077        vmovdqu(Address(to, 0), xtmp);
7078        addptr(to, 32);
7079        subl(count, 8 << shift);
7080
7081        BIND(L_check_fill_8_bytes);
7082        // clean upper bits of YMM registers
7083        vzeroupper();
7084      } else {
7085        // Fill 32-byte chunks
7086        pshufd(xtmp, xtmp, 0);
7087
7088        subl(count, 8 << shift);
7089        jcc(Assembler::less, L_check_fill_8_bytes);
7090        align(16);
7091
7092        BIND(L_fill_32_bytes_loop);
7093
7094        if (UseUnalignedLoadStores) {
7095          movdqu(Address(to, 0), xtmp);
7096          movdqu(Address(to, 16), xtmp);
7097        } else {
7098          movq(Address(to, 0), xtmp);
7099          movq(Address(to, 8), xtmp);
7100          movq(Address(to, 16), xtmp);
7101          movq(Address(to, 24), xtmp);
7102        }
7103
7104        addptr(to, 32);
7105        subl(count, 8 << shift);
7106        jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
7107
7108        BIND(L_check_fill_8_bytes);
7109      }
7110      addl(count, 8 << shift);
7111      jccb(Assembler::zero, L_exit);
7112      jmpb(L_fill_8_bytes);
7113
7114      //
7115      // length is too short, just fill qwords
7116      //
7117      BIND(L_fill_8_bytes_loop);
7118      movq(Address(to, 0), xtmp);
7119      addptr(to, 8);
7120      BIND(L_fill_8_bytes);
7121      subl(count, 1 << (shift + 1));
7122      jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
7123    }
7124  }
7125  // fill trailing 4 bytes
7126  BIND(L_fill_4_bytes);
7127  testl(count, 1<<shift);
7128  jccb(Assembler::zero, L_fill_2_bytes);
7129  movl(Address(to, 0), value);
7130  if (t == T_BYTE || t == T_SHORT) {
7131    addptr(to, 4);
7132    BIND(L_fill_2_bytes);
7133    // fill trailing 2 bytes
7134    testl(count, 1<<(shift-1));
7135    jccb(Assembler::zero, L_fill_byte);
7136    movw(Address(to, 0), value);
7137    if (t == T_BYTE) {
7138      addptr(to, 2);
7139      BIND(L_fill_byte);
7140      // fill trailing byte
7141      testl(count, 1);
7142      jccb(Assembler::zero, L_exit);
7143      movb(Address(to, 0), value);
7144    } else {
7145      BIND(L_fill_byte);
7146    }
7147  } else {
7148    BIND(L_fill_2_bytes);
7149  }
7150  BIND(L_exit);
7151}
7152
7153// encode char[] to byte[] in ISO_8859_1
7154void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
7155                                      XMMRegister tmp1Reg, XMMRegister tmp2Reg,
7156                                      XMMRegister tmp3Reg, XMMRegister tmp4Reg,
7157                                      Register tmp5, Register result) {
7158  // rsi: src
7159  // rdi: dst
7160  // rdx: len
7161  // rcx: tmp5
7162  // rax: result
7163  ShortBranchVerifier sbv(this);
7164  assert_different_registers(src, dst, len, tmp5, result);
7165  Label L_done, L_copy_1_char, L_copy_1_char_exit;
7166
7167  // set result
7168  xorl(result, result);
7169  // check for zero length
7170  testl(len, len);
7171  jcc(Assembler::zero, L_done);
7172  movl(result, len);
7173
7174  // Setup pointers
7175  lea(src, Address(src, len, Address::times_2)); // char[]
7176  lea(dst, Address(dst, len, Address::times_1)); // byte[]
7177  negptr(len);
7178
7179  if (UseSSE42Intrinsics || UseAVX >= 2) {
7180    Label L_chars_8_check, L_copy_8_chars, L_copy_8_chars_exit;
7181    Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
7182
7183    if (UseAVX >= 2) {
7184      Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
7185      movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
7186      movdl(tmp1Reg, tmp5);
7187      vpbroadcastd(tmp1Reg, tmp1Reg);
7188      jmpb(L_chars_32_check);
7189
7190      bind(L_copy_32_chars);
7191      vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
7192      vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
7193      vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector256 */ true);
7194      vptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in  vector
7195      jccb(Assembler::notZero, L_copy_32_chars_exit);
7196      vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector256 */ true);
7197      vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector256 */ true);
7198      vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
7199
7200      bind(L_chars_32_check);
7201      addptr(len, 32);
7202      jccb(Assembler::lessEqual, L_copy_32_chars);
7203
7204      bind(L_copy_32_chars_exit);
7205      subptr(len, 16);
7206      jccb(Assembler::greater, L_copy_16_chars_exit);
7207
7208    } else if (UseSSE42Intrinsics) {
7209      movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
7210      movdl(tmp1Reg, tmp5);
7211      pshufd(tmp1Reg, tmp1Reg, 0);
7212      jmpb(L_chars_16_check);
7213    }
7214
7215    bind(L_copy_16_chars);
7216    if (UseAVX >= 2) {
7217      vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
7218      vptest(tmp2Reg, tmp1Reg);
7219      jccb(Assembler::notZero, L_copy_16_chars_exit);
7220      vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector256 */ true);
7221      vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector256 */ true);
7222    } else {
7223      if (UseAVX > 0) {
7224        movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
7225        movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
7226        vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector256 */ false);
7227      } else {
7228        movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
7229        por(tmp2Reg, tmp3Reg);
7230        movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
7231        por(tmp2Reg, tmp4Reg);
7232      }
7233      ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in  vector
7234      jccb(Assembler::notZero, L_copy_16_chars_exit);
7235      packuswb(tmp3Reg, tmp4Reg);
7236    }
7237    movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);
7238
7239    bind(L_chars_16_check);
7240    addptr(len, 16);
7241    jccb(Assembler::lessEqual, L_copy_16_chars);
7242
7243    bind(L_copy_16_chars_exit);
7244    if (UseAVX >= 2) {
7245      // clean upper bits of YMM registers
7246      vzeroupper();
7247    }
7248    subptr(len, 8);
7249    jccb(Assembler::greater, L_copy_8_chars_exit);
7250
7251    bind(L_copy_8_chars);
7252    movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
7253    ptest(tmp3Reg, tmp1Reg);
7254    jccb(Assembler::notZero, L_copy_8_chars_exit);
7255    packuswb(tmp3Reg, tmp1Reg);
7256    movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
7257    addptr(len, 8);
7258    jccb(Assembler::lessEqual, L_copy_8_chars);
7259
7260    bind(L_copy_8_chars_exit);
7261    subptr(len, 8);
7262    jccb(Assembler::zero, L_done);
7263  }
7264
7265  bind(L_copy_1_char);
7266  load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
7267  testl(tmp5, 0xff00);      // check if Unicode char
7268  jccb(Assembler::notZero, L_copy_1_char_exit);
7269  movb(Address(dst, len, Address::times_1, 0), tmp5);
7270  addptr(len, 1);
7271  jccb(Assembler::less, L_copy_1_char);
7272
7273  bind(L_copy_1_char_exit);
7274  addptr(result, len); // len holds the negative count of unprocessed elements
7275  bind(L_done);
7276}
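// At the Java level, encode_iso_array() computes roughly (sketch):
//
//   int i = 0;
//   for (; i < len; i++) {
//     char c = src[i];
//     if (c > 0xFF) break;      // non-latin1 char stops the copy
//     dst[i] = (byte)c;
//   }
//   return i;                   // number of chars successfully encoded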
7277
7278#ifdef _LP64
7279/**
7280 * Helper for multiply_to_len().
7281 */
7282void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
7283  addq(dest_lo, src1);
7284  adcq(dest_hi, 0);
7285  addq(dest_lo, src2);
7286  adcq(dest_hi, 0);
7287}
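// add2_with_carry() accumulates two 64-bit addends into a 128-bit value
// (sketch): dest_hi:dest_lo += zext(src1) + zext(src2), with each addition's
// carry propagated into dest_hi.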
7288
7289/**
7290 * Multiply 64 bit by 64 bit first loop.
7291 */
7292void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
7293                                           Register y, Register y_idx, Register z,
7294                                           Register carry, Register product,
7295                                           Register idx, Register kdx) {
7296  //
7297  //  jlong carry, x[], y[], z[];
7298  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
7299  //    huge_128 product = y[idx] * x[xstart] + carry;
7300  //    z[kdx] = (jlong)product;
7301  //    carry  = (jlong)(product >>> 64);
7302  //  }
7303  //  z[xstart] = carry;
7304  //
7305
7306  Label L_first_loop, L_first_loop_exit;
7307  Label L_one_x, L_one_y, L_multiply;
7308
7309  decrementl(xstart);
7310  jcc(Assembler::negative, L_one_x);
7311
7312  movq(x_xstart, Address(x, xstart, Address::times_4,  0));
7313  rorq(x_xstart, 32); // convert big-endian to little-endian
7314
7315  bind(L_first_loop);
7316  decrementl(idx);
7317  jcc(Assembler::negative, L_first_loop_exit);
7318  decrementl(idx);
7319  jcc(Assembler::negative, L_one_y);
7320  movq(y_idx, Address(y, idx, Address::times_4,  0));
7321  rorq(y_idx, 32); // convert big-endian to little-endian
7322  bind(L_multiply);
7323  movq(product, x_xstart);
7324  mulq(y_idx); // product(rax) * y_idx -> rdx:rax
7325  addq(product, carry);
7326  adcq(rdx, 0);
7327  subl(kdx, 2);
7328  movl(Address(z, kdx, Address::times_4,  4), product);
7329  shrq(product, 32);
7330  movl(Address(z, kdx, Address::times_4,  0), product);
7331  movq(carry, rdx);
7332  jmp(L_first_loop);
7333
7334  bind(L_one_y);
7335  movl(y_idx, Address(y,  0));
7336  jmp(L_multiply);
7337
7338  bind(L_one_x);
7339  movl(x_xstart, Address(x,  0));
7340  jmp(L_first_loop);
7341
7342  bind(L_first_loop_exit);
7343}
7344
7345/**
7346 * Multiply 64 bit by 64 bit and add 128 bit.
7347 */
7348void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
7349                                            Register yz_idx, Register idx,
7350                                            Register carry, Register product, int offset) {
7351  //     huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
7352  //     z[kdx] = (jlong)product;
7353
7354  movq(yz_idx, Address(y, idx, Address::times_4,  offset));
7355  rorq(yz_idx, 32); // convert big-endian to little-endian
7356  movq(product, x_xstart);
7357  mulq(yz_idx);     // product(rax) * yz_idx -> rdx:product(rax)
7358  movq(yz_idx, Address(z, idx, Address::times_4,  offset));
7359  rorq(yz_idx, 32); // convert big-endian to little-endian
7360
7361  add2_with_carry(rdx, product, carry, yz_idx);
7362
7363  movl(Address(z, idx, Address::times_4,  offset+4), product);
7364  shrq(product, 32);
7365  movl(Address(z, idx, Address::times_4,  offset), product);
7366
7367}
7368
7369/**
7370 * Multiply 128 bit by 128 bit. Unrolled inner loop.
7371 */
7372void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
7373                                             Register yz_idx, Register idx, Register jdx,
7374                                             Register carry, Register product,
7375                                             Register carry2) {
7376  //   jlong carry, x[], y[], z[];
7377  //   int kdx = ystart+1;
7378  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
7379  //     huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
7380  //     z[kdx+idx+1] = (jlong)product;
7381  //     jlong carry2  = (jlong)(product >>> 64);
7382  //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
7383  //     z[kdx+idx] = (jlong)product;
7384  //     carry  = (jlong)(product >>> 64);
7385  //   }
7386  //   idx += 2;
7387  //   if (idx > 0) {
7388  //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
7389  //     z[kdx+idx] = (jlong)product;
7390  //     carry  = (jlong)(product >>> 64);
7391  //   }
7392  //
7393
7394  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
7395
7396  movl(jdx, idx);
7397  andl(jdx, 0xFFFFFFFC);
7398  shrl(jdx, 2);
7399
7400  bind(L_third_loop);
7401  subl(jdx, 1);
7402  jcc(Assembler::negative, L_third_loop_exit);
7403  subl(idx, 4);
7404
7405  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
7406  movq(carry2, rdx);
7407
7408  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
7409  movq(carry, rdx);
7410  jmp(L_third_loop);
7411
7412  bind (L_third_loop_exit);
7413
7414  andl (idx, 0x3);
7415  jcc(Assembler::zero, L_post_third_loop_done);
7416
7417  Label L_check_1;
7418  subl(idx, 2);
7419  jcc(Assembler::negative, L_check_1);
7420
7421  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
7422  movq(carry, rdx);
7423
7424  bind (L_check_1);
7425  addl (idx, 0x2);
7426  andl (idx, 0x1);
7427  subl(idx, 1);
7428  jcc(Assembler::negative, L_post_third_loop_done);
7429
7430  movl(yz_idx, Address(y, idx, Address::times_4,  0));
7431  movq(product, x_xstart);
7432  mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
7433  movl(yz_idx, Address(z, idx, Address::times_4,  0));
7434
7435  add2_with_carry(rdx, product, yz_idx, carry);
7436
7437  movl(Address(z, idx, Address::times_4,  0), product);
7438  shrq(product, 32);
7439
7440  shlq(rdx, 32);
7441  orq(product, rdx);
7442  movq(carry, product);
7443
7444  bind(L_post_third_loop_done);
7445}
7446
7447/**
7448 * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
7449 *
7450 */
7451void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
7452                                                  Register carry, Register carry2,
7453                                                  Register idx, Register jdx,
7454                                                  Register yz_idx1, Register yz_idx2,
7455                                                  Register tmp, Register tmp3, Register tmp4) {
7456  assert(UseBMI2Instructions, "should be used only when BMI2 is available");
7457
7458  //   jlong carry, x[], y[], z[];
7459  //   int kdx = ystart+1;
7460  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
7461  //     huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
7462  //     jlong carry2  = (jlong)(tmp3 >>> 64);
7463  //     huge_128 tmp4 = (y[idx]   * rdx) + z[kdx+idx] + carry2;
7464  //     carry  = (jlong)(tmp4 >>> 64);
7465  //     z[kdx+idx+1] = (jlong)tmp3;
7466  //     z[kdx+idx] = (jlong)tmp4;
7467  //   }
7468  //   idx += 2;
7469  //   if (idx > 0) {
7470  //     yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
7471  //     z[kdx+idx] = (jlong)yz_idx1;
7472  //     carry  = (jlong)(yz_idx1 >>> 64);
7473  //   }
7474  //
7475
7476  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
7477
7478  movl(jdx, idx);
7479  andl(jdx, 0xFFFFFFFC);
7480  shrl(jdx, 2);
7481
7482  bind(L_third_loop);
7483  subl(jdx, 1);
7484  jcc(Assembler::negative, L_third_loop_exit);
7485  subl(idx, 4);
7486
7487  movq(yz_idx1,  Address(y, idx, Address::times_4,  8));
7488  rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
7489  movq(yz_idx2, Address(y, idx, Address::times_4,  0));
7490  rorxq(yz_idx2, yz_idx2, 32);
7491
7492  mulxq(tmp4, tmp3, yz_idx1);  //  yz_idx1 * rdx -> tmp4:tmp3
7493  mulxq(carry2, tmp, yz_idx2); //  yz_idx2 * rdx -> carry2:tmp
7494
7495  movq(yz_idx1,  Address(z, idx, Address::times_4,  8));
7496  rorxq(yz_idx1, yz_idx1, 32);
7497  movq(yz_idx2, Address(z, idx, Address::times_4,  0));
7498  rorxq(yz_idx2, yz_idx2, 32);
7499
7500  if (VM_Version::supports_adx()) {
7501    adcxq(tmp3, carry);
7502    adoxq(tmp3, yz_idx1);
7503
7504    adcxq(tmp4, tmp);
7505    adoxq(tmp4, yz_idx2);
7506
7507    movl(carry, 0); // does not affect flags
7508    adcxq(carry2, carry);
7509    adoxq(carry2, carry);
7510  } else {
7511    add2_with_carry(tmp4, tmp3, carry, yz_idx1);
7512    add2_with_carry(carry2, tmp4, tmp, yz_idx2);
7513  }
7514  movq(carry, carry2);
7515
7516  movl(Address(z, idx, Address::times_4, 12), tmp3);
7517  shrq(tmp3, 32);
7518  movl(Address(z, idx, Address::times_4,  8), tmp3);
7519
7520  movl(Address(z, idx, Address::times_4,  4), tmp4);
7521  shrq(tmp4, 32);
7522  movl(Address(z, idx, Address::times_4,  0), tmp4);
7523
7524  jmp(L_third_loop);
7525
7526  bind (L_third_loop_exit);
7527
7528  andl (idx, 0x3);
7529  jcc(Assembler::zero, L_post_third_loop_done);
7530
7531  Label L_check_1;
7532  subl(idx, 2);
7533  jcc(Assembler::negative, L_check_1);
7534
7535  movq(yz_idx1, Address(y, idx, Address::times_4,  0));
7536  rorxq(yz_idx1, yz_idx1, 32);
7537  mulxq(tmp4, tmp3, yz_idx1); //  yz_idx1 * rdx -> tmp4:tmp3
7538  movq(yz_idx2, Address(z, idx, Address::times_4,  0));
7539  rorxq(yz_idx2, yz_idx2, 32);
7540
7541  add2_with_carry(tmp4, tmp3, carry, yz_idx2);
7542
7543  movl(Address(z, idx, Address::times_4,  4), tmp3);
7544  shrq(tmp3, 32);
7545  movl(Address(z, idx, Address::times_4,  0), tmp3);
7546  movq(carry, tmp4);
7547
7548  bind (L_check_1);
7549  addl (idx, 0x2);
7550  andl (idx, 0x1);
7551  subl(idx, 1);
7552  jcc(Assembler::negative, L_post_third_loop_done);
7553  movl(tmp4, Address(y, idx, Address::times_4,  0));
7554  mulxq(carry2, tmp3, tmp4);  //  tmp4 * rdx -> carry2:tmp3
7555  movl(tmp4, Address(z, idx, Address::times_4,  0));
7556
7557  add2_with_carry(carry2, tmp3, tmp4, carry);
7558
7559  movl(Address(z, idx, Address::times_4,  0), tmp3);
7560  shrq(tmp3, 32);
7561
7562  shlq(carry2, 32);
7563  orq(tmp3, carry2);
7564  movq(carry, tmp3);
7565
7566  bind(L_post_third_loop_done);
7567}
7568
7569/**
7570 * Code for BigInteger::multiplyToLen() intrinsic.
7571 *
7572 * rdi: x
7573 * rax: xlen
7574 * rsi: y
7575 * rcx: ylen
7576 * r8:  z
7577 * r11: zlen
7578 * r12: tmp1
7579 * r13: tmp2
7580 * r14: tmp3
7581 * r15: tmp4
7582 * rbx: tmp5
7583 *
7584 */
7585void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
7586                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
7587  ShortBranchVerifier sbv(this);
7588  assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);
7589
7590  push(tmp1);
7591  push(tmp2);
7592  push(tmp3);
7593  push(tmp4);
7594  push(tmp5);
7595
7596  push(xlen);
7597  push(zlen);
7598
7599  const Register idx = tmp1;
7600  const Register kdx = tmp2;
7601  const Register xstart = tmp3;
7602
7603  const Register y_idx = tmp4;
7604  const Register carry = tmp5;
7605  const Register product  = xlen;
7606  const Register x_xstart = zlen;  // reuse register
7607
7608  // First Loop.
7609  //
7610  //  final static long LONG_MASK = 0xffffffffL;
7611  //  int xstart = xlen - 1;
7612  //  int ystart = ylen - 1;
7613  //  long carry = 0;
7614  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
7615  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
7616  //    z[kdx] = (int)product;
7617  //    carry = product >>> 32;
7618  //  }
7619  //  z[xstart] = (int)carry;
7620  //
7621
7622  movl(idx, ylen);      // idx = ylen;
7623  movl(kdx, zlen);      // kdx = xlen+ylen;
7624  xorq(carry, carry);   // carry = 0;
7625
7626  Label L_done;
7627
7628  movl(xstart, xlen);
7629  decrementl(xstart);
7630  jcc(Assembler::negative, L_done);
7631
7632  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
7633
7634  Label L_second_loop;
7635  testl(kdx, kdx);
7636  jcc(Assembler::zero, L_second_loop);
7637
7638  Label L_carry;
7639  subl(kdx, 1);
7640  jcc(Assembler::zero, L_carry);
7641
7642  movl(Address(z, kdx, Address::times_4,  0), carry);
7643  shrq(carry, 32);
7644  subl(kdx, 1);
7645
7646  bind(L_carry);
7647  movl(Address(z, kdx, Address::times_4,  0), carry);
7648
7649  // Second and third (nested) loops.
7650  //
7651  // for (int i = xstart-1; i >= 0; i--) { // Second loop
7652  //   carry = 0;
7653  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
7654  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
7655  //                    (z[k] & LONG_MASK) + carry;
7656  //     z[k] = (int)product;
7657  //     carry = product >>> 32;
7658  //   }
7659  //   z[i] = (int)carry;
7660  // }
7661  //
7662  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
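  //
  // In the BMI2 path x[i] really is kept in rdx, since mulx uses rdx as its
  // implicit multiplicand; without BMI2 the value is held in x_xstart instead.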
7663
7664  const Register jdx = tmp1;
7665
7666  bind(L_second_loop);
7667  xorl(carry, carry);    // carry = 0;
7668  movl(jdx, ylen);       // j = ystart+1
7669
7670  subl(xstart, 1);       // i = xstart-1;
7671  jcc(Assembler::negative, L_done);
7672
7673  push (z);
7674
7675  Label L_last_x;
7676  lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
7677  subl(xstart, 1);       // i = xstart-1;
7678  jcc(Assembler::negative, L_last_x);
7679
7680  if (UseBMI2Instructions) {
7681    movq(rdx,  Address(x, xstart, Address::times_4,  0));
7682    rorxq(rdx, rdx, 32); // convert big-endian to little-endian
7683  } else {
7684    movq(x_xstart, Address(x, xstart, Address::times_4,  0));
7685    rorq(x_xstart, 32);  // convert big-endian to little-endian
7686  }
7687
7688  Label L_third_loop_prologue;
7689  bind(L_third_loop_prologue);
7690
7691  push (x);
7692  push (xstart);
7693  push (ylen);
7694
7695
7696  if (UseBMI2Instructions) {
7697    multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
7698  } else { // !UseBMI2Instructions
7699    multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
7700  }
7701
7702  pop(ylen);
7703  pop(xlen);
7704  pop(x);
7705  pop(z);
7706
7707  movl(tmp3, xlen);
7708  addl(tmp3, 1);
7709  movl(Address(z, tmp3, Address::times_4,  0), carry);
7710  subl(tmp3, 1);
7711  jccb(Assembler::negative, L_done);
7712
7713  shrq(carry, 32);
7714  movl(Address(z, tmp3, Address::times_4,  0), carry);
7715  jmp(L_second_loop);
7716
7717  // Next infrequent code is moved outside loops.
7718  bind(L_last_x);
7719  if (UseBMI2Instructions) {
7720    movl(rdx, Address(x,  0));
7721  } else {
7722    movl(x_xstart, Address(x,  0));
7723  }
7724  jmp(L_third_loop_prologue);
7725
7726  bind(L_done);
7727
7728  pop(zlen);
7729  pop(xlen);
7730
7731  pop(tmp5);
7732  pop(tmp4);
7733  pop(tmp3);
7734  pop(tmp2);
7735  pop(tmp1);
7736}
7737#endif
7738
7739/**
7740 * Emits code to update CRC-32 with a byte value according to constants in table
7741 *
7742 * @param [in,out] crc  Register containing the crc.
7743 * @param [in] val      Register containing the byte to fold into the CRC.
7744 * @param [in] table    Register containing the table of crc constants.
7745 *
7746 * uint32_t crc;
7747 * val = crc_table[(val ^ crc) & 0xFF];
7748 * crc = val ^ (crc >> 8);
7749 *
7750 */
7751void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
7752  xorl(val, crc);
7753  andl(val, 0xFF);
7754  shrl(crc, 8); // unsigned shift
7755  xorl(crc, Address(table, val, Address::times_4, 0));
7756}
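
// The same update written as a plain C helper (an illustrative sketch,
// assuming the 256-entry little-endian table that crc_table_addr() returns):
//
//   static inline uint32_t update_byte_crc32_c(uint32_t crc, uint8_t val,
//                                              const uint32_t* table) {
//     return table[(val ^ crc) & 0xFF] ^ (crc >> 8);
//   }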
7757
7758/**
7759 * Fold 128-bit data chunk
7760 */
7761void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
7762  if (UseAVX > 0) {
7763    vpclmulhdq(xtmp, xK, xcrc); // [123:64]
7764    vpclmulldq(xcrc, xK, xcrc); // [63:0]
7765    vpxor(xcrc, xcrc, Address(buf, offset), false /* vector256 */);
7766    pxor(xcrc, xtmp);
7767  } else {
7768    movdqa(xtmp, xcrc);
7769    pclmulhdq(xtmp, xK);   // [123:64]
7770    pclmulldq(xcrc, xK);   // [63:0]
7771    pxor(xcrc, xtmp);
7772    movdqu(xtmp, Address(buf, offset));
7773    pxor(xcrc, xtmp);
7774  }
7775}
7776
7777void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
7778  if (UseAVX > 0) {
7779    vpclmulhdq(xtmp, xK, xcrc);
7780    vpclmulldq(xcrc, xK, xcrc);
7781    pxor(xcrc, xbuf);
7782    pxor(xcrc, xtmp);
7783  } else {
7784    movdqa(xtmp, xcrc);
7785    pclmulhdq(xtmp, xK);
7786    pclmulldq(xcrc, xK);
7787    pxor(xcrc, xbuf);
7788    pxor(xcrc, xtmp);
7789  }
7790}
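
// Both fold_128bit_crc32 variants implement the standard carry-less-multiply
// folding step: the 128-bit CRC state is multiplied by the constant pair held
// in xK (its two 64-bit halves fold the high and low quadwords of xcrc) and
// XORed with the next 128 bits of data, so the running value stays congruent
// to the CRC of everything processed so far.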
7791
7792/**
7793 * 8-bit folds to compute 32-bit CRC
7794 *
7795 * uint64_t xcrc;
7796 * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);
7797 */
7798void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
7799  movdl(tmp, xcrc);
7800  andl(tmp, 0xFF);
7801  movdl(xtmp, Address(table, tmp, Address::times_4, 0));
7802  psrldq(xcrc, 1); // unsigned shift one byte
7803  pxor(xcrc, xtmp);
7804}
7805
7806/**
7807 * uint32_t crc;
7808 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
7809 */
7810void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
7811  movl(tmp, crc);
7812  andl(tmp, 0xFF);
7813  shrl(crc, 8);
7814  xorl(crc, Address(table, tmp, Address::times_4, 0));
7815}
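
// Eight of these byte-wise folds reduce the 64 bits left over after the
// 128-bit reduction to the final 32-bit CRC; kernel_crc32 below performs four
// on the XMM value and four more once it has been moved to a general register.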
7816
7817/**
7818 * @param crc   register containing existing CRC (32-bit)
7819 * @param buf   register pointing to input byte buffer (byte*)
7820 * @param len   register containing number of bytes
7821 * @param table register that will contain address of CRC table
7822 * @param tmp   scratch register
7823 */
7824void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
7825  assert_different_registers(crc, buf, len, table, tmp, rax);
7826
7827  Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
7828  Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
7829
7830  lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
7831  notl(crc); // ~crc
7832  cmpl(len, 16);
7833  jcc(Assembler::less, L_tail);
7834
7835  // Align buffer to 16 bytes
7836  movl(tmp, buf);
7837  andl(tmp, 0xF);
7838  jccb(Assembler::zero, L_aligned);
7839  subl(tmp,  16);
7840  addl(len, tmp);
7841
7842  align(4);
7843  BIND(L_align_loop);
7844  movsbl(rax, Address(buf, 0)); // load byte with sign extension
7845  update_byte_crc32(crc, rax, table);
7846  increment(buf);
7847  incrementl(tmp);
7848  jccb(Assembler::less, L_align_loop);
7849
7850  BIND(L_aligned);
7851  movl(tmp, len); // save
7852  shrl(len, 4);
7853  jcc(Assembler::zero, L_tail_restore);
7854
7855  // Fold crc into first bytes of vector
7856  movdqa(xmm1, Address(buf, 0));
7857  movdl(rax, xmm1);
7858  xorl(crc, rax);
7859  pinsrd(xmm1, crc, 0);
7860  addptr(buf, 16);
7861  subl(len, 4); // len > 0
7862  jcc(Assembler::less, L_fold_tail);
7863
7864  movdqa(xmm2, Address(buf,  0));
7865  movdqa(xmm3, Address(buf, 16));
7866  movdqa(xmm4, Address(buf, 32));
7867  addptr(buf, 48);
7868  subl(len, 3);
7869  jcc(Assembler::lessEqual, L_fold_512b);
7870
7871  // Fold total 512 bits of polynomial on each iteration,
7872  // 128 bits per each of 4 parallel streams.
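  // Using four independent streams keeps the carry-less multiplies of
  // successive iterations from serializing on a single register, which helps
  // hide the pclmulqdq latency.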
7873  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32));
7874
7875  align(32);
7876  BIND(L_fold_512b_loop);
7877  fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
7878  fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
7879  fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
7880  fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
7881  addptr(buf, 64);
7882  subl(len, 4);
7883  jcc(Assembler::greater, L_fold_512b_loop);
7884
7885  // Fold 512 bits to 128 bits.
7886  BIND(L_fold_512b);
7887  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
7888  fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2);
7889  fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
7890  fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
7891
7892  // Fold the rest of 128 bits data chunks
7893  BIND(L_fold_tail);
7894  addl(len, 3);
7895  jccb(Assembler::lessEqual, L_fold_128b);
7896  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
7897
7898  BIND(L_fold_tail_loop);
7899  fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
7900  addptr(buf, 16);
7901  decrementl(len);
7902  jccb(Assembler::greater, L_fold_tail_loop);
7903
7904  // Fold 128 bits in xmm1 down into 32 bits in crc register.
7905  BIND(L_fold_128b);
7906  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
7907  if (UseAVX > 0) {
7908    vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
7909    vpand(xmm3, xmm0, xmm2, false /* vector256 */);
7910    vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
7911  } else {
7912    movdqa(xmm2, xmm0);
7913    pclmulqdq(xmm2, xmm1, 0x1);
7914    movdqa(xmm3, xmm0);
7915    pand(xmm3, xmm2);
7916    pclmulqdq(xmm0, xmm3, 0x1);
7917  }
7918  psrldq(xmm1, 8);
7919  psrldq(xmm2, 4);
7920  pxor(xmm0, xmm1);
7921  pxor(xmm0, xmm2);
7922
7923  // 8 8-bit folds to compute 32-bit CRC.
7924  for (int j = 0; j < 4; j++) {
7925    fold_8bit_crc32(xmm0, table, xmm1, rax);
7926  }
7927  movdl(crc, xmm0); // mov 32 bits to general register
7928  for (int j = 0; j < 4; j++) {
7929    fold_8bit_crc32(crc, table, rax);
7930  }
7931
7932  BIND(L_tail_restore);
7933  movl(len, tmp); // restore
7934  BIND(L_tail);
7935  andl(len, 0xf);
7936  jccb(Assembler::zero, L_exit);
7937
7938  // Fold the rest of bytes
7939  align(4);
7940  BIND(L_tail_loop);
7941  movsbl(rax, Address(buf, 0)); // load byte with sign extension
7942  update_byte_crc32(crc, rax, table);
7943  increment(buf);
7944  decrementl(len);
7945  jccb(Assembler::greater, L_tail_loop);
7946
7947  BIND(L_exit);
7948  notl(crc); // ~crc
7949}
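
// For reference, the align/tail byte loops above are equivalent to this plain
// C sketch (a hypothetical helper, not part of the stub):
//
//   static uint32_t crc32_bytes_c(uint32_t crc, const uint8_t* buf, size_t len,
//                                 const uint32_t* table) {
//     while (len--) {
//       crc = table[(*buf++ ^ crc) & 0xFF] ^ (crc >> 8);
//     }
//     return crc;
//   }
//
// Note that kernel_crc32 works on the bit-inverted CRC: the notl(crc) at entry
// and exit perform the usual pre- and post-inversion.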
7950
7951#undef BIND
7952#undef BLOCK_COMMENT
7953
7954
7955Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
7956  switch (cond) {
7957    // Note some conditions are synonyms for others
7958    case Assembler::zero:         return Assembler::notZero;
7959    case Assembler::notZero:      return Assembler::zero;
7960    case Assembler::less:         return Assembler::greaterEqual;
7961    case Assembler::lessEqual:    return Assembler::greater;
7962    case Assembler::greater:      return Assembler::lessEqual;
7963    case Assembler::greaterEqual: return Assembler::less;
7964    case Assembler::below:        return Assembler::aboveEqual;
7965    case Assembler::belowEqual:   return Assembler::above;
7966    case Assembler::above:        return Assembler::belowEqual;
7967    case Assembler::aboveEqual:   return Assembler::below;
7968    case Assembler::overflow:     return Assembler::noOverflow;
7969    case Assembler::noOverflow:   return Assembler::overflow;
7970    case Assembler::negative:     return Assembler::positive;
7971    case Assembler::positive:     return Assembler::negative;
7972    case Assembler::parity:       return Assembler::noParity;
7973    case Assembler::noParity:     return Assembler::parity;
7974  }
7975  ShouldNotReachHere(); return Assembler::overflow;
7976}
7977
7978SkipIfEqual::SkipIfEqual(
7979    MacroAssembler* masm, const bool* flag_addr, bool value) {
7980  _masm = masm;
7981  _masm->cmp8(ExternalAddress((address)flag_addr), value);
7982  _masm->jcc(Assembler::equal, _label);
7983}
7984
7985SkipIfEqual::~SkipIfEqual() {
7986  _masm->bind(_label);
7987}
7988