macroAssembler_x86.cpp revision 6412:53a41e7cbe05
1/*
2 * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25#include "precompiled.hpp"
26#include "asm/assembler.hpp"
27#include "asm/assembler.inline.hpp"
28#include "compiler/disassembler.hpp"
29#include "gc_interface/collectedHeap.inline.hpp"
30#include "interpreter/interpreter.hpp"
31#include "memory/cardTableModRefBS.hpp"
32#include "memory/resourceArea.hpp"
33#include "memory/universe.hpp"
34#include "prims/methodHandles.hpp"
35#include "runtime/biasedLocking.hpp"
36#include "runtime/interfaceSupport.hpp"
37#include "runtime/objectMonitor.hpp"
38#include "runtime/os.hpp"
39#include "runtime/sharedRuntime.hpp"
40#include "runtime/stubRoutines.hpp"
41#include "utilities/macros.hpp"
42#if INCLUDE_ALL_GCS
43#include "gc_implementation/g1/g1CollectedHeap.inline.hpp"
44#include "gc_implementation/g1/g1SATBCardTableModRefBS.hpp"
45#include "gc_implementation/g1/heapRegion.hpp"
46#endif // INCLUDE_ALL_GCS
47
// Helper macros for annotating generated code.
// With PRODUCT defined, BLOCK_COMMENT expands to nothing and STOP expands to
// a plain stop(); otherwise BLOCK_COMMENT forwards to block_comment() and
// STOP issues block_comment() followed by stop().
48#ifdef PRODUCT
49#define BLOCK_COMMENT(str) /* nothing */
50#define STOP(error) stop(error)
51#else
52#define BLOCK_COMMENT(str) block_comment(str)
// NOTE: this form expands to two statements, so it must not be used as the
// unbraced body of an if/else/loop.
53#define STOP(error) block_comment(error); stop(error)
54#endif
55
// Binds 'label' and then passes BLOCK_COMMENT the stringified label name
// followed by ':'. Also expands to more than one statement.
56#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
57
58PRAGMA_FORMAT_MUTE_WARNINGS_FOR_GCC
59
60#ifdef ASSERT
61bool AbstractAssembler::pd_check_instruction_mark() { return true; }
62#endif
63
64static Assembler::Condition reverse[] = {
65    Assembler::noOverflow     /* overflow      = 0x0 */ ,
66    Assembler::overflow       /* noOverflow    = 0x1 */ ,
67    Assembler::aboveEqual     /* carrySet      = 0x2, below         = 0x2 */ ,
68    Assembler::below          /* aboveEqual    = 0x3, carryClear    = 0x3 */ ,
69    Assembler::notZero        /* zero          = 0x4, equal         = 0x4 */ ,
70    Assembler::zero           /* notZero       = 0x5, notEqual      = 0x5 */ ,
71    Assembler::above          /* belowEqual    = 0x6 */ ,
72    Assembler::belowEqual     /* above         = 0x7 */ ,
73    Assembler::positive       /* negative      = 0x8 */ ,
74    Assembler::negative       /* positive      = 0x9 */ ,
75    Assembler::noParity       /* parity        = 0xa */ ,
76    Assembler::parity         /* noParity      = 0xb */ ,
77    Assembler::greaterEqual   /* less          = 0xc */ ,
78    Assembler::less           /* greaterEqual  = 0xd */ ,
79    Assembler::greater        /* lessEqual     = 0xe */ ,
80    Assembler::lessEqual      /* greater       = 0xf, */
81
82};
83
84
85// Implementation of MacroAssembler
86
87// First all the versions that have distinct versions depending on 32/64 bit
88// Unless the difference is trivial (1 line or so).
89
90#ifndef _LP64
91
92// 32bit versions
93
94Address MacroAssembler::as_Address(AddressLiteral adr) {
95  return Address(adr.target(), adr.rspec());
96}
97
98Address MacroAssembler::as_Address(ArrayAddress adr) {
99  return Address::make_array(adr);
100}
101
102void MacroAssembler::call_VM_leaf_base(address entry_point,
103                                       int number_of_arguments) {
104  call(RuntimeAddress(entry_point));
105  increment(rsp, number_of_arguments * wordSize);
106}
107
108void MacroAssembler::cmpklass(Address src1, Metadata* obj) {
109  cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
110}
111
112void MacroAssembler::cmpklass(Register src1, Metadata* obj) {
113  cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
114}
115
116void MacroAssembler::cmpoop(Address src1, jobject obj) {
117  cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
118}
119
120void MacroAssembler::cmpoop(Register src1, jobject obj) {
121  cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
122}
123
124void MacroAssembler::extend_sign(Register hi, Register lo) {
125  // According to Intel Doc. AP-526, "Integer Divide", p.18.
126  if (VM_Version::is_P6() && hi == rdx && lo == rax) {
127    cdql();
128  } else {
129    movl(hi, lo);
130    sarl(hi, 31);
131  }
132}
133
134void MacroAssembler::jC2(Register tmp, Label& L) {
135  // set parity bit if FPU flag C2 is set (via rax)
136  save_rax(tmp);
137  fwait(); fnstsw_ax();
138  sahf();
139  restore_rax(tmp);
140  // branch
141  jcc(Assembler::parity, L);
142}
143
144void MacroAssembler::jnC2(Register tmp, Label& L) {
145  // set parity bit if FPU flag C2 is set (via rax)
146  save_rax(tmp);
147  fwait(); fnstsw_ax();
148  sahf();
149  restore_rax(tmp);
150  // branch
151  jcc(Assembler::noParity, L);
152}
153
154// 32bit can do a case table jump in one instruction but we no longer allow the base
155// to be installed in the Address class
156void MacroAssembler::jump(ArrayAddress entry) {
157  jmp(as_Address(entry));
158}
159
160// Note: y_lo will be destroyed
161void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
162  // Long compare for Java (semantics as described in JVM spec.)
163  Label high, low, done;
164
165  cmpl(x_hi, y_hi);
166  jcc(Assembler::less, low);
167  jcc(Assembler::greater, high);
168  // x_hi is the return register
169  xorl(x_hi, x_hi);
170  cmpl(x_lo, y_lo);
171  jcc(Assembler::below, low);
172  jcc(Assembler::equal, done);
173
174  bind(high);
175  xorl(x_hi, x_hi);
176  increment(x_hi);
177  jmp(done);
178
179  bind(low);
180  xorl(x_hi, x_hi);
181  decrementl(x_hi);
182
183  bind(done);
184}
185
186void MacroAssembler::lea(Register dst, AddressLiteral src) {
187    mov_literal32(dst, (int32_t)src.target(), src.rspec());
188}
189
190void MacroAssembler::lea(Address dst, AddressLiteral adr) {
191  // leal(dst, as_Address(adr));
192  // see note in movl as to why we must use a move
193  mov_literal32(dst, (int32_t) adr.target(), adr.rspec());
194}
195
196void MacroAssembler::leave() {
197  mov(rsp, rbp);
198  pop(rbp);
199}
200
201void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
202  // Multiplication of two Java long values stored on the stack
203  // as illustrated below. Result is in rdx:rax.
204  //
205  // rsp ---> [  ??  ] \               \
206  //            ....    | y_rsp_offset  |
207  //          [ y_lo ] /  (in bytes)    | x_rsp_offset
208  //          [ y_hi ]                  | (in bytes)
209  //            ....                    |
210  //          [ x_lo ]                 /
211  //          [ x_hi ]
212  //            ....
213  //
214  // Basic idea: lo(result) = lo(x_lo * y_lo)
215  //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
216  Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
217  Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
218  Label quick;
219  // load x_hi, y_hi and check if quick
220  // multiplication is possible
221  movl(rbx, x_hi);
222  movl(rcx, y_hi);
223  movl(rax, rbx);
224  orl(rbx, rcx);                                 // rbx, = 0 <=> x_hi = 0 and y_hi = 0
225  jcc(Assembler::zero, quick);                   // if rbx, = 0 do quick multiply
226  // do full multiplication
227  // 1st step
228  mull(y_lo);                                    // x_hi * y_lo
229  movl(rbx, rax);                                // save lo(x_hi * y_lo) in rbx,
230  // 2nd step
231  movl(rax, x_lo);
232  mull(rcx);                                     // x_lo * y_hi
233  addl(rbx, rax);                                // add lo(x_lo * y_hi) to rbx,
234  // 3rd step
235  bind(quick);                                   // note: rbx, = 0 if quick multiply!
236  movl(rax, x_lo);
237  mull(y_lo);                                    // x_lo * y_lo
238  addl(rdx, rbx);                                // correct hi(x_lo * y_lo)
239}
240
241void MacroAssembler::lneg(Register hi, Register lo) {
242  negl(lo);
243  adcl(hi, 0);
244  negl(hi);
245}
246
247void MacroAssembler::lshl(Register hi, Register lo) {
248  // Java shift left long support (semantics as described in JVM spec., p.305)
249  // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
250  // shift value is in rcx !
251  assert(hi != rcx, "must not use rcx");
252  assert(lo != rcx, "must not use rcx");
253  const Register s = rcx;                        // shift count
254  const int      n = BitsPerWord;
255  Label L;
256  andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
257  cmpl(s, n);                                    // if (s < n)
258  jcc(Assembler::less, L);                       // else (s >= n)
259  movl(hi, lo);                                  // x := x << n
260  xorl(lo, lo);
261  // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
262  bind(L);                                       // s (mod n) < n
263  shldl(hi, lo);                                 // x := x << s
264  shll(lo);
265}
266
267
268void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
269  // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
270  // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
271  assert(hi != rcx, "must not use rcx");
272  assert(lo != rcx, "must not use rcx");
273  const Register s = rcx;                        // shift count
274  const int      n = BitsPerWord;
275  Label L;
276  andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
277  cmpl(s, n);                                    // if (s < n)
278  jcc(Assembler::less, L);                       // else (s >= n)
279  movl(lo, hi);                                  // x := x >> n
280  if (sign_extension) sarl(hi, 31);
281  else                xorl(hi, hi);
282  // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
283  bind(L);                                       // s (mod n) < n
284  shrdl(lo, hi);                                 // x := x >> s
285  if (sign_extension) sarl(hi);
286  else                shrl(hi);
287}
288
289void MacroAssembler::movoop(Register dst, jobject obj) {
290  mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
291}
292
293void MacroAssembler::movoop(Address dst, jobject obj) {
294  mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
295}
296
297void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
298  mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
299}
300
301void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
302  mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
303}
304
305void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
306  // scratch register is not used,
307  // it is defined to match parameters of 64-bit version of this method.
308  if (src.is_lval()) {
309    mov_literal32(dst, (intptr_t)src.target(), src.rspec());
310  } else {
311    movl(dst, as_Address(src));
312  }
313}
314
315void MacroAssembler::movptr(ArrayAddress dst, Register src) {
316  movl(as_Address(dst), src);
317}
318
319void MacroAssembler::movptr(Register dst, ArrayAddress src) {
320  movl(dst, as_Address(src));
321}
322
323// src should NEVER be a real pointer. Use AddressLiteral for true pointers
324void MacroAssembler::movptr(Address dst, intptr_t src) {
325  movl(dst, src);
326}
327
328
329void MacroAssembler::pop_callee_saved_registers() {
330  pop(rcx);
331  pop(rdx);
332  pop(rdi);
333  pop(rsi);
334}
335
336void MacroAssembler::pop_fTOS() {
337  fld_d(Address(rsp, 0));
338  addl(rsp, 2 * wordSize);
339}
340
341void MacroAssembler::push_callee_saved_registers() {
342  push(rsi);
343  push(rdi);
344  push(rdx);
345  push(rcx);
346}
347
348void MacroAssembler::push_fTOS() {
349  subl(rsp, 2 * wordSize);
350  fstp_d(Address(rsp, 0));
351}
352
353
354void MacroAssembler::pushoop(jobject obj) {
355  push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
356}
357
358void MacroAssembler::pushklass(Metadata* obj) {
359  push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate());
360}
361
362void MacroAssembler::pushptr(AddressLiteral src) {
363  if (src.is_lval()) {
364    push_literal32((int32_t)src.target(), src.rspec());
365  } else {
366    pushl(as_Address(src));
367  }
368}
369
370void MacroAssembler::set_word_if_not_zero(Register dst) {
371  xorl(dst, dst);
372  set_byte_if_not_zero(dst);
373}
374
375static void pass_arg0(MacroAssembler* masm, Register arg) {
376  masm->push(arg);
377}
378
379static void pass_arg1(MacroAssembler* masm, Register arg) {
380  masm->push(arg);
381}
382
383static void pass_arg2(MacroAssembler* masm, Register arg) {
384  masm->push(arg);
385}
386
387static void pass_arg3(MacroAssembler* masm, Register arg) {
388  masm->push(arg);
389}
390
391#ifndef PRODUCT
392extern "C" void findpc(intptr_t x);
393#endif
394
395void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
396  // In order to get locks to work, we need to fake a in_VM state
397  JavaThread* thread = JavaThread::current();
398  JavaThreadState saved_state = thread->thread_state();
399  thread->set_thread_state(_thread_in_vm);
400  if (ShowMessageBoxOnError) {
401    JavaThread* thread = JavaThread::current();
402    JavaThreadState saved_state = thread->thread_state();
403    thread->set_thread_state(_thread_in_vm);
404    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
405      ttyLocker ttyl;
406      BytecodeCounter::print();
407    }
408    // To see where a verify_oop failed, get $ebx+40/X for this frame.
409    // This is the value of eip which points to where verify_oop will return.
410    if (os::message_box(msg, "Execution stopped, print registers?")) {
411      print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip);
412      BREAKPOINT;
413    }
414  } else {
415    ttyLocker ttyl;
416    ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", msg);
417  }
418  // Don't assert holding the ttyLock
419    assert(false, err_msg("DEBUG MESSAGE: %s", msg));
420  ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
421}
422
423void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) {
424  ttyLocker ttyl;
425  FlagSetting fs(Debugging, true);
426  tty->print_cr("eip = 0x%08x", eip);
427#ifndef PRODUCT
428  if ((WizardMode || Verbose) && PrintMiscellaneous) {
429    tty->cr();
430    findpc(eip);
431    tty->cr();
432  }
433#endif
434#define PRINT_REG(rax) \
435  { tty->print("%s = ", #rax); os::print_location(tty, rax); }
436  PRINT_REG(rax);
437  PRINT_REG(rbx);
438  PRINT_REG(rcx);
439  PRINT_REG(rdx);
440  PRINT_REG(rdi);
441  PRINT_REG(rsi);
442  PRINT_REG(rbp);
443  PRINT_REG(rsp);
444#undef PRINT_REG
445  // Print some words near top of staack.
446  int* dump_sp = (int*) rsp;
447  for (int col1 = 0; col1 < 8; col1++) {
448    tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
449    os::print_location(tty, *dump_sp++);
450  }
451  for (int row = 0; row < 16; row++) {
452    tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
453    for (int col = 0; col < 8; col++) {
454      tty->print(" 0x%08x", *dump_sp++);
455    }
456    tty->cr();
457  }
458  // Print some instructions around pc:
459  Disassembler::decode((address)eip-64, (address)eip);
460  tty->print_cr("--------");
461  Disassembler::decode((address)eip, (address)eip+32);
462}
463
464void MacroAssembler::stop(const char* msg) {
465  ExternalAddress message((address)msg);
466  // push address of message
467  pushptr(message.addr());
468  { Label L; call(L, relocInfo::none); bind(L); }     // push eip
469  pusha();                                            // push registers
470  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
471  hlt();
472}
473
474void MacroAssembler::warn(const char* msg) {
475  push_CPU_state();
476
477  ExternalAddress message((address) msg);
478  // push address of message
479  pushptr(message.addr());
480
481  call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
482  addl(rsp, wordSize);       // discard argument
483  pop_CPU_state();
484}
485
486void MacroAssembler::print_state() {
487  { Label L; call(L, relocInfo::none); bind(L); }     // push eip
488  pusha();                                            // push registers
489
490  push_CPU_state();
491  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32)));
492  pop_CPU_state();
493
494  popa();
495  addl(rsp, wordSize);
496}
497
498#else // _LP64
499
500// 64 bit versions
501
502Address MacroAssembler::as_Address(AddressLiteral adr) {
503  // amd64 always does this as a pc-rel
504  // we can be absolute or disp based on the instruction type
505  // jmp/call are displacements others are absolute
506  assert(!adr.is_lval(), "must be rval");
507  assert(reachable(adr), "must be");
508  return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc());
509
510}
511
512Address MacroAssembler::as_Address(ArrayAddress adr) {
513  AddressLiteral base = adr.base();
514  lea(rscratch1, base);
515  Address index = adr.index();
516  assert(index._disp == 0, "must not have disp"); // maybe it can?
517  Address array(rscratch1, index._index, index._scale, index._disp);
518  return array;
519}
520
521void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
522  Label L, E;
523
524#ifdef _WIN64
525  // Windows always allocates space for it's register args
526  assert(num_args <= 4, "only register arguments supported");
527  subq(rsp,  frame::arg_reg_save_area_bytes);
528#endif
529
530  // Align stack if necessary
531  testl(rsp, 15);
532  jcc(Assembler::zero, L);
533
534  subq(rsp, 8);
535  {
536    call(RuntimeAddress(entry_point));
537  }
538  addq(rsp, 8);
539  jmp(E);
540
541  bind(L);
542  {
543    call(RuntimeAddress(entry_point));
544  }
545
546  bind(E);
547
548#ifdef _WIN64
549  // restore stack pointer
550  addq(rsp, frame::arg_reg_save_area_bytes);
551#endif
552
553}
554
555void MacroAssembler::cmp64(Register src1, AddressLiteral src2) {
556  assert(!src2.is_lval(), "should use cmpptr");
557
558  if (reachable(src2)) {
559    cmpq(src1, as_Address(src2));
560  } else {
561    lea(rscratch1, src2);
562    Assembler::cmpq(src1, Address(rscratch1, 0));
563  }
564}
565
566int MacroAssembler::corrected_idivq(Register reg) {
567  // Full implementation of Java ldiv and lrem; checks for special
568  // case as described in JVM spec., p.243 & p.271.  The function
569  // returns the (pc) offset of the idivl instruction - may be needed
570  // for implicit exceptions.
571  //
572  //         normal case                           special case
573  //
574  // input : rax: dividend                         min_long
575  //         reg: divisor   (may not be eax/edx)   -1
576  //
577  // output: rax: quotient  (= rax idiv reg)       min_long
578  //         rdx: remainder (= rax irem reg)       0
579  assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
580  static const int64_t min_long = 0x8000000000000000;
581  Label normal_case, special_case;
582
583  // check for special case
584  cmp64(rax, ExternalAddress((address) &min_long));
585  jcc(Assembler::notEqual, normal_case);
586  xorl(rdx, rdx); // prepare rdx for possible special case (where
587                  // remainder = 0)
588  cmpq(reg, -1);
589  jcc(Assembler::equal, special_case);
590
591  // handle normal case
592  bind(normal_case);
593  cdqq();
594  int idivq_offset = offset();
595  idivq(reg);
596
597  // normal and special case exit
598  bind(special_case);
599
600  return idivq_offset;
601}
602
603void MacroAssembler::decrementq(Register reg, int value) {
604  if (value == min_jint) { subq(reg, value); return; }
605  if (value <  0) { incrementq(reg, -value); return; }
606  if (value == 0) {                        ; return; }
607  if (value == 1 && UseIncDec) { decq(reg) ; return; }
608  /* else */      { subq(reg, value)       ; return; }
609}
610
611void MacroAssembler::decrementq(Address dst, int value) {
612  if (value == min_jint) { subq(dst, value); return; }
613  if (value <  0) { incrementq(dst, -value); return; }
614  if (value == 0) {                        ; return; }
615  if (value == 1 && UseIncDec) { decq(dst) ; return; }
616  /* else */      { subq(dst, value)       ; return; }
617}
618
619void MacroAssembler::incrementq(AddressLiteral dst) {
620  if (reachable(dst)) {
621    incrementq(as_Address(dst));
622  } else {
623    lea(rscratch1, dst);
624    incrementq(Address(rscratch1, 0));
625  }
626}
627
628void MacroAssembler::incrementq(Register reg, int value) {
629  if (value == min_jint) { addq(reg, value); return; }
630  if (value <  0) { decrementq(reg, -value); return; }
631  if (value == 0) {                        ; return; }
632  if (value == 1 && UseIncDec) { incq(reg) ; return; }
633  /* else */      { addq(reg, value)       ; return; }
634}
635
636void MacroAssembler::incrementq(Address dst, int value) {
637  if (value == min_jint) { addq(dst, value); return; }
638  if (value <  0) { decrementq(dst, -value); return; }
639  if (value == 0) {                        ; return; }
640  if (value == 1 && UseIncDec) { incq(dst) ; return; }
641  /* else */      { addq(dst, value)       ; return; }
642}
643
644// 32bit can do a case table jump in one instruction but we no longer allow the base
645// to be installed in the Address class
646void MacroAssembler::jump(ArrayAddress entry) {
647  lea(rscratch1, entry.base());
648  Address dispatch = entry.index();
649  assert(dispatch._base == noreg, "must be");
650  dispatch._base = rscratch1;
651  jmp(dispatch);
652}
653
654void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
655  ShouldNotReachHere(); // 64bit doesn't use two regs
656  cmpq(x_lo, y_lo);
657}
658
659void MacroAssembler::lea(Register dst, AddressLiteral src) {
660    mov_literal64(dst, (intptr_t)src.target(), src.rspec());
661}
662
663void MacroAssembler::lea(Address dst, AddressLiteral adr) {
664  mov_literal64(rscratch1, (intptr_t)adr.target(), adr.rspec());
665  movptr(dst, rscratch1);
666}
667
668void MacroAssembler::leave() {
669  // %%% is this really better? Why not on 32bit too?
670  emit_int8((unsigned char)0xC9); // LEAVE
671}
672
673void MacroAssembler::lneg(Register hi, Register lo) {
674  ShouldNotReachHere(); // 64bit doesn't use two regs
675  negq(lo);
676}
677
678void MacroAssembler::movoop(Register dst, jobject obj) {
679  mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
680}
681
682void MacroAssembler::movoop(Address dst, jobject obj) {
683  mov_literal64(rscratch1, (intptr_t)obj, oop_Relocation::spec_for_immediate());
684  movq(dst, rscratch1);
685}
686
687void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
688  mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
689}
690
691void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
692  mov_literal64(rscratch1, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
693  movq(dst, rscratch1);
694}
695
696void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
697  if (src.is_lval()) {
698    mov_literal64(dst, (intptr_t)src.target(), src.rspec());
699  } else {
700    if (reachable(src)) {
701      movq(dst, as_Address(src));
702    } else {
703      lea(scratch, src);
704      movq(dst, Address(scratch, 0));
705    }
706  }
707}
708
709void MacroAssembler::movptr(ArrayAddress dst, Register src) {
710  movq(as_Address(dst), src);
711}
712
713void MacroAssembler::movptr(Register dst, ArrayAddress src) {
714  movq(dst, as_Address(src));
715}
716
717// src should NEVER be a real pointer. Use AddressLiteral for true pointers
718void MacroAssembler::movptr(Address dst, intptr_t src) {
719  mov64(rscratch1, src);
720  movq(dst, rscratch1);
721}
722
723// These are mostly for initializing NULL
724void MacroAssembler::movptr(Address dst, int32_t src) {
725  movslq(dst, src);
726}
727
728void MacroAssembler::movptr(Register dst, int32_t src) {
729  mov64(dst, (intptr_t)src);
730}
731
732void MacroAssembler::pushoop(jobject obj) {
733  movoop(rscratch1, obj);
734  push(rscratch1);
735}
736
737void MacroAssembler::pushklass(Metadata* obj) {
738  mov_metadata(rscratch1, obj);
739  push(rscratch1);
740}
741
742void MacroAssembler::pushptr(AddressLiteral src) {
743  lea(rscratch1, src);
744  if (src.is_lval()) {
745    push(rscratch1);
746  } else {
747    pushq(Address(rscratch1, 0));
748  }
749}
750
751void MacroAssembler::reset_last_Java_frame(bool clear_fp,
752                                           bool clear_pc) {
753  // we must set sp to zero to clear frame
754  movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
755  // must clear fp, so that compiled frames are not confused; it is
756  // possible that we need it only for debugging
757  if (clear_fp) {
758    movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
759  }
760
761  if (clear_pc) {
762    movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
763  }
764}
765
766void MacroAssembler::set_last_Java_frame(Register last_java_sp,
767                                         Register last_java_fp,
768                                         address  last_java_pc) {
769  // determine last_java_sp register
770  if (!last_java_sp->is_valid()) {
771    last_java_sp = rsp;
772  }
773
774  // last_java_fp is optional
775  if (last_java_fp->is_valid()) {
776    movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()),
777           last_java_fp);
778  }
779
780  // last_java_pc is optional
781  if (last_java_pc != NULL) {
782    Address java_pc(r15_thread,
783                    JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
784    lea(rscratch1, InternalAddress(last_java_pc));
785    movptr(java_pc, rscratch1);
786  }
787
788  movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
789}
790
791static void pass_arg0(MacroAssembler* masm, Register arg) {
792  if (c_rarg0 != arg ) {
793    masm->mov(c_rarg0, arg);
794  }
795}
796
797static void pass_arg1(MacroAssembler* masm, Register arg) {
798  if (c_rarg1 != arg ) {
799    masm->mov(c_rarg1, arg);
800  }
801}
802
803static void pass_arg2(MacroAssembler* masm, Register arg) {
804  if (c_rarg2 != arg ) {
805    masm->mov(c_rarg2, arg);
806  }
807}
808
809static void pass_arg3(MacroAssembler* masm, Register arg) {
810  if (c_rarg3 != arg ) {
811    masm->mov(c_rarg3, arg);
812  }
813}
814
815void MacroAssembler::stop(const char* msg) {
816  address rip = pc();
817  pusha(); // get regs on stack
818  lea(c_rarg0, ExternalAddress((address) msg));
819  lea(c_rarg1, InternalAddress(rip));
820  movq(c_rarg2, rsp); // pass pointer to regs array
821  andq(rsp, -16); // align stack as required by ABI
822  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
823  hlt();
824}
825
826void MacroAssembler::warn(const char* msg) {
827  push(rbp);
828  movq(rbp, rsp);
829  andq(rsp, -16);     // align stack as required by push_CPU_state and call
830  push_CPU_state();   // keeps alignment at 16 bytes
831  lea(c_rarg0, ExternalAddress((address) msg));
832  call_VM_leaf(CAST_FROM_FN_PTR(address, warning), c_rarg0);
833  pop_CPU_state();
834  mov(rsp, rbp);
835  pop(rbp);
836}
837
838void MacroAssembler::print_state() {
839  address rip = pc();
840  pusha();            // get regs on stack
841  push(rbp);
842  movq(rbp, rsp);
843  andq(rsp, -16);     // align stack as required by push_CPU_state and call
844  push_CPU_state();   // keeps alignment at 16 bytes
845
846  lea(c_rarg0, InternalAddress(rip));
847  lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
848  call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);
849
850  pop_CPU_state();
851  mov(rsp, rbp);
852  pop(rbp);
853  popa();
854}
855
856#ifndef PRODUCT
857extern "C" void findpc(intptr_t x);
858#endif
859
860void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
861  // In order to get locks to work, we need to fake a in_VM state
862  if (ShowMessageBoxOnError) {
863    JavaThread* thread = JavaThread::current();
864    JavaThreadState saved_state = thread->thread_state();
865    thread->set_thread_state(_thread_in_vm);
866#ifndef PRODUCT
867    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
868      ttyLocker ttyl;
869      BytecodeCounter::print();
870    }
871#endif
872    // To see where a verify_oop failed, get $ebx+40/X for this frame.
873    // XXX correct this offset for amd64
874    // This is the value of eip which points to where verify_oop will return.
875    if (os::message_box(msg, "Execution stopped, print registers?")) {
876      print_state64(pc, regs);
877      BREAKPOINT;
878      assert(false, "start up GDB");
879    }
880    ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
881  } else {
882    ttyLocker ttyl;
883    ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
884                    msg);
885    assert(false, err_msg("DEBUG MESSAGE: %s", msg));
886  }
887}
888
889void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
890  ttyLocker ttyl;
891  FlagSetting fs(Debugging, true);
892  tty->print_cr("rip = 0x%016lx", pc);
893#ifndef PRODUCT
894  tty->cr();
895  findpc(pc);
896  tty->cr();
897#endif
898#define PRINT_REG(rax, value) \
899  { tty->print("%s = ", #rax); os::print_location(tty, value); }
900  PRINT_REG(rax, regs[15]);
901  PRINT_REG(rbx, regs[12]);
902  PRINT_REG(rcx, regs[14]);
903  PRINT_REG(rdx, regs[13]);
904  PRINT_REG(rdi, regs[8]);
905  PRINT_REG(rsi, regs[9]);
906  PRINT_REG(rbp, regs[10]);
907  PRINT_REG(rsp, regs[11]);
908  PRINT_REG(r8 , regs[7]);
909  PRINT_REG(r9 , regs[6]);
910  PRINT_REG(r10, regs[5]);
911  PRINT_REG(r11, regs[4]);
912  PRINT_REG(r12, regs[3]);
913  PRINT_REG(r13, regs[2]);
914  PRINT_REG(r14, regs[1]);
915  PRINT_REG(r15, regs[0]);
916#undef PRINT_REG
917  // Print some words near top of staack.
918  int64_t* rsp = (int64_t*) regs[11];
919  int64_t* dump_sp = rsp;
920  for (int col1 = 0; col1 < 8; col1++) {
921    tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (int64_t)dump_sp);
922    os::print_location(tty, *dump_sp++);
923  }
924  for (int row = 0; row < 25; row++) {
925    tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (int64_t)dump_sp);
926    for (int col = 0; col < 4; col++) {
927      tty->print(" 0x%016lx", *dump_sp++);
928    }
929    tty->cr();
930  }
931  // Print some instructions around pc:
932  Disassembler::decode((address)pc-64, (address)pc);
933  tty->print_cr("--------");
934  Disassembler::decode((address)pc, (address)pc+32);
935}
936
937#endif // _LP64
938
939// Now versions that are common to 32/64 bit
940
941void MacroAssembler::addptr(Register dst, int32_t imm32) {
942  LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
943}
944
945void MacroAssembler::addptr(Register dst, Register src) {
946  LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
947}
948
949void MacroAssembler::addptr(Address dst, Register src) {
950  LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
951}
952
953void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src) {
954  if (reachable(src)) {
955    Assembler::addsd(dst, as_Address(src));
956  } else {
957    lea(rscratch1, src);
958    Assembler::addsd(dst, Address(rscratch1, 0));
959  }
960}
961
962void MacroAssembler::addss(XMMRegister dst, AddressLiteral src) {
963  if (reachable(src)) {
964    addss(dst, as_Address(src));
965  } else {
966    lea(rscratch1, src);
967    addss(dst, Address(rscratch1, 0));
968  }
969}
970
971void MacroAssembler::align(int modulus) {
972  if (offset() % modulus != 0) {
973    nop(modulus - (offset() % modulus));
974  }
975}
976
977void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src) {
978  // Used in sign-masking with aligned address.
979  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
980  if (reachable(src)) {
981    Assembler::andpd(dst, as_Address(src));
982  } else {
983    lea(rscratch1, src);
984    Assembler::andpd(dst, Address(rscratch1, 0));
985  }
986}
987
988void MacroAssembler::andps(XMMRegister dst, AddressLiteral src) {
989  // Used in sign-masking with aligned address.
990  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
991  if (reachable(src)) {
992    Assembler::andps(dst, as_Address(src));
993  } else {
994    lea(rscratch1, src);
995    Assembler::andps(dst, Address(rscratch1, 0));
996  }
997}
998
999void MacroAssembler::andptr(Register dst, int32_t imm32) {
1000  LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
1001}
1002
1003void MacroAssembler::atomic_incl(Address counter_addr) {
1004  if (os::is_MP())
1005    lock();
1006  incrementl(counter_addr);
1007}
1008
1009void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register scr) {
1010  if (reachable(counter_addr)) {
1011    atomic_incl(as_Address(counter_addr));
1012  } else {
1013    lea(scr, counter_addr);
1014    atomic_incl(Address(scr, 0));
1015  }
1016}
1017
1018#ifdef _LP64
1019void MacroAssembler::atomic_incq(Address counter_addr) {
1020  if (os::is_MP())
1021    lock();
1022  incrementq(counter_addr);
1023}
1024
1025void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register scr) {
1026  if (reachable(counter_addr)) {
1027    atomic_incq(as_Address(counter_addr));
1028  } else {
1029    lea(scr, counter_addr);
1030    atomic_incq(Address(scr, 0));
1031  }
1032}
1033#endif
1034
1035// Writes to stack successive pages until offset reached to check for
1036// stack overflow + shadow pages.  This clobbers tmp.
1037void MacroAssembler::bang_stack_size(Register size, Register tmp) {
1038  movptr(tmp, rsp);
1039  // Bang stack for total size given plus shadow page size.
1040  // Bang one page at a time because large size can bang beyond yellow and
1041  // red zones.
1042  Label loop;
1043  bind(loop);
1044  movl(Address(tmp, (-os::vm_page_size())), size );
1045  subptr(tmp, os::vm_page_size());
1046  subl(size, os::vm_page_size());
1047  jcc(Assembler::greater, loop);
1048
1049  // Bang down shadow pages too.
1050  // At this point, (tmp-0) is the last address touched, so don't
1051  // touch it again.  (It was touched as (tmp-pagesize) but then tmp
1052  // was post-decremented.)  Skip this address by starting at i=1, and
1053  // touch a few more pages below.  N.B.  It is important to touch all
1054  // the way down to and including i=StackShadowPages.
1055  for (int i = 1; i < StackShadowPages; i++) {
1056    // this could be any sized move but this is can be a debugging crumb
1057    // so the bigger the better.
1058    movptr(Address(tmp, (-i*os::vm_page_size())), size );
1059  }
1060}
1061
1062int MacroAssembler::biased_locking_enter(Register lock_reg,
1063                                         Register obj_reg,
1064                                         Register swap_reg,
1065                                         Register tmp_reg,
1066                                         bool swap_reg_contains_mark,
1067                                         Label& done,
1068                                         Label* slow_case,
1069                                         BiasedLockingCounters* counters) {
1070  assert(UseBiasedLocking, "why call this otherwise?");
1071  assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
1072  LP64_ONLY( assert(tmp_reg != noreg, "tmp_reg must be supplied"); )
1073  bool need_tmp_reg = false;
1074  if (tmp_reg == noreg) {
1075    need_tmp_reg = true;
1076    tmp_reg = lock_reg;
1077    assert_different_registers(lock_reg, obj_reg, swap_reg);
1078  } else {
1079    assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
1080  }
1081  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
1082  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
1083  Address saved_mark_addr(lock_reg, 0);
1084
1085  if (PrintBiasedLockingStatistics && counters == NULL) {
1086    counters = BiasedLocking::counters();
1087  }
1088  // Biased locking
1089  // See whether the lock is currently biased toward our thread and
1090  // whether the epoch is still valid
1091  // Note that the runtime guarantees sufficient alignment of JavaThread
1092  // pointers to allow age to be placed into low bits
1093  // First check to see whether biasing is even enabled for this object
1094  Label cas_label;
1095  int null_check_offset = -1;
1096  if (!swap_reg_contains_mark) {
1097    null_check_offset = offset();
1098    movptr(swap_reg, mark_addr);
1099  }
1100  if (need_tmp_reg) {
1101    push(tmp_reg);
1102  }
1103  movptr(tmp_reg, swap_reg);
1104  andptr(tmp_reg, markOopDesc::biased_lock_mask_in_place);
1105  cmpptr(tmp_reg, markOopDesc::biased_lock_pattern);
1106  if (need_tmp_reg) {
1107    pop(tmp_reg);
1108  }
1109  jcc(Assembler::notEqual, cas_label);
1110  // The bias pattern is present in the object's header. Need to check
1111  // whether the bias owner and the epoch are both still current.
1112#ifndef _LP64
1113  // Note that because there is no current thread register on x86_32 we
1114  // need to store off the mark word we read out of the object to
1115  // avoid reloading it and needing to recheck invariants below. This
1116  // store is unfortunate but it makes the overall code shorter and
1117  // simpler.
1118  movptr(saved_mark_addr, swap_reg);
1119#endif
1120  if (need_tmp_reg) {
1121    push(tmp_reg);
1122  }
1123  if (swap_reg_contains_mark) {
1124    null_check_offset = offset();
1125  }
1126  load_prototype_header(tmp_reg, obj_reg);
1127#ifdef _LP64
1128  orptr(tmp_reg, r15_thread);
1129  xorptr(tmp_reg, swap_reg);
1130  Register header_reg = tmp_reg;
1131#else
1132  xorptr(tmp_reg, swap_reg);
1133  get_thread(swap_reg);
1134  xorptr(swap_reg, tmp_reg);
1135  Register header_reg = swap_reg;
1136#endif
1137  andptr(header_reg, ~((int) markOopDesc::age_mask_in_place));
1138  if (need_tmp_reg) {
1139    pop(tmp_reg);
1140  }
1141  if (counters != NULL) {
1142    cond_inc32(Assembler::zero,
1143               ExternalAddress((address) counters->biased_lock_entry_count_addr()));
1144  }
1145  jcc(Assembler::equal, done);
1146
1147  Label try_revoke_bias;
1148  Label try_rebias;
1149
1150  // At this point we know that the header has the bias pattern and
1151  // that we are not the bias owner in the current epoch. We need to
1152  // figure out more details about the state of the header in order to
1153  // know what operations can be legally performed on the object's
1154  // header.
1155
1156  // If the low three bits in the xor result aren't clear, that means
1157  // the prototype header is no longer biased and we have to revoke
1158  // the bias on this object.
1159  testptr(header_reg, markOopDesc::biased_lock_mask_in_place);
1160  jccb(Assembler::notZero, try_revoke_bias);
1161
1162  // Biasing is still enabled for this data type. See whether the
1163  // epoch of the current bias is still valid, meaning that the epoch
1164  // bits of the mark word are equal to the epoch bits of the
1165  // prototype header. (Note that the prototype header's epoch bits
1166  // only change at a safepoint.) If not, attempt to rebias the object
1167  // toward the current thread. Note that we must be absolutely sure
1168  // that the current epoch is invalid in order to do this because
1169  // otherwise the manipulations it performs on the mark word are
1170  // illegal.
1171  testptr(header_reg, markOopDesc::epoch_mask_in_place);
1172  jccb(Assembler::notZero, try_rebias);
1173
1174  // The epoch of the current bias is still valid but we know nothing
1175  // about the owner; it might be set or it might be clear. Try to
1176  // acquire the bias of the object using an atomic operation. If this
1177  // fails we will go in to the runtime to revoke the object's bias.
1178  // Note that we first construct the presumed unbiased header so we
1179  // don't accidentally blow away another thread's valid bias.
1180  NOT_LP64( movptr(swap_reg, saved_mark_addr); )
1181  andptr(swap_reg,
1182         markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
1183  if (need_tmp_reg) {
1184    push(tmp_reg);
1185  }
1186#ifdef _LP64
1187  movptr(tmp_reg, swap_reg);
1188  orptr(tmp_reg, r15_thread);
1189#else
1190  get_thread(tmp_reg);
1191  orptr(tmp_reg, swap_reg);
1192#endif
1193  if (os::is_MP()) {
1194    lock();
1195  }
1196  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
1197  if (need_tmp_reg) {
1198    pop(tmp_reg);
1199  }
1200  // If the biasing toward our thread failed, this means that
1201  // another thread succeeded in biasing it toward itself and we
1202  // need to revoke that bias. The revocation will occur in the
1203  // interpreter runtime in the slow case.
1204  if (counters != NULL) {
1205    cond_inc32(Assembler::zero,
1206               ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
1207  }
1208  if (slow_case != NULL) {
1209    jcc(Assembler::notZero, *slow_case);
1210  }
1211  jmp(done);
1212
1213  bind(try_rebias);
1214  // At this point we know the epoch has expired, meaning that the
1215  // current "bias owner", if any, is actually invalid. Under these
1216  // circumstances _only_, we are allowed to use the current header's
1217  // value as the comparison value when doing the cas to acquire the
1218  // bias in the current epoch. In other words, we allow transfer of
1219  // the bias from one thread to another directly in this situation.
1220  //
1221  // FIXME: due to a lack of registers we currently blow away the age
1222  // bits in this situation. Should attempt to preserve them.
1223  if (need_tmp_reg) {
1224    push(tmp_reg);
1225  }
1226  load_prototype_header(tmp_reg, obj_reg);
1227#ifdef _LP64
1228  orptr(tmp_reg, r15_thread);
1229#else
1230  get_thread(swap_reg);
1231  orptr(tmp_reg, swap_reg);
1232  movptr(swap_reg, saved_mark_addr);
1233#endif
1234  if (os::is_MP()) {
1235    lock();
1236  }
1237  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
1238  if (need_tmp_reg) {
1239    pop(tmp_reg);
1240  }
1241  // If the biasing toward our thread failed, then another thread
1242  // succeeded in biasing it toward itself and we need to revoke that
1243  // bias. The revocation will occur in the runtime in the slow case.
1244  if (counters != NULL) {
1245    cond_inc32(Assembler::zero,
1246               ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
1247  }
1248  if (slow_case != NULL) {
1249    jcc(Assembler::notZero, *slow_case);
1250  }
1251  jmp(done);
1252
1253  bind(try_revoke_bias);
1254  // The prototype mark in the klass doesn't have the bias bit set any
1255  // more, indicating that objects of this data type are not supposed
1256  // to be biased any more. We are going to try to reset the mark of
1257  // this object to the prototype value and fall through to the
1258  // CAS-based locking scheme. Note that if our CAS fails, it means
1259  // that another thread raced us for the privilege of revoking the
1260  // bias of this particular object, so it's okay to continue in the
1261  // normal locking code.
1262  //
1263  // FIXME: due to a lack of registers we currently blow away the age
1264  // bits in this situation. Should attempt to preserve them.
1265  NOT_LP64( movptr(swap_reg, saved_mark_addr); )
1266  if (need_tmp_reg) {
1267    push(tmp_reg);
1268  }
1269  load_prototype_header(tmp_reg, obj_reg);
1270  if (os::is_MP()) {
1271    lock();
1272  }
1273  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
1274  if (need_tmp_reg) {
1275    pop(tmp_reg);
1276  }
1277  // Fall through to the normal CAS-based lock, because no matter what
1278  // the result of the above CAS, some thread must have succeeded in
1279  // removing the bias bit from the object's header.
1280  if (counters != NULL) {
1281    cond_inc32(Assembler::zero,
1282               ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
1283  }
1284
1285  bind(cas_label);
1286
1287  return null_check_offset;
1288}
1289
1290void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
1291  assert(UseBiasedLocking, "why call this otherwise?");
1292
1293  // Check for biased locking unlock case, which is a no-op
1294  // Note: we do not have to check the thread ID for two reasons.
1295  // First, the interpreter checks for IllegalMonitorStateException at
1296  // a higher level. Second, if the bias was revoked while we held the
1297  // lock, the object could not be rebiased toward another thread, so
1298  // the bias bit would be clear.
1299  movptr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
1300  andptr(temp_reg, markOopDesc::biased_lock_mask_in_place);
1301  cmpptr(temp_reg, markOopDesc::biased_lock_pattern);
1302  jcc(Assembler::equal, done);
1303}
1304
1305#ifdef COMPILER2
1306
1307#if INCLUDE_RTM_OPT
1308
1309// Update rtm_counters based on abort status
1310// input: abort_status
1311//        rtm_counters (RTMLockingCounters*)
1312// flags are killed
1313void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
1314
1315  atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
1316  if (PrintPreciseRTMLockingStatistics) {
1317    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
1318      Label check_abort;
1319      testl(abort_status, (1<<i));
1320      jccb(Assembler::equal, check_abort);
1321      atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
1322      bind(check_abort);
1323    }
1324  }
1325}
1326
1327// Branch if (random & (count-1) != 0), count is 2^n
1328// tmp, scr and flags are killed
1329void MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
1330  assert(tmp == rax, "");
1331  assert(scr == rdx, "");
1332  rdtsc(); // modifies EDX:EAX
1333  andptr(tmp, count-1);
1334  jccb(Assembler::notZero, brLabel);
1335}
1336
1337// Perform abort ratio calculation, set no_rtm bit if high ratio
1338// input:  rtm_counters_Reg (RTMLockingCounters* address)
1339// tmpReg, rtm_counters_Reg and flags are killed
1340void MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
1341                                                 Register rtm_counters_Reg,
1342                                                 RTMLockingCounters* rtm_counters,
1343                                                 Metadata* method_data) {
1344  Label L_done, L_check_always_rtm1, L_check_always_rtm2;
1345
1346  if (RTMLockingCalculationDelay > 0) {
1347    // Delay calculation
1348    movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
1349    testptr(tmpReg, tmpReg);
1350    jccb(Assembler::equal, L_done);
1351  }
1352  // Abort ratio calculation only if abort_count > RTMAbortThreshold
1353  //   Aborted transactions = abort_count * 100
1354  //   All transactions = total_count *  RTMTotalCountIncrRate
1355  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
1356
1357  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
1358  cmpptr(tmpReg, RTMAbortThreshold);
1359  jccb(Assembler::below, L_check_always_rtm2);
1360  imulptr(tmpReg, tmpReg, 100);
1361
1362  Register scrReg = rtm_counters_Reg;
1363  movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
1364  imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
1365  imulptr(scrReg, scrReg, RTMAbortRatio);
1366  cmpptr(tmpReg, scrReg);
1367  jccb(Assembler::below, L_check_always_rtm1);
1368  if (method_data != NULL) {
1369    // set rtm_state to "no rtm" in MDO
1370    mov_metadata(tmpReg, method_data);
1371    if (os::is_MP()) {
1372      lock();
1373    }
1374    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
1375  }
1376  jmpb(L_done);
1377  bind(L_check_always_rtm1);
1378  // Reload RTMLockingCounters* address
1379  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
1380  bind(L_check_always_rtm2);
1381  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
1382  cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
1383  jccb(Assembler::below, L_done);
1384  if (method_data != NULL) {
1385    // set rtm_state to "always rtm" in MDO
1386    mov_metadata(tmpReg, method_data);
1387    if (os::is_MP()) {
1388      lock();
1389    }
1390    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
1391  }
1392  bind(L_done);
1393}
1394
1395// Update counters and perform abort ratio calculation
1396// input:  abort_status_Reg
1397// rtm_counters_Reg, flags are killed
1398void MacroAssembler::rtm_profiling(Register abort_status_Reg,
1399                                   Register rtm_counters_Reg,
1400                                   RTMLockingCounters* rtm_counters,
1401                                   Metadata* method_data,
1402                                   bool profile_rtm) {
1403
1404  assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
1405  // update rtm counters based on rax value at abort
1406  // reads abort_status_Reg, updates flags
1407  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
1408  rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
1409  if (profile_rtm) {
1410    // Save abort status because abort_status_Reg is used by following code.
1411    if (RTMRetryCount > 0) {
1412      push(abort_status_Reg);
1413    }
1414    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
1415    rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
1416    // restore abort status
1417    if (RTMRetryCount > 0) {
1418      pop(abort_status_Reg);
1419    }
1420  }
1421}
1422
1423// Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
1424// inputs: retry_count_Reg
1425//       : abort_status_Reg
1426// output: retry_count_Reg decremented by 1
1427// flags are killed
1428void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
1429  Label doneRetry;
1430  assert(abort_status_Reg == rax, "");
1431  // The abort reason bits are in eax (see all states in rtmLocking.hpp)
1432  // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
1433  // if reason is in 0x6 and retry count != 0 then retry
1434  andptr(abort_status_Reg, 0x6);
1435  jccb(Assembler::zero, doneRetry);
1436  testl(retry_count_Reg, retry_count_Reg);
1437  jccb(Assembler::zero, doneRetry);
1438  pause();
1439  decrementl(retry_count_Reg);
1440  jmp(retryLabel);
1441  bind(doneRetry);
1442}
1443
1444// Spin and retry if lock is busy,
1445// inputs: box_Reg (monitor address)
1446//       : retry_count_Reg
1447// output: retry_count_Reg decremented by 1
1448//       : clear z flag if retry count exceeded
1449// tmp_Reg, scr_Reg, flags are killed
1450void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
1451                                            Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
1452  Label SpinLoop, SpinExit, doneRetry;
1453  // Clean monitor_value bit to get valid pointer
1454  int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value;
1455
1456  testl(retry_count_Reg, retry_count_Reg);
1457  jccb(Assembler::zero, doneRetry);
1458  decrementl(retry_count_Reg);
1459  movptr(scr_Reg, RTMSpinLoopCount);
1460
1461  bind(SpinLoop);
1462  pause();
1463  decrementl(scr_Reg);
1464  jccb(Assembler::lessEqual, SpinExit);
1465  movptr(tmp_Reg, Address(box_Reg, owner_offset));
1466  testptr(tmp_Reg, tmp_Reg);
1467  jccb(Assembler::notZero, SpinLoop);
1468
1469  bind(SpinExit);
1470  jmp(retryLabel);
1471  bind(doneRetry);
1472  incrementl(retry_count_Reg); // clear z flag
1473}
1474
1475// Use RTM for normal stack locks
1476// Input: objReg (object to lock)
1477void MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
1478                                       Register retry_on_abort_count_Reg,
1479                                       RTMLockingCounters* stack_rtm_counters,
1480                                       Metadata* method_data, bool profile_rtm,
1481                                       Label& DONE_LABEL, Label& IsInflated) {
1482  assert(UseRTMForStackLocks, "why call this otherwise?");
1483  assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
1484  assert(tmpReg == rax, "");
1485  assert(scrReg == rdx, "");
1486  Label L_rtm_retry, L_decrement_retry, L_on_abort;
1487
1488  if (RTMRetryCount > 0) {
1489    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
1490    bind(L_rtm_retry);
1491  }
1492  movptr(tmpReg, Address(objReg, 0));
1493  testptr(tmpReg, markOopDesc::monitor_value);  // inflated vs stack-locked|neutral|biased
1494  jcc(Assembler::notZero, IsInflated);
1495
1496  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1497    Label L_noincrement;
1498    if (RTMTotalCountIncrRate > 1) {
1499      // tmpReg, scrReg and flags are killed
1500      branch_on_random_using_rdtsc(tmpReg, scrReg, (int)RTMTotalCountIncrRate, L_noincrement);
1501    }
1502    assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
1503    atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
1504    bind(L_noincrement);
1505  }
1506  xbegin(L_on_abort);
1507  movptr(tmpReg, Address(objReg, 0));       // fetch markword
1508  andptr(tmpReg, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
1509  cmpptr(tmpReg, markOopDesc::unlocked_value);            // bits = 001 unlocked
1510  jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
1511
1512  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
1513  if (UseRTMXendForLockBusy) {
1514    xend();
1515    movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
1516    jmp(L_decrement_retry);
1517  }
1518  else {
1519    xabort(0);
1520  }
1521  bind(L_on_abort);
1522  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1523    rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
1524  }
1525  bind(L_decrement_retry);
1526  if (RTMRetryCount > 0) {
1527    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
1528    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
1529  }
1530}
1531
1532// Use RTM for inflating locks
1533// inputs: objReg (object to lock)
1534//         boxReg (on-stack box address (displaced header location) - KILLED)
1535//         tmpReg (ObjectMonitor address + 2(monitor_value))
1536void MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
1537                                          Register scrReg, Register retry_on_busy_count_Reg,
1538                                          Register retry_on_abort_count_Reg,
1539                                          RTMLockingCounters* rtm_counters,
1540                                          Metadata* method_data, bool profile_rtm,
1541                                          Label& DONE_LABEL) {
1542  assert(UseRTMLocking, "why call this otherwise?");
1543  assert(tmpReg == rax, "");
1544  assert(scrReg == rdx, "");
1545  Label L_rtm_retry, L_decrement_retry, L_on_abort;
1546  // Clean monitor_value bit to get valid pointer
1547  int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value;
1548
1549  // Without cast to int32_t a movptr will destroy r10 which is typically obj
1550  movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
1551  movptr(boxReg, tmpReg); // Save ObjectMonitor address
1552
1553  if (RTMRetryCount > 0) {
1554    movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
1555    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
1556    bind(L_rtm_retry);
1557  }
1558  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1559    Label L_noincrement;
1560    if (RTMTotalCountIncrRate > 1) {
1561      // tmpReg, scrReg and flags are killed
1562      branch_on_random_using_rdtsc(tmpReg, scrReg, (int)RTMTotalCountIncrRate, L_noincrement);
1563    }
1564    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
1565    atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
1566    bind(L_noincrement);
1567  }
1568  xbegin(L_on_abort);
1569  movptr(tmpReg, Address(objReg, 0));
1570  movptr(tmpReg, Address(tmpReg, owner_offset));
1571  testptr(tmpReg, tmpReg);
1572  jcc(Assembler::zero, DONE_LABEL);
1573  if (UseRTMXendForLockBusy) {
1574    xend();
1575    jmp(L_decrement_retry);
1576  }
1577  else {
1578    xabort(0);
1579  }
1580  bind(L_on_abort);
1581  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
1582  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1583    rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
1584  }
1585  if (RTMRetryCount > 0) {
1586    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
1587    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
1588  }
1589
1590  movptr(tmpReg, Address(boxReg, owner_offset)) ;
1591  testptr(tmpReg, tmpReg) ;
1592  jccb(Assembler::notZero, L_decrement_retry) ;
1593
1594  // Appears unlocked - try to swing _owner from null to non-null.
1595  // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
1596#ifdef _LP64
1597  Register threadReg = r15_thread;
1598#else
1599  get_thread(scrReg);
1600  Register threadReg = scrReg;
1601#endif
1602  if (os::is_MP()) {
1603    lock();
1604  }
1605  cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
1606
1607  if (RTMRetryCount > 0) {
1608    // success done else retry
1609    jccb(Assembler::equal, DONE_LABEL) ;
1610    bind(L_decrement_retry);
1611    // Spin and retry if lock is busy.
1612    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
1613  }
1614  else {
1615    bind(L_decrement_retry);
1616  }
1617}
1618
1619#endif //  INCLUDE_RTM_OPT
1620
1621// Fast_Lock and Fast_Unlock used by C2
1622
1623// Because the transitions from emitted code to the runtime
1624// monitorenter/exit helper stubs are so slow it's critical that
1625// we inline both the stack-locking fast-path and the inflated fast path.
1626//
1627// See also: cmpFastLock and cmpFastUnlock.
1628//
1629// What follows is a specialized inline transliteration of the code
1630// in slow_enter() and slow_exit().  If we're concerned about I$ bloat
1631// another option would be to emit TrySlowEnter and TrySlowExit methods
1632// at startup-time.  These methods would accept arguments as
1633// (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
1634// indications in the icc.ZFlag.  Fast_Lock and Fast_Unlock would simply
1635// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
1636// In practice, however, the # of lock sites is bounded and is usually small.
1637// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
1638// if the processor uses simple bimodal branch predictors keyed by EIP
1639// Since the helper routines would be called from multiple synchronization
1640// sites.
1641//
1642// An even better approach would be write "MonitorEnter()" and "MonitorExit()"
1643// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
1644// to those specialized methods.  That'd give us a mostly platform-independent
1645// implementation that the JITs could optimize and inline at their pleasure.
1646// Done correctly, the only time we'd need to cross to native code would be
1647// to park() or unpark() threads.  We'd also need a few more unsafe operators
1648// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
1649// (b) explicit barriers or fence operations.
1650//
1651// TODO:
1652//
1653// *  Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr).
1654//    This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals.
1655//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
1656//    the lock operators would typically be faster than reifying Self.
1657//
1658// *  Ideally I'd define the primitives as:
1659//       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
1660//       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
1661//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
1662//    Instead, we're stuck with a rather awkward and brittle register assignments below.
1663//    Furthermore the register assignments are overconstrained, possibly resulting in
1664//    sub-optimal code near the synchronization site.
1665//
1666// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
1667//    Alternately, use a better sp-proximity test.
1668//
1669// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
1670//    Either one is sufficient to uniquely identify a thread.
1671//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
1672//
1673// *  Intrinsify notify() and notifyAll() for the common cases where the
1674//    object is locked by the calling thread but the waitlist is empty.
1675//    avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
1676//
1677// *  use jccb and jmpb instead of jcc and jmp to improve code density.
1678//    But beware of excessive branch density on AMD Opterons.
1679//
1680// *  Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success
1681//    or failure of the fast-path.  If the fast-path fails then we pass
1682//    control to the slow-path, typically in C.  In Fast_Lock and
1683//    Fast_Unlock we often branch to DONE_LABEL, just to find that C2
1684//    will emit a conditional branch immediately after the node.
1685//    So we have branches to branches and lots of ICC.ZF games.
1686//    Instead, it might be better to have C2 pass a "FailureLabel"
1687//    into Fast_Lock and Fast_Unlock.  In the case of success, control
1688//    will drop through the node.  ICC.ZF is undefined at exit.
1689//    In the case of failure, the node will branch directly to the
1690//    FailureLabel
1691
1692
1693// obj: object to lock
1694// box: on-stack box address (displaced header location) - KILLED
1695// rax,: tmp -- KILLED
1696// scr: tmp -- KILLED
1697void MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
1698                               Register scrReg, Register cx1Reg, Register cx2Reg,
1699                               BiasedLockingCounters* counters,
1700                               RTMLockingCounters* rtm_counters,
1701                               RTMLockingCounters* stack_rtm_counters,
1702                               Metadata* method_data,
1703                               bool use_rtm, bool profile_rtm) {
1704  // Ensure the register assignents are disjoint
1705  assert(tmpReg == rax, "");
1706
1707  if (use_rtm) {
1708    assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
1709  } else {
1710    assert(cx1Reg == noreg, "");
1711    assert(cx2Reg == noreg, "");
1712    assert_different_registers(objReg, boxReg, tmpReg, scrReg);
1713  }
1714
1715  if (counters != NULL) {
1716    atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
1717  }
1718  if (EmitSync & 1) {
1719      // set box->dhw = unused_mark (3)
1720      // Force all sync thru slow-path: slow_enter() and slow_exit()
1721      movptr (Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
1722      cmpptr (rsp, (int32_t)NULL_WORD);
1723  } else
1724  if (EmitSync & 2) {
1725      Label DONE_LABEL ;
1726      if (UseBiasedLocking) {
1727         // Note: tmpReg maps to the swap_reg argument and scrReg to the tmp_reg argument.
1728         biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, counters);
1729      }
1730
1731      movptr(tmpReg, Address(objReg, 0));           // fetch markword
1732      orptr (tmpReg, 0x1);
1733      movptr(Address(boxReg, 0), tmpReg);           // Anticipate successful CAS
1734      if (os::is_MP()) {
1735        lock();
1736      }
1737      cmpxchgptr(boxReg, Address(objReg, 0));       // Updates tmpReg
1738      jccb(Assembler::equal, DONE_LABEL);
1739      // Recursive locking
1740      subptr(tmpReg, rsp);
1741      andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
1742      movptr(Address(boxReg, 0), tmpReg);
1743      bind(DONE_LABEL);
1744  } else {
1745    // Possible cases that we'll encounter in fast_lock
1746    // ------------------------------------------------
1747    // * Inflated
1748    //    -- unlocked
1749    //    -- Locked
1750    //       = by self
1751    //       = by other
1752    // * biased
1753    //    -- by Self
1754    //    -- by other
1755    // * neutral
1756    // * stack-locked
1757    //    -- by self
1758    //       = sp-proximity test hits
1759    //       = sp-proximity test generates false-negative
1760    //    -- by other
1761    //
1762
1763    Label IsInflated, DONE_LABEL;
1764
1765    // it's stack-locked, biased or neutral
1766    // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
1767    // order to reduce the number of conditional branches in the most common cases.
1768    // Beware -- there's a subtle invariant that fetch of the markword
1769    // at [FETCH], below, will never observe a biased encoding (*101b).
1770    // If this invariant is not held we risk exclusion (safety) failure.
1771    if (UseBiasedLocking && !UseOptoBiasInlining) {
1772      biased_locking_enter(boxReg, objReg, tmpReg, scrReg, true, DONE_LABEL, NULL, counters);
1773    }
1774
1775#if INCLUDE_RTM_OPT
1776    if (UseRTMForStackLocks && use_rtm) {
1777      rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
1778                        stack_rtm_counters, method_data, profile_rtm,
1779                        DONE_LABEL, IsInflated);
1780    }
1781#endif // INCLUDE_RTM_OPT
1782
1783    movptr(tmpReg, Address(objReg, 0));          // [FETCH]
1784    testptr(tmpReg, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased
1785    jccb(Assembler::notZero, IsInflated);
1786
1787    // Attempt stack-locking ...
1788    orptr (tmpReg, markOopDesc::unlocked_value);
1789    movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
1790    if (os::is_MP()) {
1791      lock();
1792    }
1793    cmpxchgptr(boxReg, Address(objReg, 0));      // Updates tmpReg
1794    if (counters != NULL) {
1795      cond_inc32(Assembler::equal,
1796                 ExternalAddress((address)counters->fast_path_entry_count_addr()));
1797    }
1798    jcc(Assembler::equal, DONE_LABEL);           // Success
1799
1800    // Recursive locking.
1801    // The object is stack-locked: markword contains stack pointer to BasicLock.
1802    // Locked by current thread if difference with current SP is less than one page.
1803    subptr(tmpReg, rsp);
1804    // Next instruction set ZFlag == 1 (Success) if difference is less then one page.
1805    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
1806    movptr(Address(boxReg, 0), tmpReg);
1807    if (counters != NULL) {
1808      cond_inc32(Assembler::equal,
1809                 ExternalAddress((address)counters->fast_path_entry_count_addr()));
1810    }
1811    jmp(DONE_LABEL);
1812
1813    bind(IsInflated);
1814    // The object is inflated. tmpReg contains pointer to ObjectMonitor* + 2(monitor_value)
1815
1816#if INCLUDE_RTM_OPT
1817    // Use the same RTM locking code in 32- and 64-bit VM.
1818    if (use_rtm) {
1819      rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
1820                           rtm_counters, method_data, profile_rtm, DONE_LABEL);
1821    } else {
1822#endif // INCLUDE_RTM_OPT
1823
1824#ifndef _LP64
1825    // The object is inflated.
1826    //
1827    // TODO-FIXME: eliminate the ugly use of manifest constants:
1828    //   Use markOopDesc::monitor_value instead of "2".
1829    //   use markOop::unused_mark() instead of "3".
1830    // The tmpReg value is an objectMonitor reference ORed with
1831    // markOopDesc::monitor_value (2).   We can either convert tmpReg to an
1832    // objectmonitor pointer by masking off the "2" bit or we can just
1833    // use tmpReg as an objectmonitor pointer but bias the objectmonitor
1834    // field offsets with "-2" to compensate for and annul the low-order tag bit.
1835    //
1836    // I use the latter as it avoids AGI stalls.
1837    // As such, we write "mov r, [tmpReg+OFFSETOF(Owner)-2]"
1838    // instead of "mov r, [tmpReg+OFFSETOF(Owner)]".
1839    //
1840    #define OFFSET_SKEWED(f) ((ObjectMonitor::f ## _offset_in_bytes())-2)
1841
1842    // boxReg refers to the on-stack BasicLock in the current frame.
1843    // We'd like to write:
1844    //   set box->_displaced_header = markOop::unused_mark().  Any non-0 value suffices.
1845    // This is convenient but results a ST-before-CAS penalty.  The following CAS suffers
1846    // additional latency as we have another ST in the store buffer that must drain.
1847
1848    if (EmitSync & 8192) {
1849       movptr(Address(boxReg, 0), 3);            // results in ST-before-CAS penalty
1850       get_thread (scrReg);
1851       movptr(boxReg, tmpReg);                    // consider: LEA box, [tmp-2]
1852       movptr(tmpReg, NULL_WORD);                 // consider: xor vs mov
1853       if (os::is_MP()) {
1854         lock();
1855       }
1856       cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
1857    } else
1858    if ((EmitSync & 128) == 0) {                      // avoid ST-before-CAS
1859       movptr(scrReg, boxReg);
1860       movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
1861
1862       // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
1863       if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
1864          // prefetchw [eax + Offset(_owner)-2]
1865          prefetchw(Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
1866       }
1867
1868       if ((EmitSync & 64) == 0) {
1869         // Optimistic form: consider XORL tmpReg,tmpReg
1870         movptr(tmpReg, NULL_WORD);
1871       } else {
1872         // Can suffer RTS->RTO upgrades on shared or cold $ lines
1873         // Test-And-CAS instead of CAS
1874         movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));   // rax, = m->_owner
1875         testptr(tmpReg, tmpReg);                   // Locked ?
1876         jccb  (Assembler::notZero, DONE_LABEL);
1877       }
1878
1879       // Appears unlocked - try to swing _owner from null to non-null.
1880       // Ideally, I'd manifest "Self" with get_thread and then attempt
1881       // to CAS the register containing Self into m->Owner.
1882       // But we don't have enough registers, so instead we can either try to CAS
1883       // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
1884       // we later store "Self" into m->Owner.  Transiently storing a stack address
1885       // (rsp or the address of the box) into  m->owner is harmless.
1886       // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
1887       if (os::is_MP()) {
1888         lock();
1889       }
1890       cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
1891       movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
1892       jccb  (Assembler::notZero, DONE_LABEL);
1893       get_thread (scrReg);                    // beware: clobbers ICCs
1894       movptr(Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2), scrReg);
1895       xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
1896
1897       // If the CAS fails we can either retry or pass control to the slow-path.
1898       // We use the latter tactic.
1899       // Pass the CAS result in the icc.ZFlag into DONE_LABEL
1900       // If the CAS was successful ...
1901       //   Self has acquired the lock
1902       //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
1903       // Intentional fall-through into DONE_LABEL ...
1904    } else {
1905       movptr(Address(boxReg, 0), intptr_t(markOopDesc::unused_mark()));  // results in ST-before-CAS penalty
1906       movptr(boxReg, tmpReg);
1907
1908       // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
1909       if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
1910          // prefetchw [eax + Offset(_owner)-2]
1911          prefetchw(Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
1912       }
1913
1914       if ((EmitSync & 64) == 0) {
1915         // Optimistic form
1916         xorptr  (tmpReg, tmpReg);
1917       } else {
1918         // Can suffer RTS->RTO upgrades on shared or cold $ lines
1919         movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));   // rax, = m->_owner
1920         testptr(tmpReg, tmpReg);                   // Locked ?
1921         jccb  (Assembler::notZero, DONE_LABEL);
1922       }
1923
1924       // Appears unlocked - try to swing _owner from null to non-null.
1925       // Use either "Self" (in scr) or rsp as thread identity in _owner.
1926       // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
1927       get_thread (scrReg);
1928       if (os::is_MP()) {
1929         lock();
1930       }
1931       cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
1932
1933       // If the CAS fails we can either retry or pass control to the slow-path.
1934       // We use the latter tactic.
1935       // Pass the CAS result in the icc.ZFlag into DONE_LABEL
1936       // If the CAS was successful ...
1937       //   Self has acquired the lock
1938       //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
1939       // Intentional fall-through into DONE_LABEL ...
1940    }
1941#else // _LP64
1942    // It's inflated
1943
1944    // TODO: someday avoid the ST-before-CAS penalty by
1945    // relocating (deferring) the following ST.
1946    // We should also think about trying a CAS without having
1947    // fetched _owner.  If the CAS is successful we may
1948    // avoid an RTO->RTS upgrade on the $line.
1949
1950    // Without cast to int32_t a movptr will destroy r10 which is typically obj
1951    movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
1952
1953    movptr (boxReg, tmpReg);
1954    movptr (tmpReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
1955    testptr(tmpReg, tmpReg);
1956    jccb   (Assembler::notZero, DONE_LABEL);
1957
1958    // It's inflated and appears unlocked
1959    if (os::is_MP()) {
1960      lock();
1961    }
1962    cmpxchgptr(r15_thread, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
1963    // Intentional fall-through into DONE_LABEL ...
1964#endif // _LP64
1965
1966#if INCLUDE_RTM_OPT
1967    } // use_rtm()
1968#endif
1969    // DONE_LABEL is a hot target - we'd really like to place it at the
1970    // start of cache line by padding with NOPs.
1971    // See the AMD and Intel software optimization manuals for the
1972    // most efficient "long" NOP encodings.
1973    // Unfortunately none of our alignment mechanisms suffice.
1974    bind(DONE_LABEL);
1975
1976    // At DONE_LABEL the icc ZFlag is set as follows ...
1977    // Fast_Unlock uses the same protocol.
1978    // ZFlag == 1 -> Success
1979    // ZFlag == 0 -> Failure - force control through the slow-path
1980  }
1981}
1982
1983// obj: object to unlock
1984// box: box address (displaced header location), killed.  Must be EAX.
1985// tmp: killed, cannot be obj nor box.
1986//
1987// Some commentary on balanced locking:
1988//
1989// Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
1990// Methods that don't have provably balanced locking are forced to run in the
1991// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
1992// The interpreter provides two properties:
1993// I1:  At return-time the interpreter automatically and quietly unlocks any
1994//      objects acquired by the current activation (frame).  Recall that the
1995//      interpreter maintains an on-stack list of locks currently held by
1996//      a frame.
1997// I2:  If a method attempts to unlock an object that is not held by
1998//      the frame the interpreter throws IMSX.
1999//
2000// Let's say A(), which has provably balanced locking, acquires O and then calls B().
2001// B() doesn't have provably balanced locking so it runs in the interpreter.
2002// Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
2003// is still locked by A().
2004//
2005// The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
2006// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
2007// should not be unlocked by "normal" java-level locking and vice-versa.  The specification
2008// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
2009
// Emits the inline fast path for monitor exit (the Fast_Unlock sequence).
//
// objReg:  object to unlock
// boxReg:  box address (displaced header location); asserted to be rax, which is
//          the implicit comparand of the cmpxchg instructions emitted below
// tmpReg:  scratch; asserted to differ from objReg and boxReg
// use_rtm: emit the RTM (xend) exit variants; only looked at under INCLUDE_RTM_OPT
//
// The result is reported in ZF: ZF == 1 means success, ZF == 0 means the caller
// must take the slow path.
2010void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
2011  assert(boxReg == rax, "");
2012  assert_different_registers(objReg, boxReg, tmpReg);
2013
2014  if (EmitSync & 4) {
2015    // Disable - inhibit all inlining.  Force control through the slow-path
2016    cmpptr (rsp, 0);
2017  } else
2018  if (EmitSync & 8) {
2019    Label DONE_LABEL;
2020    if (UseBiasedLocking) {
2021       biased_locking_exit(objReg, tmpReg, DONE_LABEL);
2022    }
2023    // Classic stack-locking code ...
2024    // Check whether the displaced header is 0
2025    //(=> recursive unlock)
2026    movptr(tmpReg, Address(boxReg, 0));
2027    testptr(tmpReg, tmpReg);
2028    jccb(Assembler::zero, DONE_LABEL);
2029    // If not recursive lock, reset the header to displaced header
2030    if (os::is_MP()) {
2031      lock();
2032    }
2033    cmpxchgptr(tmpReg, Address(objReg, 0));   // Uses RAX which is box
2034    bind(DONE_LABEL);
2035  } else {
2036    Label DONE_LABEL, Stacked, CheckSucc;
2037
2038    // Critically, the biased locking test must have precedence over
2039    // and appear before the (box->dhw == 0) recursive stack-lock test.
2040    if (UseBiasedLocking && !UseOptoBiasInlining) {
2041       biased_locking_exit(objReg, tmpReg, DONE_LABEL);
2042    }
2043
2044#if INCLUDE_RTM_OPT
2045    if (UseRTMForStackLocks && use_rtm) {
2046      assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2047      Label L_regular_unlock;
2048      movptr(tmpReg, Address(objReg, 0));           // fetch markword
2049      andptr(tmpReg, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2050      cmpptr(tmpReg, markOopDesc::unlocked_value);            // bits = 001 unlocked
2051      jccb(Assembler::notEqual, L_regular_unlock);  // if !HLE RegularLock
2052      xend();                                       // otherwise end...
2053      jmp(DONE_LABEL);                              // ... and we're done
2054      bind(L_regular_unlock);
2055    }
2056#endif
2057
2058    cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD); // Examine the displaced header
2059    jcc   (Assembler::zero, DONE_LABEL);            // 0 indicates recursive stack-lock
2060    movptr(tmpReg, Address(objReg, 0));             // Examine the object's markword
2061    testptr(tmpReg, markOopDesc::monitor_value);    // Inflated?
2062    jccb  (Assembler::zero, Stacked);
2063
2064    // It's inflated.
    // From here on tmpReg holds the markword fetched above, i.e. the monitor
    // address with the monitor_value tag bit still set; the "-2" applied to the
    // field offsets below cancels that tag.
2065#if INCLUDE_RTM_OPT
2066    if (use_rtm) {
2067      Label L_regular_inflated_unlock;
2068      // Clean monitor_value bit to get valid pointer
2069      int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value;
2070      movptr(boxReg, Address(tmpReg, owner_offset));
2071      testptr(boxReg, boxReg);
2072      jccb(Assembler::notZero, L_regular_inflated_unlock);
2073      xend();
2074      jmpb(DONE_LABEL);
2075      bind(L_regular_inflated_unlock);
2076    }
2077#endif
2078
2079    // Despite our balanced locking property we still check that m->_owner == Self
2080    // as java routines or native JNI code called by this thread might
2081    // have released the lock.
2082    // Refer to the comments in synchronizer.cpp for how we might encode extra
2083    // state in _succ so we can avoid fetching EntryList|cxq.
2084    //
2085    // I'd like to add more cases in fast_lock() and fast_unlock() --
2086    // such as recursive enter and exit -- but we have to be wary of
2087    // I$ bloat, T$ effects and BP$ effects.
2088    //
2089    // If there's no contention try a 1-0 exit.  That is, exit without
2090    // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
2091    // we detect and recover from the race that the 1-0 exit admits.
2092    //
2093    // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier
2094    // before it STs null into _owner, releasing the lock.  Updates
2095    // to data protected by the critical section must be visible before
2096    // we drop the lock (and thus before any other thread could acquire
2097    // the lock and observe the fields protected by the lock).
2098    // IA32's memory-model is SPO, so STs are ordered with respect to
2099    // each other and there's no need for an explicit barrier (fence).
2100    // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
2101#ifndef _LP64
2102    get_thread (boxReg);
2103    if ((EmitSync & 4096) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
2104      // prefetchw [tmpReg + Offset(_owner)-2]
2105      prefetchw(Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
2106    }
2107
2108    // Note that we could employ various encoding schemes to reduce
2109    // the number of loads below (currently 4) to just 2 or 3.
2110    // Refer to the comments in synchronizer.cpp.
2111    // In practice the chain of fetches doesn't seem to impact performance, however.
2112    if ((EmitSync & 65536) == 0 && (EmitSync & 256)) {
2113       // Attempt to reduce branch density - AMD's branch predictor.
2114       xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
2115       orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2));
2116       orptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2));
2117       orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2));
2118       jccb  (Assembler::notZero, DONE_LABEL);
2119       movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD);
2120       jmpb  (DONE_LABEL);
2121    } else {
2122       xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
2123       orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2));
2124       jccb  (Assembler::notZero, DONE_LABEL);
2125       movptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2));
2126       orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2));
2127       jccb  (Assembler::notZero, CheckSucc);
2128       movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD);
2129       jmpb  (DONE_LABEL);
2130    }
2131
2132    // The following code fragment (EmitSync & 65536) improves the performance of
2133    // contended applications and contended synchronization microbenchmarks.
2134    // Unfortunately the emission of the code - even though not executed - causes regressions
2135    // in scimark and jetstream, evidently because of $ effects.  Replacing the code
2136    // with an equal number of never-executed NOPs results in the same regression.
2137    // We leave it off by default.
2138
2139    if ((EmitSync & 65536) != 0) {
2140       Label LSuccess, LGoSlowPath ;
2141
2142       bind  (CheckSucc);
2143
2144       // Optional pre-test ... it's safe to elide this
2145       if ((EmitSync & 16) == 0) {
2146          cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), (int32_t)NULL_WORD);
2147          jccb  (Assembler::zero, LGoSlowPath);
2148       }
2149
2150       // We have a classic Dekker-style idiom:
2151       //    ST m->_owner = 0 ; MEMBAR; LD m->_succ
2152       // There are a number of ways to implement the barrier:
2153       // (1) lock:andl &m->_owner, 0
2154       //     is fast, but masm doesn't currently support the "ANDL M,IMM32" form.
2155       //     LOCK: ANDL [ebx+Offset(_Owner)-2], 0
2156       //     Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8
2157       // (2) If supported, an explicit MFENCE is appealing.
2158       //     In older IA32 processors MFENCE is slower than lock:add or xchg
2159       //     particularly if the write-buffer is full as might be the case
2160       //     if stores closely precede the fence or fence-equivalent instruction.
2161       //     In more modern implementations MFENCE appears faster, however.
2162       // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack
2163       //     The $lines underlying the top-of-stack should be in M-state.
2164       //     The locked add instruction is serializing, of course.
2165       // (4) Use xchg, which is serializing
2166       //     mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works
2167       // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0.
2168       //     The integer condition codes will tell us if succ was 0.
2169       //     Since _succ and _owner should reside in the same $line and
2170       //     we just stored into _owner, it's likely that the $line
2171       //     remains in M-state for the lock:orl.
2172       //
2173       // We currently use (3), although it's likely that switching to (2)
2174       // is correct for the future.
2175
2176       movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD);
2177       if (os::is_MP()) {
2178          if (VM_Version::supports_sse2() && 1 == FenceInstruction) {
2179            mfence();
2180          } else {
2181            lock (); addptr(Address(rsp, 0), 0);
2182          }
2183       }
2184       // Ratify _succ remains non-null
2185       cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0);
2186       jccb  (Assembler::notZero, LSuccess);
2187
       // _succ is null here: try to take the lock back (CAS _owner from null)
       // before handing control to the slow path.
2188       xorptr(boxReg, boxReg);                  // box is really EAX
2189       if (os::is_MP()) { lock(); }
2190       cmpxchgptr(rsp, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
2191       jccb  (Assembler::notEqual, LSuccess);
2192       // Since we're low on registers we installed rsp as a placeholder in _owner.
2193       // Now install Self over rsp.  This is safe as we're transitioning from
2194       // non-null to non-null
2195       get_thread (boxReg);
2196       movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), boxReg);
2197       // Intentional fall-through into LGoSlowPath ...
2198
2199       bind  (LGoSlowPath);
2200       orptr(boxReg, 1);                      // set ICC.ZF=0 to indicate failure
2201       jmpb  (DONE_LABEL);
2202
2203       bind  (LSuccess);
2204       xorptr(boxReg, boxReg);                 // set ICC.ZF=1 to indicate success
2205       jmpb  (DONE_LABEL);
2206    }
2207
2208    bind (Stacked);
2209    // It's not inflated and it's not recursively stack-locked and it's not biased.
2210    // It must be stack-locked.
2211    // Try to reset the header to displaced header.
2212    // The "box" value on the stack is stable, so we can reload
2213    // and be assured we observe the same value as above.
2214    movptr(tmpReg, Address(boxReg, 0));
2215    if (os::is_MP()) {
2216      lock();
2217    }
2218    cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box
2219    // Intentional fall-thru into DONE_LABEL
2220
2221    // DONE_LABEL is a hot target - we'd really like to place it at the
2222    // start of cache line by padding with NOPs.
2223    // See the AMD and Intel software optimization manuals for the
2224    // most efficient "long" NOP encodings.
2225    // Unfortunately none of our alignment mechanisms suffice.
    // When the _succ-checking fragment above was not emitted, CheckSucc is
    // still a branch target, so bind it here, right before DONE_LABEL.
2226    if ((EmitSync & 65536) == 0) {
2227       bind (CheckSucc);
2228    }
2229#else // _LP64
2230    // It's inflated
2231    movptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
2232    xorptr(boxReg, r15_thread);
2233    orptr (boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2));
2234    jccb  (Assembler::notZero, DONE_LABEL);
2235    movptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2));
2236    orptr (boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2));
2237    jccb  (Assembler::notZero, CheckSucc);
2238    movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), (int32_t)NULL_WORD);
2239    jmpb  (DONE_LABEL);
2240
    // Note: the polarity of the EmitSync & 65536 test here is the opposite of
    // the 32-bit code above; on 64-bit the _succ check is emitted unless the
    // bit is set.
2241    if ((EmitSync & 65536) == 0) {
2242      Label LSuccess, LGoSlowPath ;
2243      bind  (CheckSucc);
2244      cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), (int32_t)NULL_WORD);
2245      jccb  (Assembler::zero, LGoSlowPath);
2246
2247      // I'd much rather use lock:andl m->_owner, 0 as it's faster than
2248      // the explicit ST;MEMBAR combination, but masm doesn't currently support
2249      // "ANDQ M,IMM".  Don't use MFENCE here.  lock:add to TOS, xchg, etc
2250      // are all faster when the write buffer is populated.
2251      movptr (Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), (int32_t)NULL_WORD);
2252      if (os::is_MP()) {
2253         lock (); addl (Address(rsp, 0), 0);
2254      }
2255      cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), (int32_t)NULL_WORD);
2256      jccb  (Assembler::notZero, LSuccess);
2257
2258      movptr (boxReg, (int32_t)NULL_WORD);                   // box is really EAX
2259      if (os::is_MP()) { lock(); }
2260      cmpxchgptr(r15_thread, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
2261      jccb  (Assembler::notEqual, LSuccess);
2262      // Intentional fall-through into slow-path
2263
2264      bind  (LGoSlowPath);
2265      orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
2266      jmpb  (DONE_LABEL);
2267
2268      bind  (LSuccess);
2269      testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
2270      jmpb  (DONE_LABEL);
2271    }
2272
2273    bind  (Stacked);
2274    movptr(tmpReg, Address (boxReg, 0));      // re-fetch
2275    if (os::is_MP()) { lock(); }
2276    cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box
2277
    // CheckSucc must still be bound when the fragment above was not emitted.
2278    if (EmitSync & 65536) {
2279       bind (CheckSucc);
2280    }
2281#endif
2282    bind(DONE_LABEL);
2283    // Avoid branch to branch on AMD processors
2284    if (EmitSync & 32768) {
2285       nop();
2286    }
2287  }
2288}
2289#endif // COMPILER2
2290
2291void MacroAssembler::c2bool(Register x) {
2292  // implements x == 0 ? 0 : 1
2293  // note: must only look at least-significant byte of x
2294  //       since C-style booleans are stored in one byte
2295  //       only! (was bug)
2296  andl(x, 0xFF);
2297  setb(Assembler::notZero, x);
2298}
2299
2300// Wouldn't need if AddressLiteral version had new name
2301void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
2302  Assembler::call(L, rtype);
2303}
2304
2305void MacroAssembler::call(Register entry) {
2306  Assembler::call(entry);
2307}
2308
2309void MacroAssembler::call(AddressLiteral entry) {
2310  if (reachable(entry)) {
2311    Assembler::call_literal(entry.target(), entry.rspec());
2312  } else {
2313    lea(rscratch1, entry);
2314    Assembler::call(rscratch1);
2315  }
2316}
2317
2318void MacroAssembler::ic_call(address entry) {
2319  RelocationHolder rh = virtual_call_Relocation::spec(pc());
2320  movptr(rax, (intptr_t)Universe::non_oop_word());
2321  call(AddressLiteral(entry, rh));
2322}
2323
2324// Implementation of call_VM versions
2325
2326void MacroAssembler::call_VM(Register oop_result,
2327                             address entry_point,
2328                             bool check_exceptions) {
2329  Label C, E;
2330  call(C, relocInfo::none);
2331  jmp(E);
2332
2333  bind(C);
2334  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
2335  ret(0);
2336
2337  bind(E);
2338}
2339
2340void MacroAssembler::call_VM(Register oop_result,
2341                             address entry_point,
2342                             Register arg_1,
2343                             bool check_exceptions) {
2344  Label C, E;
2345  call(C, relocInfo::none);
2346  jmp(E);
2347
2348  bind(C);
2349  pass_arg1(this, arg_1);
2350  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
2351  ret(0);
2352
2353  bind(E);
2354}
2355
2356void MacroAssembler::call_VM(Register oop_result,
2357                             address entry_point,
2358                             Register arg_1,
2359                             Register arg_2,
2360                             bool check_exceptions) {
2361  Label C, E;
2362  call(C, relocInfo::none);
2363  jmp(E);
2364
2365  bind(C);
2366
2367  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2368
2369  pass_arg2(this, arg_2);
2370  pass_arg1(this, arg_1);
2371  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
2372  ret(0);
2373
2374  bind(E);
2375}
2376
2377void MacroAssembler::call_VM(Register oop_result,
2378                             address entry_point,
2379                             Register arg_1,
2380                             Register arg_2,
2381                             Register arg_3,
2382                             bool check_exceptions) {
2383  Label C, E;
2384  call(C, relocInfo::none);
2385  jmp(E);
2386
2387  bind(C);
2388
2389  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2390  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2391  pass_arg3(this, arg_3);
2392
2393  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2394  pass_arg2(this, arg_2);
2395
2396  pass_arg1(this, arg_1);
2397  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
2398  ret(0);
2399
2400  bind(E);
2401}
2402
2403void MacroAssembler::call_VM(Register oop_result,
2404                             Register last_java_sp,
2405                             address entry_point,
2406                             int number_of_arguments,
2407                             bool check_exceptions) {
2408  Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
2409  call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
2410}
2411
2412void MacroAssembler::call_VM(Register oop_result,
2413                             Register last_java_sp,
2414                             address entry_point,
2415                             Register arg_1,
2416                             bool check_exceptions) {
2417  pass_arg1(this, arg_1);
2418  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
2419}
2420
2421void MacroAssembler::call_VM(Register oop_result,
2422                             Register last_java_sp,
2423                             address entry_point,
2424                             Register arg_1,
2425                             Register arg_2,
2426                             bool check_exceptions) {
2427
2428  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2429  pass_arg2(this, arg_2);
2430  pass_arg1(this, arg_1);
2431  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
2432}
2433
2434void MacroAssembler::call_VM(Register oop_result,
2435                             Register last_java_sp,
2436                             address entry_point,
2437                             Register arg_1,
2438                             Register arg_2,
2439                             Register arg_3,
2440                             bool check_exceptions) {
2441  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2442  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2443  pass_arg3(this, arg_3);
2444  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2445  pass_arg2(this, arg_2);
2446  pass_arg1(this, arg_1);
2447  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
2448}
2449
2450void MacroAssembler::super_call_VM(Register oop_result,
2451                                   Register last_java_sp,
2452                                   address entry_point,
2453                                   int number_of_arguments,
2454                                   bool check_exceptions) {
2455  Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
2456  MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
2457}
2458
2459void MacroAssembler::super_call_VM(Register oop_result,
2460                                   Register last_java_sp,
2461                                   address entry_point,
2462                                   Register arg_1,
2463                                   bool check_exceptions) {
2464  pass_arg1(this, arg_1);
2465  super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
2466}
2467
2468void MacroAssembler::super_call_VM(Register oop_result,
2469                                   Register last_java_sp,
2470                                   address entry_point,
2471                                   Register arg_1,
2472                                   Register arg_2,
2473                                   bool check_exceptions) {
2474
2475  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2476  pass_arg2(this, arg_2);
2477  pass_arg1(this, arg_1);
2478  super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
2479}
2480
2481void MacroAssembler::super_call_VM(Register oop_result,
2482                                   Register last_java_sp,
2483                                   address entry_point,
2484                                   Register arg_1,
2485                                   Register arg_2,
2486                                   Register arg_3,
2487                                   bool check_exceptions) {
2488  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2489  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2490  pass_arg3(this, arg_3);
2491  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2492  pass_arg2(this, arg_2);
2493  pass_arg1(this, arg_1);
2494  super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
2495}
2496
2497void MacroAssembler::call_VM_base(Register oop_result,
2498                                  Register java_thread,
2499                                  Register last_java_sp,
2500                                  address  entry_point,
2501                                  int      number_of_arguments,
2502                                  bool     check_exceptions) {
2503  // determine java_thread register
2504  if (!java_thread->is_valid()) {
2505#ifdef _LP64
2506    java_thread = r15_thread;
2507#else
2508    java_thread = rdi;
2509    get_thread(java_thread);
2510#endif // LP64
2511  }
2512  // determine last_java_sp register
2513  if (!last_java_sp->is_valid()) {
2514    last_java_sp = rsp;
2515  }
2516  // debugging support
2517  assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
2518  LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
2519#ifdef ASSERT
2520  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
2521  // r12 is the heapbase.
2522  LP64_ONLY(if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");)
2523#endif // ASSERT
2524
2525  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
2526  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
2527
2528  // push java thread (becomes first argument of C function)
2529
2530  NOT_LP64(push(java_thread); number_of_arguments++);
2531  LP64_ONLY(mov(c_rarg0, r15_thread));
2532
2533  // set last Java frame before call
2534  assert(last_java_sp != rbp, "can't use ebp/rbp");
2535
2536  // Only interpreter should have to set fp
2537  set_last_Java_frame(java_thread, last_java_sp, rbp, NULL);
2538
2539  // do the call, remove parameters
2540  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);
2541
2542  // restore the thread (cannot use the pushed argument since arguments
2543  // may be overwritten by C code generated by an optimizing compiler);
2544  // however can use the register value directly if it is callee saved.
2545  if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
2546    // rdi & rsi (also r15) are callee saved -> nothing to do
2547#ifdef ASSERT
2548    guarantee(java_thread != rax, "change this code");
2549    push(rax);
2550    { Label L;
2551      get_thread(rax);
2552      cmpptr(java_thread, rax);
2553      jcc(Assembler::equal, L);
2554      STOP("MacroAssembler::call_VM_base: rdi not callee saved?");
2555      bind(L);
2556    }
2557    pop(rax);
2558#endif
2559  } else {
2560    get_thread(java_thread);
2561  }
2562  // reset last Java frame
2563  // Only interpreter should have to clear fp
2564  reset_last_Java_frame(java_thread, true, false);
2565
2566#ifndef CC_INTERP
2567   // C++ interp handles this in the interpreter
2568  check_and_handle_popframe(java_thread);
2569  check_and_handle_earlyret(java_thread);
2570#endif /* CC_INTERP */
2571
2572  if (check_exceptions) {
2573    // check for pending exceptions (java_thread is set upon return)
2574    cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t) NULL_WORD);
2575#ifndef _LP64
2576    jump_cc(Assembler::notEqual,
2577            RuntimeAddress(StubRoutines::forward_exception_entry()));
2578#else
2579    // This used to conditionally jump to forward_exception however it is
2580    // possible if we relocate that the branch will not reach. So we must jump
2581    // around so we can always reach
2582
2583    Label ok;
2584    jcc(Assembler::equal, ok);
2585    jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2586    bind(ok);
2587#endif // LP64
2588  }
2589
2590  // get oop result if there is one and reset the value in the thread
2591  if (oop_result->is_valid()) {
2592    get_vm_result(oop_result, java_thread);
2593  }
2594}
2595
2596void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
2597
2598  // Calculate the value for last_Java_sp
2599  // somewhat subtle. call_VM does an intermediate call
2600  // which places a return address on the stack just under the
2601  // stack pointer as the user finsihed with it. This allows
2602  // use to retrieve last_Java_pc from last_Java_sp[-1].
2603  // On 32bit we then have to push additional args on the stack to accomplish
2604  // the actual requested call. On 64bit call_VM only can use register args
2605  // so the only extra space is the return address that call_VM created.
2606  // This hopefully explains the calculations here.
2607
2608#ifdef _LP64
2609  // We've pushed one address, correct last_Java_sp
2610  lea(rax, Address(rsp, wordSize));
2611#else
2612  lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
2613#endif // LP64
2614
2615  call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);
2616
2617}
2618
2619void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
2620  call_VM_leaf_base(entry_point, number_of_arguments);
2621}
2622
2623void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
2624  pass_arg0(this, arg_0);
2625  call_VM_leaf(entry_point, 1);
2626}
2627
2628void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
2629
2630  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2631  pass_arg1(this, arg_1);
2632  pass_arg0(this, arg_0);
2633  call_VM_leaf(entry_point, 2);
2634}
2635
2636void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
2637  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2638  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2639  pass_arg2(this, arg_2);
2640  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2641  pass_arg1(this, arg_1);
2642  pass_arg0(this, arg_0);
2643  call_VM_leaf(entry_point, 3);
2644}
2645
2646void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
2647  pass_arg0(this, arg_0);
2648  MacroAssembler::call_VM_leaf_base(entry_point, 1);
2649}
2650
2651void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
2652
2653  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2654  pass_arg1(this, arg_1);
2655  pass_arg0(this, arg_0);
2656  MacroAssembler::call_VM_leaf_base(entry_point, 2);
2657}
2658
2659void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
2660  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2661  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2662  pass_arg2(this, arg_2);
2663  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2664  pass_arg1(this, arg_1);
2665  pass_arg0(this, arg_0);
2666  MacroAssembler::call_VM_leaf_base(entry_point, 3);
2667}
2668
2669void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
2670  LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
2671  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2672  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2673  pass_arg3(this, arg_3);
2674  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2675  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2676  pass_arg2(this, arg_2);
2677  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2678  pass_arg1(this, arg_1);
2679  pass_arg0(this, arg_0);
2680  MacroAssembler::call_VM_leaf_base(entry_point, 4);
2681}
2682
2683void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
2684  movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
2685  movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD);
2686  verify_oop(oop_result, "broken oop in call_VM_base");
2687}
2688
2689void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
2690  movptr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
2691  movptr(Address(java_thread, JavaThread::vm_result_2_offset()), NULL_WORD);
2692}
2693
2694void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
2695}
2696
2697void MacroAssembler::check_and_handle_popframe(Register java_thread) {
2698}
2699
2700void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm) {
2701  if (reachable(src1)) {
2702    cmpl(as_Address(src1), imm);
2703  } else {
2704    lea(rscratch1, src1);
2705    cmpl(Address(rscratch1, 0), imm);
2706  }
2707}
2708
2709void MacroAssembler::cmp32(Register src1, AddressLiteral src2) {
2710  assert(!src2.is_lval(), "use cmpptr");
2711  if (reachable(src2)) {
2712    cmpl(src1, as_Address(src2));
2713  } else {
2714    lea(rscratch1, src2);
2715    cmpl(src1, Address(rscratch1, 0));
2716  }
2717}
2718
2719void MacroAssembler::cmp32(Register src1, int32_t imm) {
2720  Assembler::cmpl(src1, imm);
2721}
2722
2723void MacroAssembler::cmp32(Register src1, Address src2) {
2724  Assembler::cmpl(src1, src2);
2725}
2726
2727void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
2728  ucomisd(opr1, opr2);
2729
2730  Label L;
2731  if (unordered_is_less) {
2732    movl(dst, -1);
2733    jcc(Assembler::parity, L);
2734    jcc(Assembler::below , L);
2735    movl(dst, 0);
2736    jcc(Assembler::equal , L);
2737    increment(dst);
2738  } else { // unordered is greater
2739    movl(dst, 1);
2740    jcc(Assembler::parity, L);
2741    jcc(Assembler::above , L);
2742    movl(dst, 0);
2743    jcc(Assembler::equal , L);
2744    decrementl(dst);
2745  }
2746  bind(L);
2747}
2748
2749void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
2750  ucomiss(opr1, opr2);
2751
2752  Label L;
2753  if (unordered_is_less) {
2754    movl(dst, -1);
2755    jcc(Assembler::parity, L);
2756    jcc(Assembler::below , L);
2757    movl(dst, 0);
2758    jcc(Assembler::equal , L);
2759    increment(dst);
2760  } else { // unordered is greater
2761    movl(dst, 1);
2762    jcc(Assembler::parity, L);
2763    jcc(Assembler::above , L);
2764    movl(dst, 0);
2765    jcc(Assembler::equal , L);
2766    decrementl(dst);
2767  }
2768  bind(L);
2769}
2770
2771
2772void MacroAssembler::cmp8(AddressLiteral src1, int imm) {
2773  if (reachable(src1)) {
2774    cmpb(as_Address(src1), imm);
2775  } else {
2776    lea(rscratch1, src1);
2777    cmpb(Address(rscratch1, 0), imm);
2778  }
2779}
2780
2781void MacroAssembler::cmpptr(Register src1, AddressLiteral src2) {
2782#ifdef _LP64
2783  if (src2.is_lval()) {
2784    movptr(rscratch1, src2);
2785    Assembler::cmpq(src1, rscratch1);
2786  } else if (reachable(src2)) {
2787    cmpq(src1, as_Address(src2));
2788  } else {
2789    lea(rscratch1, src2);
2790    Assembler::cmpq(src1, Address(rscratch1, 0));
2791  }
2792#else
2793  if (src2.is_lval()) {
2794    cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
2795  } else {
2796    cmpl(src1, as_Address(src2));
2797  }
2798#endif // _LP64
2799}
2800
2801void MacroAssembler::cmpptr(Address src1, AddressLiteral src2) {
2802  assert(src2.is_lval(), "not a mem-mem compare");
2803#ifdef _LP64
2804  // moves src2's literal address
2805  movptr(rscratch1, src2);
2806  Assembler::cmpq(src1, rscratch1);
2807#else
2808  cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
2809#endif // _LP64
2810}
2811
2812void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr) {
2813  if (reachable(adr)) {
2814    if (os::is_MP())
2815      lock();
2816    cmpxchgptr(reg, as_Address(adr));
2817  } else {
2818    lea(rscratch1, adr);
2819    if (os::is_MP())
2820      lock();
2821    cmpxchgptr(reg, Address(rscratch1, 0));
2822  }
2823}
2824
2825void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
2826  LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr));
2827}
2828
2829void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) {
2830  if (reachable(src)) {
2831    Assembler::comisd(dst, as_Address(src));
2832  } else {
2833    lea(rscratch1, src);
2834    Assembler::comisd(dst, Address(rscratch1, 0));
2835  }
2836}
2837
2838void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) {
2839  if (reachable(src)) {
2840    Assembler::comiss(dst, as_Address(src));
2841  } else {
2842    lea(rscratch1, src);
2843    Assembler::comiss(dst, Address(rscratch1, 0));
2844  }
2845}
2846
2847
2848void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr) {
2849  Condition negated_cond = negate_condition(cond);
2850  Label L;
2851  jcc(negated_cond, L);
2852  pushf(); // Preserve flags
2853  atomic_incl(counter_addr);
2854  popf();
2855  bind(L);
2856}
2857
2858int MacroAssembler::corrected_idivl(Register reg) {
2859  // Full implementation of Java idiv and irem; checks for
2860  // special case as described in JVM spec., p.243 & p.271.
2861  // The function returns the (pc) offset of the idivl
2862  // instruction - may be needed for implicit exceptions.
2863  //
2864  //         normal case                           special case
2865  //
2866  // input : rax,: dividend                         min_int
2867  //         reg: divisor   (may not be rax,/rdx)   -1
2868  //
2869  // output: rax,: quotient  (= rax, idiv reg)       min_int
2870  //         rdx: remainder (= rax, irem reg)       0
2871  assert(reg != rax && reg != rdx, "reg cannot be rax, or rdx register");
2872  const int min_int = 0x80000000;
2873  Label normal_case, special_case;
2874
2875  // check for special case
2876  cmpl(rax, min_int);
2877  jcc(Assembler::notEqual, normal_case);
2878  xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
2879  cmpl(reg, -1);
2880  jcc(Assembler::equal, special_case);
2881
2882  // handle normal case
2883  bind(normal_case);
2884  cdql();
2885  int idivl_offset = offset();
2886  idivl(reg);
2887
2888  // normal and special case exit
2889  bind(special_case);
2890
2891  return idivl_offset;
2892}
2893
2894
2895
2896void MacroAssembler::decrementl(Register reg, int value) {
2897  if (value == min_jint) {subl(reg, value) ; return; }
2898  if (value <  0) { incrementl(reg, -value); return; }
2899  if (value == 0) {                        ; return; }
2900  if (value == 1 && UseIncDec) { decl(reg) ; return; }
2901  /* else */      { subl(reg, value)       ; return; }
2902}
2903
2904void MacroAssembler::decrementl(Address dst, int value) {
2905  if (value == min_jint) {subl(dst, value) ; return; }
2906  if (value <  0) { incrementl(dst, -value); return; }
2907  if (value == 0) {                        ; return; }
2908  if (value == 1 && UseIncDec) { decl(dst) ; return; }
2909  /* else */      { subl(dst, value)       ; return; }
2910}
2911
2912void MacroAssembler::division_with_shift (Register reg, int shift_value) {
2913  assert (shift_value > 0, "illegal shift value");
2914  Label _is_positive;
2915  testl (reg, reg);
2916  jcc (Assembler::positive, _is_positive);
2917  int offset = (1 << shift_value) - 1 ;
2918
2919  if (offset == 1) {
2920    incrementl(reg);
2921  } else {
2922    addl(reg, offset);
2923  }
2924
2925  bind (_is_positive);
2926  sarl(reg, shift_value);
2927}
2928
2929void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src) {
2930  if (reachable(src)) {
2931    Assembler::divsd(dst, as_Address(src));
2932  } else {
2933    lea(rscratch1, src);
2934    Assembler::divsd(dst, Address(rscratch1, 0));
2935  }
2936}
2937
2938void MacroAssembler::divss(XMMRegister dst, AddressLiteral src) {
2939  if (reachable(src)) {
2940    Assembler::divss(dst, as_Address(src));
2941  } else {
2942    lea(rscratch1, src);
2943    Assembler::divss(dst, Address(rscratch1, 0));
2944  }
2945}
2946
2947// !defined(COMPILER2) is because of stupid core builds
2948#if !defined(_LP64) || defined(COMPILER1) || !defined(COMPILER2)
2949void MacroAssembler::empty_FPU_stack() {
2950  if (VM_Version::supports_mmx()) {
2951    emms();
2952  } else {
2953    for (int i = 8; i-- > 0; ) ffree(i);
2954  }
2955}
2956#endif // !LP64 || C1 || !C2
2957
2958
2959// Defines obj, preserves var_size_in_bytes
2960void MacroAssembler::eden_allocate(Register obj,
2961                                   Register var_size_in_bytes,
2962                                   int con_size_in_bytes,
2963                                   Register t1,
2964                                   Label& slow_case) {
2965  assert(obj == rax, "obj must be in rax, for cmpxchg");
2966  assert_different_registers(obj, var_size_in_bytes, t1);
2967  if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) {
2968    jmp(slow_case);
2969  } else {
2970    Register end = t1;
2971    Label retry;
2972    bind(retry);
2973    ExternalAddress heap_top((address) Universe::heap()->top_addr());
2974    movptr(obj, heap_top);
2975    if (var_size_in_bytes == noreg) {
2976      lea(end, Address(obj, con_size_in_bytes));
2977    } else {
2978      lea(end, Address(obj, var_size_in_bytes, Address::times_1));
2979    }
2980    // if end < obj then we wrapped around => object too long => slow case
2981    cmpptr(end, obj);
2982    jcc(Assembler::below, slow_case);
2983    cmpptr(end, ExternalAddress((address) Universe::heap()->end_addr()));
2984    jcc(Assembler::above, slow_case);
2985    // Compare obj with the top addr, and if still equal, store the new top addr in
2986    // end at the address of the top addr pointer. Sets ZF if was equal, and clears
2987    // it otherwise. Use lock prefix for atomicity on MPs.
2988    locked_cmpxchgptr(end, heap_top);
2989    jcc(Assembler::notEqual, retry);
2990  }
2991}
2992
2993void MacroAssembler::enter() {
2994  push(rbp);
2995  mov(rbp, rsp);
2996}
2997
2998// A 5 byte nop that is safe for patching (see patch_verified_entry)
2999void MacroAssembler::fat_nop() {
3000  if (UseAddressNop) {
3001    addr_nop_5();
3002  } else {
3003    emit_int8(0x26); // es:
3004    emit_int8(0x2e); // cs:
3005    emit_int8(0x64); // fs:
3006    emit_int8(0x65); // gs:
3007    emit_int8((unsigned char)0x90);
3008  }
3009}
3010
3011void MacroAssembler::fcmp(Register tmp) {
3012  fcmp(tmp, 1, true, true);
3013}
3014
3015void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) {
3016  assert(!pop_right || pop_left, "usage error");
3017  if (VM_Version::supports_cmov()) {
3018    assert(tmp == noreg, "unneeded temp");
3019    if (pop_left) {
3020      fucomip(index);
3021    } else {
3022      fucomi(index);
3023    }
3024    if (pop_right) {
3025      fpop();
3026    }
3027  } else {
3028    assert(tmp != noreg, "need temp");
3029    if (pop_left) {
3030      if (pop_right) {
3031        fcompp();
3032      } else {
3033        fcomp(index);
3034      }
3035    } else {
3036      fcom(index);
3037    }
3038    // convert FPU condition into eflags condition via rax,
3039    save_rax(tmp);
3040    fwait(); fnstsw_ax();
3041    sahf();
3042    restore_rax(tmp);
3043  }
3044  // condition codes set as follows:
3045  //
3046  // CF (corresponds to C0) if x < y
3047  // PF (corresponds to C2) if unordered
3048  // ZF (corresponds to C3) if x = y
3049}
3050
3051void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) {
3052  fcmp2int(dst, unordered_is_less, 1, true, true);
3053}
3054
3055void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) {
3056  fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right);
3057  Label L;
3058  if (unordered_is_less) {
3059    movl(dst, -1);
3060    jcc(Assembler::parity, L);
3061    jcc(Assembler::below , L);
3062    movl(dst, 0);
3063    jcc(Assembler::equal , L);
3064    increment(dst);
3065  } else { // unordered is greater
3066    movl(dst, 1);
3067    jcc(Assembler::parity, L);
3068    jcc(Assembler::above , L);
3069    movl(dst, 0);
3070    jcc(Assembler::equal , L);
3071    decrementl(dst);
3072  }
3073  bind(L);
3074}
3075
3076void MacroAssembler::fld_d(AddressLiteral src) {
3077  fld_d(as_Address(src));
3078}
3079
3080void MacroAssembler::fld_s(AddressLiteral src) {
3081  fld_s(as_Address(src));
3082}
3083
3084void MacroAssembler::fld_x(AddressLiteral src) {
3085  Assembler::fld_x(as_Address(src));
3086}
3087
3088void MacroAssembler::fldcw(AddressLiteral src) {
3089  Assembler::fldcw(as_Address(src));
3090}
3091
3092void MacroAssembler::pow_exp_core_encoding() {
3093  // kills rax, rcx, rdx
3094  subptr(rsp,sizeof(jdouble));
3095  // computes 2^X. Stack: X ...
3096  // f2xm1 computes 2^X-1 but only operates on -1<=X<=1. Get int(X) and
3097  // keep it on the thread's stack to compute 2^int(X) later
3098  // then compute 2^(X-int(X)) as (2^(X-int(X)-1+1)
3099  // final result is obtained with: 2^X = 2^int(X) * 2^(X-int(X))
3100  fld_s(0);                 // Stack: X X ...
3101  frndint();                // Stack: int(X) X ...
3102  fsuba(1);                 // Stack: int(X) X-int(X) ...
3103  fistp_s(Address(rsp,0));  // move int(X) as integer to thread's stack. Stack: X-int(X) ...
3104  f2xm1();                  // Stack: 2^(X-int(X))-1 ...
3105  fld1();                   // Stack: 1 2^(X-int(X))-1 ...
3106  faddp(1);                 // Stack: 2^(X-int(X))
3107  // computes 2^(int(X)): add exponent bias (1023) to int(X), then
3108  // shift int(X)+1023 to exponent position.
3109  // Exponent is limited to 11 bits if int(X)+1023 does not fit in 11
3110  // bits, set result to NaN. 0x000 and 0x7FF are reserved exponent
3111  // values so detect them and set result to NaN.
3112  movl(rax,Address(rsp,0));
3113  movl(rcx, -2048); // 11 bit mask and valid NaN binary encoding
3114  addl(rax, 1023);
3115  movl(rdx,rax);
3116  shll(rax,20);
3117  // Check that 0 < int(X)+1023 < 2047. Otherwise set rax to NaN.
3118  addl(rdx,1);
3119  // Check that 1 < int(X)+1023+1 < 2048
3120  // in 3 steps:
3121  // 1- (int(X)+1023+1)&-2048 == 0 => 0 <= int(X)+1023+1 < 2048
3122  // 2- (int(X)+1023+1)&-2048 != 0
3123  // 3- (int(X)+1023+1)&-2048 != 1
3124  // Do 2- first because addl just updated the flags.
3125  cmov32(Assembler::equal,rax,rcx);
3126  cmpl(rdx,1);
3127  cmov32(Assembler::equal,rax,rcx);
3128  testl(rdx,rcx);
3129  cmov32(Assembler::notEqual,rax,rcx);
3130  movl(Address(rsp,4),rax);
3131  movl(Address(rsp,0),0);
3132  fmul_d(Address(rsp,0));   // Stack: 2^X ...
3133  addptr(rsp,sizeof(jdouble));
3134}
3135
3136void MacroAssembler::increase_precision() {
3137  subptr(rsp, BytesPerWord);
3138  fnstcw(Address(rsp, 0));
3139  movl(rax, Address(rsp, 0));
3140  orl(rax, 0x300);
3141  push(rax);
3142  fldcw(Address(rsp, 0));
3143  pop(rax);
3144}
3145
3146void MacroAssembler::restore_precision() {
3147  fldcw(Address(rsp, 0));
3148  addptr(rsp, BytesPerWord);
3149}
3150
3151void MacroAssembler::fast_pow() {
3152  // computes X^Y = 2^(Y * log2(X))
3153  // if fast computation is not possible, result is NaN. Requires
3154  // fallback from user of this macro.
3155  // increase precision for intermediate steps of the computation
3156  BLOCK_COMMENT("fast_pow {");
3157  increase_precision();
3158  fyl2x();                 // Stack: (Y*log2(X)) ...
3159  pow_exp_core_encoding(); // Stack: exp(X) ...
3160  restore_precision();
3161  BLOCK_COMMENT("} fast_pow");
3162}
3163
3164void MacroAssembler::fast_exp() {
3165  // computes exp(X) = 2^(X * log2(e))
3166  // if fast computation is not possible, result is NaN. Requires
3167  // fallback from user of this macro.
3168  // increase precision for intermediate steps of the computation
3169  increase_precision();
3170  fldl2e();                // Stack: log2(e) X ...
3171  fmulp(1);                // Stack: (X*log2(e)) ...
3172  pow_exp_core_encoding(); // Stack: exp(X) ...
3173  restore_precision();
3174}
3175
3176void MacroAssembler::pow_or_exp(bool is_exp, int num_fpu_regs_in_use) {
3177  // kills rax, rcx, rdx
3178  // pow and exp needs 2 extra registers on the fpu stack.
3179  Label slow_case, done;
3180  Register tmp = noreg;
3181  if (!VM_Version::supports_cmov()) {
3182    // fcmp needs a temporary so preserve rdx,
3183    tmp = rdx;
3184  }
3185  Register tmp2 = rax;
3186  Register tmp3 = rcx;
3187
3188  if (is_exp) {
3189    // Stack: X
3190    fld_s(0);                   // duplicate argument for runtime call. Stack: X X
3191    fast_exp();                 // Stack: exp(X) X
3192    fcmp(tmp, 0, false, false); // Stack: exp(X) X
3193    // exp(X) not equal to itself: exp(X) is NaN go to slow case.
3194    jcc(Assembler::parity, slow_case);
3195    // get rid of duplicate argument. Stack: exp(X)
3196    if (num_fpu_regs_in_use > 0) {
3197      fxch();
3198      fpop();
3199    } else {
3200      ffree(1);
3201    }
3202    jmp(done);
3203  } else {
3204    // Stack: X Y
3205    Label x_negative, y_odd;
3206
3207    fldz();                     // Stack: 0 X Y
3208    fcmp(tmp, 1, true, false);  // Stack: X Y
3209    jcc(Assembler::above, x_negative);
3210
3211    // X >= 0
3212
3213    fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
3214    fld_s(1);                   // Stack: X Y X Y
3215    fast_pow();                 // Stack: X^Y X Y
3216    fcmp(tmp, 0, false, false); // Stack: X^Y X Y
3217    // X^Y not equal to itself: X^Y is NaN go to slow case.
3218    jcc(Assembler::parity, slow_case);
3219    // get rid of duplicate arguments. Stack: X^Y
3220    if (num_fpu_regs_in_use > 0) {
3221      fxch(); fpop();
3222      fxch(); fpop();
3223    } else {
3224      ffree(2);
3225      ffree(1);
3226    }
3227    jmp(done);
3228
3229    // X <= 0
3230    bind(x_negative);
3231
3232    fld_s(1);                   // Stack: Y X Y
3233    frndint();                  // Stack: int(Y) X Y
3234    fcmp(tmp, 2, false, false); // Stack: int(Y) X Y
3235    jcc(Assembler::notEqual, slow_case);
3236
3237    subptr(rsp, 8);
3238
3239    // For X^Y, when X < 0, Y has to be an integer and the final
3240    // result depends on whether it's odd or even. We just checked
3241    // that int(Y) == Y.  We move int(Y) to gp registers as a 64 bit
3242    // integer to test its parity. If int(Y) is huge and doesn't fit
3243    // in the 64 bit integer range, the integer indefinite value will
3244    // end up in the gp registers. Huge numbers are all even, the
3245    // integer indefinite number is even so it's fine.
3246
3247#ifdef ASSERT
3248    // Let's check we don't end up with an integer indefinite number
3249    // when not expected. First test for huge numbers: check whether
3250    // int(Y)+1 == int(Y) which is true for very large numbers and
3251    // those are all even. A 64 bit integer is guaranteed to not
3252    // overflow for numbers where y+1 != y (when precision is set to
3253    // double precision).
3254    Label y_not_huge;
3255
3256    fld1();                     // Stack: 1 int(Y) X Y
3257    fadd(1);                    // Stack: 1+int(Y) int(Y) X Y
3258
3259#ifdef _LP64
3260    // trip to memory to force the precision down from double extended
3261    // precision
3262    fstp_d(Address(rsp, 0));
3263    fld_d(Address(rsp, 0));
3264#endif
3265
3266    fcmp(tmp, 1, true, false);  // Stack: int(Y) X Y
3267#endif
3268
3269    // move int(Y) as 64 bit integer to thread's stack
3270    fistp_d(Address(rsp,0));    // Stack: X Y
3271
3272#ifdef ASSERT
3273    jcc(Assembler::notEqual, y_not_huge);
3274
3275    // Y is huge so we know it's even. It may not fit in a 64 bit
3276    // integer and we don't want the debug code below to see the
3277    // integer indefinite value so overwrite int(Y) on the thread's
3278    // stack with 0.
3279    movl(Address(rsp, 0), 0);
3280    movl(Address(rsp, 4), 0);
3281
3282    bind(y_not_huge);
3283#endif
3284
3285    fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
3286    fld_s(1);                   // Stack: X Y X Y
3287    fabs();                     // Stack: abs(X) Y X Y
3288    fast_pow();                 // Stack: abs(X)^Y X Y
3289    fcmp(tmp, 0, false, false); // Stack: abs(X)^Y X Y
3290    // abs(X)^Y not equal to itself: abs(X)^Y is NaN go to slow case.
3291
3292    pop(tmp2);
3293    NOT_LP64(pop(tmp3));
3294    jcc(Assembler::parity, slow_case);
3295
3296#ifdef ASSERT
3297    // Check that int(Y) is not integer indefinite value (int
3298    // overflow). Shouldn't happen because for values that would
3299    // overflow, 1+int(Y)==Y which was tested earlier.
3300#ifndef _LP64
3301    {
3302      Label integer;
3303      testl(tmp2, tmp2);
3304      jcc(Assembler::notZero, integer);
3305      cmpl(tmp3, 0x80000000);
3306      jcc(Assembler::notZero, integer);
3307      STOP("integer indefinite value shouldn't be seen here");
3308      bind(integer);
3309    }
3310#else
3311    {
3312      Label integer;
3313      mov(tmp3, tmp2); // preserve tmp2 for parity check below
3314      shlq(tmp3, 1);
3315      jcc(Assembler::carryClear, integer);
3316      jcc(Assembler::notZero, integer);
3317      STOP("integer indefinite value shouldn't be seen here");
3318      bind(integer);
3319    }
3320#endif
3321#endif
3322
3323    // get rid of duplicate arguments. Stack: X^Y
3324    if (num_fpu_regs_in_use > 0) {
3325      fxch(); fpop();
3326      fxch(); fpop();
3327    } else {
3328      ffree(2);
3329      ffree(1);
3330    }
3331
3332    testl(tmp2, 1);
3333    jcc(Assembler::zero, done); // X <= 0, Y even: X^Y = abs(X)^Y
3334    // X <= 0, Y even: X^Y = -abs(X)^Y
3335
3336    fchs();                     // Stack: -abs(X)^Y Y
3337    jmp(done);
3338  }
3339
3340  // slow case: runtime call
3341  bind(slow_case);
3342
3343  fpop();                       // pop incorrect result or int(Y)
3344
3345  fp_runtime_fallback(is_exp ? CAST_FROM_FN_PTR(address, SharedRuntime::dexp) : CAST_FROM_FN_PTR(address, SharedRuntime::dpow),
3346                      is_exp ? 1 : 2, num_fpu_regs_in_use);
3347
3348  // Come here with result in F-TOS
3349  bind(done);
3350}
3351
3352void MacroAssembler::fpop() {
3353  ffree();
3354  fincstp();
3355}
3356
3357void MacroAssembler::fremr(Register tmp) {
3358  save_rax(tmp);
3359  { Label L;
3360    bind(L);
3361    fprem();
3362    fwait(); fnstsw_ax();
3363#ifdef _LP64
3364    testl(rax, 0x400);
3365    jcc(Assembler::notEqual, L);
3366#else
3367    sahf();
3368    jcc(Assembler::parity, L);
3369#endif // _LP64
3370  }
3371  restore_rax(tmp);
3372  // Result is in ST0.
3373  // Note: fxch & fpop to get rid of ST1
3374  // (otherwise FPU stack could overflow eventually)
3375  fxch(1);
3376  fpop();
3377}
3378
3379
3380void MacroAssembler::incrementl(AddressLiteral dst) {
3381  if (reachable(dst)) {
3382    incrementl(as_Address(dst));
3383  } else {
3384    lea(rscratch1, dst);
3385    incrementl(Address(rscratch1, 0));
3386  }
3387}
3388
3389void MacroAssembler::incrementl(ArrayAddress dst) {
3390  incrementl(as_Address(dst));
3391}
3392
3393void MacroAssembler::incrementl(Register reg, int value) {
3394  if (value == min_jint) {addl(reg, value) ; return; }
3395  if (value <  0) { decrementl(reg, -value); return; }
3396  if (value == 0) {                        ; return; }
3397  if (value == 1 && UseIncDec) { incl(reg) ; return; }
3398  /* else */      { addl(reg, value)       ; return; }
3399}
3400
3401void MacroAssembler::incrementl(Address dst, int value) {
3402  if (value == min_jint) {addl(dst, value) ; return; }
3403  if (value <  0) { decrementl(dst, -value); return; }
3404  if (value == 0) {                        ; return; }
3405  if (value == 1 && UseIncDec) { incl(dst) ; return; }
3406  /* else */      { addl(dst, value)       ; return; }
3407}
3408
3409void MacroAssembler::jump(AddressLiteral dst) {
3410  if (reachable(dst)) {
3411    jmp_literal(dst.target(), dst.rspec());
3412  } else {
3413    lea(rscratch1, dst);
3414    jmp(rscratch1);
3415  }
3416}
3417
3418void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst) {
3419  if (reachable(dst)) {
3420    InstructionMark im(this);
3421    relocate(dst.reloc());
3422    const int short_size = 2;
3423    const int long_size = 6;
3424    int offs = (intptr_t)dst.target() - ((intptr_t)pc());
3425    if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
3426      // 0111 tttn #8-bit disp
3427      emit_int8(0x70 | cc);
3428      emit_int8((offs - short_size) & 0xFF);
3429    } else {
3430      // 0000 1111 1000 tttn #32-bit disp
3431      emit_int8(0x0F);
3432      emit_int8((unsigned char)(0x80 | cc));
3433      emit_int32(offs - long_size);
3434    }
3435  } else {
3436#ifdef ASSERT
3437    warning("reversing conditional branch");
3438#endif /* ASSERT */
3439    Label skip;
3440    jccb(reverse[cc], skip);
3441    lea(rscratch1, dst);
3442    Assembler::jmp(rscratch1);
3443    bind(skip);
3444  }
3445}
3446
3447void MacroAssembler::ldmxcsr(AddressLiteral src) {
3448  if (reachable(src)) {
3449    Assembler::ldmxcsr(as_Address(src));
3450  } else {
3451    lea(rscratch1, src);
3452    Assembler::ldmxcsr(Address(rscratch1, 0));
3453  }
3454}
3455
3456int MacroAssembler::load_signed_byte(Register dst, Address src) {
3457  int off;
3458  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3459    off = offset();
3460    movsbl(dst, src); // movsxb
3461  } else {
3462    off = load_unsigned_byte(dst, src);
3463    shll(dst, 24);
3464    sarl(dst, 24);
3465  }
3466  return off;
3467}
3468
3469// Note: load_signed_short used to be called load_signed_word.
3470// Although the 'w' in x86 opcodes refers to the term "word" in the assembler
3471// manual, which means 16 bits, that usage is found nowhere in HotSpot code.
3472// The term "word" in HotSpot means a 32- or 64-bit machine word.
3473int MacroAssembler::load_signed_short(Register dst, Address src) {
3474  int off;
3475  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3476    // This is dubious to me since it seems safe to do a signed 16 => 64 bit
3477    // version but this is what 64bit has always done. This seems to imply
3478    // that users are only using 32bits worth.
3479    off = offset();
3480    movswl(dst, src); // movsxw
3481  } else {
3482    off = load_unsigned_short(dst, src);
3483    shll(dst, 16);
3484    sarl(dst, 16);
3485  }
3486  return off;
3487}
3488
3489int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
3490  // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
3491  // and "3.9 Partial Register Penalties", p. 22).
3492  int off;
3493  if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) {
3494    off = offset();
3495    movzbl(dst, src); // movzxb
3496  } else {
3497    xorl(dst, dst);
3498    off = offset();
3499    movb(dst, src);
3500  }
3501  return off;
3502}
3503
3504// Note: load_unsigned_short used to be called load_unsigned_word.
3505int MacroAssembler::load_unsigned_short(Register dst, Address src) {
3506  // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
3507  // and "3.9 Partial Register Penalties", p. 22).
3508  int off;
3509  if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
3510    off = offset();
3511    movzwl(dst, src); // movzxw
3512  } else {
3513    xorl(dst, dst);
3514    off = offset();
3515    movw(dst, src);
3516  }
3517  return off;
3518}
3519
3520void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
3521  switch (size_in_bytes) {
3522#ifndef _LP64
3523  case  8:
3524    assert(dst2 != noreg, "second dest register required");
3525    movl(dst,  src);
3526    movl(dst2, src.plus_disp(BytesPerInt));
3527    break;
3528#else
3529  case  8:  movq(dst, src); break;
3530#endif
3531  case  4:  movl(dst, src); break;
3532  case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
3533  case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
3534  default:  ShouldNotReachHere();
3535  }
3536}
3537
3538void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
3539  switch (size_in_bytes) {
3540#ifndef _LP64
3541  case  8:
3542    assert(src2 != noreg, "second source register required");
3543    movl(dst,                        src);
3544    movl(dst.plus_disp(BytesPerInt), src2);
3545    break;
3546#else
3547  case  8:  movq(dst, src); break;
3548#endif
3549  case  4:  movl(dst, src); break;
3550  case  2:  movw(dst, src); break;
3551  case  1:  movb(dst, src); break;
3552  default:  ShouldNotReachHere();
3553  }
3554}
3555
3556void MacroAssembler::mov32(AddressLiteral dst, Register src) {
3557  if (reachable(dst)) {
3558    movl(as_Address(dst), src);
3559  } else {
3560    lea(rscratch1, dst);
3561    movl(Address(rscratch1, 0), src);
3562  }
3563}
3564
3565void MacroAssembler::mov32(Register dst, AddressLiteral src) {
3566  if (reachable(src)) {
3567    movl(dst, as_Address(src));
3568  } else {
3569    lea(rscratch1, src);
3570    movl(dst, Address(rscratch1, 0));
3571  }
3572}
3573
3574// C++ bool manipulation
3575
3576void MacroAssembler::movbool(Register dst, Address src) {
3577  if(sizeof(bool) == 1)
3578    movb(dst, src);
3579  else if(sizeof(bool) == 2)
3580    movw(dst, src);
3581  else if(sizeof(bool) == 4)
3582    movl(dst, src);
3583  else
3584    // unsupported
3585    ShouldNotReachHere();
3586}
3587
3588void MacroAssembler::movbool(Address dst, bool boolconst) {
3589  if(sizeof(bool) == 1)
3590    movb(dst, (int) boolconst);
3591  else if(sizeof(bool) == 2)
3592    movw(dst, (int) boolconst);
3593  else if(sizeof(bool) == 4)
3594    movl(dst, (int) boolconst);
3595  else
3596    // unsupported
3597    ShouldNotReachHere();
3598}
3599
3600void MacroAssembler::movbool(Address dst, Register src) {
3601  if(sizeof(bool) == 1)
3602    movb(dst, src);
3603  else if(sizeof(bool) == 2)
3604    movw(dst, src);
3605  else if(sizeof(bool) == 4)
3606    movl(dst, src);
3607  else
3608    // unsupported
3609    ShouldNotReachHere();
3610}
3611
3612void MacroAssembler::movbyte(ArrayAddress dst, int src) {
3613  movb(as_Address(dst), src);
3614}
3615
3616void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src) {
3617  if (reachable(src)) {
3618    movdl(dst, as_Address(src));
3619  } else {
3620    lea(rscratch1, src);
3621    movdl(dst, Address(rscratch1, 0));
3622  }
3623}
3624
3625void MacroAssembler::movq(XMMRegister dst, AddressLiteral src) {
3626  if (reachable(src)) {
3627    movq(dst, as_Address(src));
3628  } else {
3629    lea(rscratch1, src);
3630    movq(dst, Address(rscratch1, 0));
3631  }
3632}
3633
3634void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) {
3635  if (reachable(src)) {
3636    if (UseXmmLoadAndClearUpper) {
3637      movsd (dst, as_Address(src));
3638    } else {
3639      movlpd(dst, as_Address(src));
3640    }
3641  } else {
3642    lea(rscratch1, src);
3643    if (UseXmmLoadAndClearUpper) {
3644      movsd (dst, Address(rscratch1, 0));
3645    } else {
3646      movlpd(dst, Address(rscratch1, 0));
3647    }
3648  }
3649}
3650
3651void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src) {
3652  if (reachable(src)) {
3653    movss(dst, as_Address(src));
3654  } else {
3655    lea(rscratch1, src);
3656    movss(dst, Address(rscratch1, 0));
3657  }
3658}
3659
3660void MacroAssembler::movptr(Register dst, Register src) {
3661  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3662}
3663
3664void MacroAssembler::movptr(Register dst, Address src) {
3665  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3666}
3667
3668// src should NEVER be a real pointer. Use AddressLiteral for true pointers
3669void MacroAssembler::movptr(Register dst, intptr_t src) {
3670  LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
3671}
3672
3673void MacroAssembler::movptr(Address dst, Register src) {
3674  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3675}
3676
3677void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src) {
3678  if (reachable(src)) {
3679    Assembler::movdqu(dst, as_Address(src));
3680  } else {
3681    lea(rscratch1, src);
3682    Assembler::movdqu(dst, Address(rscratch1, 0));
3683  }
3684}
3685
3686void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) {
3687  if (reachable(src)) {
3688    Assembler::movdqa(dst, as_Address(src));
3689  } else {
3690    lea(rscratch1, src);
3691    Assembler::movdqa(dst, Address(rscratch1, 0));
3692  }
3693}
3694
3695void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
3696  if (reachable(src)) {
3697    Assembler::movsd(dst, as_Address(src));
3698  } else {
3699    lea(rscratch1, src);
3700    Assembler::movsd(dst, Address(rscratch1, 0));
3701  }
3702}
3703
3704void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
3705  if (reachable(src)) {
3706    Assembler::movss(dst, as_Address(src));
3707  } else {
3708    lea(rscratch1, src);
3709    Assembler::movss(dst, Address(rscratch1, 0));
3710  }
3711}
3712
3713void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) {
3714  if (reachable(src)) {
3715    Assembler::mulsd(dst, as_Address(src));
3716  } else {
3717    lea(rscratch1, src);
3718    Assembler::mulsd(dst, Address(rscratch1, 0));
3719  }
3720}
3721
3722void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src) {
3723  if (reachable(src)) {
3724    Assembler::mulss(dst, as_Address(src));
3725  } else {
3726    lea(rscratch1, src);
3727    Assembler::mulss(dst, Address(rscratch1, 0));
3728  }
3729}
3730
3731void MacroAssembler::null_check(Register reg, int offset) {
3732  if (needs_explicit_null_check(offset)) {
3733    // provoke OS NULL exception if reg = NULL by
3734    // accessing M[reg] w/o changing any (non-CC) registers
3735    // NOTE: cmpl is plenty here to provoke a segv
3736    cmpptr(rax, Address(reg, 0));
3737    // Note: should probably use testl(rax, Address(reg, 0));
3738    //       may be shorter code (however, this version of
3739    //       testl needs to be implemented first)
3740  } else {
3741    // nothing to do, (later) access of M[reg + offset]
3742    // will provoke OS NULL exception if reg = NULL
3743  }
3744}
3745
3746void MacroAssembler::os_breakpoint() {
3747  // instead of directly emitting a breakpoint, call os:breakpoint for better debugability
3748  // (e.g., MSVC can't call ps() otherwise)
3749  call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
3750}
3751
3752void MacroAssembler::pop_CPU_state() {
3753  pop_FPU_state();
3754  pop_IU_state();
3755}
3756
3757void MacroAssembler::pop_FPU_state() {
3758  NOT_LP64(frstor(Address(rsp, 0));)
3759  LP64_ONLY(fxrstor(Address(rsp, 0));)
3760  addptr(rsp, FPUStateSizeInWords * wordSize);
3761}
3762
3763void MacroAssembler::pop_IU_state() {
3764  popa();
3765  LP64_ONLY(addq(rsp, 8));
3766  popf();
3767}
3768
3769// Save Integer and Float state
3770// Warning: Stack must be 16 byte aligned (64bit)
3771void MacroAssembler::push_CPU_state() {
3772  push_IU_state();
3773  push_FPU_state();
3774}
3775
3776void MacroAssembler::push_FPU_state() {
3777  subptr(rsp, FPUStateSizeInWords * wordSize);
3778#ifndef _LP64
3779  fnsave(Address(rsp, 0));
3780  fwait();
3781#else
3782  fxsave(Address(rsp, 0));
3783#endif // LP64
3784}
3785
3786void MacroAssembler::push_IU_state() {
3787  // Push flags first because pusha kills them
3788  pushf();
3789  // Make sure rsp stays 16-byte aligned
3790  LP64_ONLY(subq(rsp, 8));
3791  pusha();
3792}
3793
3794void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp, bool clear_pc) {
3795  // determine java_thread register
3796  if (!java_thread->is_valid()) {
3797    java_thread = rdi;
3798    get_thread(java_thread);
3799  }
3800  // we must set sp to zero to clear frame
3801  movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
3802  if (clear_fp) {
3803    movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
3804  }
3805
3806  if (clear_pc)
3807    movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
3808
3809}
3810
3811void MacroAssembler::restore_rax(Register tmp) {
3812  if (tmp == noreg) pop(rax);
3813  else if (tmp != rax) mov(rax, tmp);
3814}
3815
3816void MacroAssembler::round_to(Register reg, int modulus) {
3817  addptr(reg, modulus - 1);
3818  andptr(reg, -modulus);
3819}
3820
3821void MacroAssembler::save_rax(Register tmp) {
3822  if (tmp == noreg) push(rax);
3823  else if (tmp != rax) mov(tmp, rax);
3824}
3825
3826// Write serialization page so VM thread can do a pseudo remote membar.
3827// We use the current thread pointer to calculate a thread specific
3828// offset to write to within the page. This minimizes bus traffic
3829// due to cache line collision.
3830void MacroAssembler::serialize_memory(Register thread, Register tmp) {
3831  movl(tmp, thread);
3832  shrl(tmp, os::get_serialize_page_shift_count());
3833  andl(tmp, (os::vm_page_size() - sizeof(int)));
3834
3835  Address index(noreg, tmp, Address::times_1);
3836  ExternalAddress page(os::get_memory_serialize_page());
3837
3838  // Size of store must match masking code above
3839  movl(as_Address(ArrayAddress(page, index)), tmp);
3840}
3841
3842// Calls to C land
3843//
3844// When entering C land, the rbp, & rsp of the last Java frame have to be recorded
3845// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
3846// has to be reset to 0. This is required to allow proper stack traversal.
3847void MacroAssembler::set_last_Java_frame(Register java_thread,
3848                                         Register last_java_sp,
3849                                         Register last_java_fp,
3850                                         address  last_java_pc) {
3851  // determine java_thread register
3852  if (!java_thread->is_valid()) {
3853    java_thread = rdi;
3854    get_thread(java_thread);
3855  }
3856  // determine last_java_sp register
3857  if (!last_java_sp->is_valid()) {
3858    last_java_sp = rsp;
3859  }
3860
3861  // last_java_fp is optional
3862
3863  if (last_java_fp->is_valid()) {
3864    movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
3865  }
3866
3867  // last_java_pc is optional
3868
3869  if (last_java_pc != NULL) {
3870    lea(Address(java_thread,
3871                 JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()),
3872        InternalAddress(last_java_pc));
3873
3874  }
3875  movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
3876}
3877
3878void MacroAssembler::shlptr(Register dst, int imm8) {
3879  LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8));
3880}
3881
3882void MacroAssembler::shrptr(Register dst, int imm8) {
3883  LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8));
3884}
3885
3886void MacroAssembler::sign_extend_byte(Register reg) {
3887  if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) {
3888    movsbl(reg, reg); // movsxb
3889  } else {
3890    shll(reg, 24);
3891    sarl(reg, 24);
3892  }
3893}
3894
3895void MacroAssembler::sign_extend_short(Register reg) {
3896  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3897    movswl(reg, reg); // movsxw
3898  } else {
3899    shll(reg, 16);
3900    sarl(reg, 16);
3901  }
3902}
3903
3904void MacroAssembler::testl(Register dst, AddressLiteral src) {
3905  assert(reachable(src), "Address should be reachable");
3906  testl(dst, as_Address(src));
3907}
3908
3909void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
3910  if (reachable(src)) {
3911    Assembler::sqrtsd(dst, as_Address(src));
3912  } else {
3913    lea(rscratch1, src);
3914    Assembler::sqrtsd(dst, Address(rscratch1, 0));
3915  }
3916}
3917
3918void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
3919  if (reachable(src)) {
3920    Assembler::sqrtss(dst, as_Address(src));
3921  } else {
3922    lea(rscratch1, src);
3923    Assembler::sqrtss(dst, Address(rscratch1, 0));
3924  }
3925}
3926
3927void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src) {
3928  if (reachable(src)) {
3929    Assembler::subsd(dst, as_Address(src));
3930  } else {
3931    lea(rscratch1, src);
3932    Assembler::subsd(dst, Address(rscratch1, 0));
3933  }
3934}
3935
3936void MacroAssembler::subss(XMMRegister dst, AddressLiteral src) {
3937  if (reachable(src)) {
3938    Assembler::subss(dst, as_Address(src));
3939  } else {
3940    lea(rscratch1, src);
3941    Assembler::subss(dst, Address(rscratch1, 0));
3942  }
3943}
3944
3945void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
3946  if (reachable(src)) {
3947    Assembler::ucomisd(dst, as_Address(src));
3948  } else {
3949    lea(rscratch1, src);
3950    Assembler::ucomisd(dst, Address(rscratch1, 0));
3951  }
3952}
3953
3954void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
3955  if (reachable(src)) {
3956    Assembler::ucomiss(dst, as_Address(src));
3957  } else {
3958    lea(rscratch1, src);
3959    Assembler::ucomiss(dst, Address(rscratch1, 0));
3960  }
3961}
3962
3963void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src) {
3964  // Used in sign-bit flipping with aligned address.
3965  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3966  if (reachable(src)) {
3967    Assembler::xorpd(dst, as_Address(src));
3968  } else {
3969    lea(rscratch1, src);
3970    Assembler::xorpd(dst, Address(rscratch1, 0));
3971  }
3972}
3973
3974void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src) {
3975  // Used in sign-bit flipping with aligned address.
3976  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3977  if (reachable(src)) {
3978    Assembler::xorps(dst, as_Address(src));
3979  } else {
3980    lea(rscratch1, src);
3981    Assembler::xorps(dst, Address(rscratch1, 0));
3982  }
3983}
3984
3985void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) {
3986  // Used in sign-bit flipping with aligned address.
3987  bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
3988  assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
3989  if (reachable(src)) {
3990    Assembler::pshufb(dst, as_Address(src));
3991  } else {
3992    lea(rscratch1, src);
3993    Assembler::pshufb(dst, Address(rscratch1, 0));
3994  }
3995}
3996
3997// AVX 3-operands instructions
3998
3999void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4000  if (reachable(src)) {
4001    vaddsd(dst, nds, as_Address(src));
4002  } else {
4003    lea(rscratch1, src);
4004    vaddsd(dst, nds, Address(rscratch1, 0));
4005  }
4006}
4007
4008void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4009  if (reachable(src)) {
4010    vaddss(dst, nds, as_Address(src));
4011  } else {
4012    lea(rscratch1, src);
4013    vaddss(dst, nds, Address(rscratch1, 0));
4014  }
4015}
4016
4017void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
4018  if (reachable(src)) {
4019    vandpd(dst, nds, as_Address(src), vector256);
4020  } else {
4021    lea(rscratch1, src);
4022    vandpd(dst, nds, Address(rscratch1, 0), vector256);
4023  }
4024}
4025
4026void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
4027  if (reachable(src)) {
4028    vandps(dst, nds, as_Address(src), vector256);
4029  } else {
4030    lea(rscratch1, src);
4031    vandps(dst, nds, Address(rscratch1, 0), vector256);
4032  }
4033}
4034
4035void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4036  if (reachable(src)) {
4037    vdivsd(dst, nds, as_Address(src));
4038  } else {
4039    lea(rscratch1, src);
4040    vdivsd(dst, nds, Address(rscratch1, 0));
4041  }
4042}
4043
4044void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4045  if (reachable(src)) {
4046    vdivss(dst, nds, as_Address(src));
4047  } else {
4048    lea(rscratch1, src);
4049    vdivss(dst, nds, Address(rscratch1, 0));
4050  }
4051}
4052
4053void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4054  if (reachable(src)) {
4055    vmulsd(dst, nds, as_Address(src));
4056  } else {
4057    lea(rscratch1, src);
4058    vmulsd(dst, nds, Address(rscratch1, 0));
4059  }
4060}
4061
4062void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4063  if (reachable(src)) {
4064    vmulss(dst, nds, as_Address(src));
4065  } else {
4066    lea(rscratch1, src);
4067    vmulss(dst, nds, Address(rscratch1, 0));
4068  }
4069}
4070
4071void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4072  if (reachable(src)) {
4073    vsubsd(dst, nds, as_Address(src));
4074  } else {
4075    lea(rscratch1, src);
4076    vsubsd(dst, nds, Address(rscratch1, 0));
4077  }
4078}
4079
4080void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4081  if (reachable(src)) {
4082    vsubss(dst, nds, as_Address(src));
4083  } else {
4084    lea(rscratch1, src);
4085    vsubss(dst, nds, Address(rscratch1, 0));
4086  }
4087}
4088
4089void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
4090  if (reachable(src)) {
4091    vxorpd(dst, nds, as_Address(src), vector256);
4092  } else {
4093    lea(rscratch1, src);
4094    vxorpd(dst, nds, Address(rscratch1, 0), vector256);
4095  }
4096}
4097
4098void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
4099  if (reachable(src)) {
4100    vxorps(dst, nds, as_Address(src), vector256);
4101  } else {
4102    lea(rscratch1, src);
4103    vxorps(dst, nds, Address(rscratch1, 0), vector256);
4104  }
4105}
4106
4107
4108//////////////////////////////////////////////////////////////////////////////////
4109#if INCLUDE_ALL_GCS
4110
4111void MacroAssembler::g1_write_barrier_pre(Register obj,
4112                                          Register pre_val,
4113                                          Register thread,
4114                                          Register tmp,
4115                                          bool tosca_live,
4116                                          bool expand_call) {
4117
4118  // If expand_call is true then we expand the call_VM_leaf macro
4119  // directly to skip generating the check by
4120  // InterpreterMacroAssembler::call_VM_leaf_base that checks _last_sp.
4121
4122#ifdef _LP64
4123  assert(thread == r15_thread, "must be");
4124#endif // _LP64
4125
4126  Label done;
4127  Label runtime;
4128
4129  assert(pre_val != noreg, "check this code");
4130
4131  if (obj != noreg) {
4132    assert_different_registers(obj, pre_val, tmp);
4133    assert(pre_val != rax, "check this code");
4134  }
4135
4136  Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
4137                                       PtrQueue::byte_offset_of_active()));
4138  Address index(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
4139                                       PtrQueue::byte_offset_of_index()));
4140  Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
4141                                       PtrQueue::byte_offset_of_buf()));
4142
4143
4144  // Is marking active?
4145  if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
4146    cmpl(in_progress, 0);
4147  } else {
4148    assert(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
4149    cmpb(in_progress, 0);
4150  }
4151  jcc(Assembler::equal, done);
4152
4153  // Do we need to load the previous value?
4154  if (obj != noreg) {
4155    load_heap_oop(pre_val, Address(obj, 0));
4156  }
4157
4158  // Is the previous value null?
4159  cmpptr(pre_val, (int32_t) NULL_WORD);
4160  jcc(Assembler::equal, done);
4161
4162  // Can we store original value in the thread's buffer?
4163  // Is index == 0?
4164  // (The index field is typed as size_t.)
4165
4166  movptr(tmp, index);                   // tmp := *index_adr
4167  cmpptr(tmp, 0);                       // tmp == 0?
4168  jcc(Assembler::equal, runtime);       // If yes, goto runtime
4169
4170  subptr(tmp, wordSize);                // tmp := tmp - wordSize
4171  movptr(index, tmp);                   // *index_adr := tmp
4172  addptr(tmp, buffer);                  // tmp := tmp + *buffer_adr
4173
4174  // Record the previous value
4175  movptr(Address(tmp, 0), pre_val);
4176  jmp(done);
4177
4178  bind(runtime);
4179  // save the live input values
4180  if(tosca_live) push(rax);
4181
4182  if (obj != noreg && obj != rax)
4183    push(obj);
4184
4185  if (pre_val != rax)
4186    push(pre_val);
4187
4188  // Calling the runtime using the regular call_VM_leaf mechanism generates
4189  // code (generated by InterpreterMacroAssember::call_VM_leaf_base)
4190  // that checks that the *(ebp+frame::interpreter_frame_last_sp) == NULL.
4191  //
4192  // If we care generating the pre-barrier without a frame (e.g. in the
4193  // intrinsified Reference.get() routine) then ebp might be pointing to
4194  // the caller frame and so this check will most likely fail at runtime.
4195  //
4196  // Expanding the call directly bypasses the generation of the check.
4197  // So when we do not have have a full interpreter frame on the stack
4198  // expand_call should be passed true.
4199
4200  NOT_LP64( push(thread); )
4201
4202  if (expand_call) {
4203    LP64_ONLY( assert(pre_val != c_rarg1, "smashed arg"); )
4204    pass_arg1(this, thread);
4205    pass_arg0(this, pre_val);
4206    MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), 2);
4207  } else {
4208    call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), pre_val, thread);
4209  }
4210
4211  NOT_LP64( pop(thread); )
4212
4213  // save the live input values
4214  if (pre_val != rax)
4215    pop(pre_val);
4216
4217  if (obj != noreg && obj != rax)
4218    pop(obj);
4219
4220  if(tosca_live) pop(rax);
4221
4222  bind(done);
4223}
4224
4225void MacroAssembler::g1_write_barrier_post(Register store_addr,
4226                                           Register new_val,
4227                                           Register thread,
4228                                           Register tmp,
4229                                           Register tmp2) {
4230#ifdef _LP64
4231  assert(thread == r15_thread, "must be");
4232#endif // _LP64
4233
4234  Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
4235                                       PtrQueue::byte_offset_of_index()));
4236  Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
4237                                       PtrQueue::byte_offset_of_buf()));
4238
4239  BarrierSet* bs = Universe::heap()->barrier_set();
4240  CardTableModRefBS* ct = (CardTableModRefBS*)bs;
4241  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
4242
4243  Label done;
4244  Label runtime;
4245
4246  // Does store cross heap regions?
4247
4248  movptr(tmp, store_addr);
4249  xorptr(tmp, new_val);
4250  shrptr(tmp, HeapRegion::LogOfHRGrainBytes);
4251  jcc(Assembler::equal, done);
4252
4253  // crosses regions, storing NULL?
4254
4255  cmpptr(new_val, (int32_t) NULL_WORD);
4256  jcc(Assembler::equal, done);
4257
4258  // storing region crossing non-NULL, is card already dirty?
4259
4260  const Register card_addr = tmp;
4261  const Register cardtable = tmp2;
4262
4263  movptr(card_addr, store_addr);
4264  shrptr(card_addr, CardTableModRefBS::card_shift);
4265  // Do not use ExternalAddress to load 'byte_map_base', since 'byte_map_base' is NOT
4266  // a valid address and therefore is not properly handled by the relocation code.
4267  movptr(cardtable, (intptr_t)ct->byte_map_base);
4268  addptr(card_addr, cardtable);
4269
4270  cmpb(Address(card_addr, 0), (int)G1SATBCardTableModRefBS::g1_young_card_val());
4271  jcc(Assembler::equal, done);
4272
4273  membar(Assembler::Membar_mask_bits(Assembler::StoreLoad));
4274  cmpb(Address(card_addr, 0), (int)CardTableModRefBS::dirty_card_val());
4275  jcc(Assembler::equal, done);
4276
4277
4278  // storing a region crossing, non-NULL oop, card is clean.
4279  // dirty card and log.
4280
4281  movb(Address(card_addr, 0), (int)CardTableModRefBS::dirty_card_val());
4282
4283  cmpl(queue_index, 0);
4284  jcc(Assembler::equal, runtime);
4285  subl(queue_index, wordSize);
4286  movptr(tmp2, buffer);
4287#ifdef _LP64
4288  movslq(rscratch1, queue_index);
4289  addq(tmp2, rscratch1);
4290  movq(Address(tmp2, 0), card_addr);
4291#else
4292  addl(tmp2, queue_index);
4293  movl(Address(tmp2, 0), card_addr);
4294#endif
4295  jmp(done);
4296
4297  bind(runtime);
4298  // save the live input values
4299  push(store_addr);
4300  push(new_val);
4301#ifdef _LP64
4302  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, r15_thread);
4303#else
4304  push(thread);
4305  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread);
4306  pop(thread);
4307#endif
4308  pop(new_val);
4309  pop(store_addr);
4310
4311  bind(done);
4312}
4313
4314#endif // INCLUDE_ALL_GCS
4315//////////////////////////////////////////////////////////////////////////////////
4316
4317
4318void MacroAssembler::store_check(Register obj) {
4319  // Does a store check for the oop in register obj. The content of
4320  // register obj is destroyed afterwards.
4321  store_check_part_1(obj);
4322  store_check_part_2(obj);
4323}
4324
4325void MacroAssembler::store_check(Register obj, Address dst) {
4326  store_check(obj);
4327}
4328
4329
4330// split the store check operation so that other instructions can be scheduled inbetween
4331void MacroAssembler::store_check_part_1(Register obj) {
4332  BarrierSet* bs = Universe::heap()->barrier_set();
4333  assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
4334  shrptr(obj, CardTableModRefBS::card_shift);
4335}
4336
4337void MacroAssembler::store_check_part_2(Register obj) {
4338  BarrierSet* bs = Universe::heap()->barrier_set();
4339  assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
4340  CardTableModRefBS* ct = (CardTableModRefBS*)bs;
4341  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
4342
4343  // The calculation for byte_map_base is as follows:
4344  // byte_map_base = _byte_map - (uintptr_t(low_bound) >> card_shift);
4345  // So this essentially converts an address to a displacement and it will
4346  // never need to be relocated. On 64bit however the value may be too
4347  // large for a 32bit displacement.
4348  intptr_t disp = (intptr_t) ct->byte_map_base;
4349  if (is_simm32(disp)) {
4350    Address cardtable(noreg, obj, Address::times_1, disp);
4351    movb(cardtable, 0);
4352  } else {
4353    // By doing it as an ExternalAddress 'disp' could be converted to a rip-relative
4354    // displacement and done in a single instruction given favorable mapping and a
4355    // smarter version of as_Address. However, 'ExternalAddress' generates a relocation
4356    // entry and that entry is not properly handled by the relocation code.
4357    AddressLiteral cardtable((address)ct->byte_map_base, relocInfo::none);
4358    Address index(noreg, obj, Address::times_1);
4359    movb(as_Address(ArrayAddress(cardtable, index)), 0);
4360  }
4361}
4362
4363void MacroAssembler::subptr(Register dst, int32_t imm32) {
4364  LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
4365}
4366
4367// Force generation of a 4 byte immediate value even if it fits into 8bit
4368void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
4369  LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32));
4370}
4371
4372void MacroAssembler::subptr(Register dst, Register src) {
4373  LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
4374}
4375
4376// C++ bool manipulation
4377void MacroAssembler::testbool(Register dst) {
4378  if(sizeof(bool) == 1)
4379    testb(dst, 0xff);
4380  else if(sizeof(bool) == 2) {
4381    // testw implementation needed for two byte bools
4382    ShouldNotReachHere();
4383  } else if(sizeof(bool) == 4)
4384    testl(dst, dst);
4385  else
4386    // unsupported
4387    ShouldNotReachHere();
4388}
4389
4390void MacroAssembler::testptr(Register dst, Register src) {
4391  LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
4392}
4393
4394// Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4395void MacroAssembler::tlab_allocate(Register obj,
4396                                   Register var_size_in_bytes,
4397                                   int con_size_in_bytes,
4398                                   Register t1,
4399                                   Register t2,
4400                                   Label& slow_case) {
4401  assert_different_registers(obj, t1, t2);
4402  assert_different_registers(obj, var_size_in_bytes, t1);
4403  Register end = t2;
4404  Register thread = NOT_LP64(t1) LP64_ONLY(r15_thread);
4405
4406  verify_tlab();
4407
4408  NOT_LP64(get_thread(thread));
4409
4410  movptr(obj, Address(thread, JavaThread::tlab_top_offset()));
4411  if (var_size_in_bytes == noreg) {
4412    lea(end, Address(obj, con_size_in_bytes));
4413  } else {
4414    lea(end, Address(obj, var_size_in_bytes, Address::times_1));
4415  }
4416  cmpptr(end, Address(thread, JavaThread::tlab_end_offset()));
4417  jcc(Assembler::above, slow_case);
4418
4419  // update the tlab top pointer
4420  movptr(Address(thread, JavaThread::tlab_top_offset()), end);
4421
4422  // recover var_size_in_bytes if necessary
4423  if (var_size_in_bytes == end) {
4424    subptr(var_size_in_bytes, obj);
4425  }
4426  verify_tlab();
4427}
4428
4429// Preserves rbx, and rdx.
4430Register MacroAssembler::tlab_refill(Label& retry,
4431                                     Label& try_eden,
4432                                     Label& slow_case) {
4433  Register top = rax;
4434  Register t1  = rcx;
4435  Register t2  = rsi;
4436  Register thread_reg = NOT_LP64(rdi) LP64_ONLY(r15_thread);
4437  assert_different_registers(top, thread_reg, t1, t2, /* preserve: */ rbx, rdx);
4438  Label do_refill, discard_tlab;
4439
4440  if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) {
4441    // No allocation in the shared eden.
4442    jmp(slow_case);
4443  }
4444
4445  NOT_LP64(get_thread(thread_reg));
4446
4447  movptr(top, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
4448  movptr(t1,  Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
4449
4450  // calculate amount of free space
4451  subptr(t1, top);
4452  shrptr(t1, LogHeapWordSize);
4453
4454  // Retain tlab and allocate object in shared space if
4455  // the amount free in the tlab is too large to discard.
4456  cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
4457  jcc(Assembler::lessEqual, discard_tlab);
4458
4459  // Retain
4460  // %%% yuck as movptr...
4461  movptr(t2, (int32_t) ThreadLocalAllocBuffer::refill_waste_limit_increment());
4462  addptr(Address(thread_reg, in_bytes(JavaThread::tlab_refill_waste_limit_offset())), t2);
4463  if (TLABStats) {
4464    // increment number of slow_allocations
4465    addl(Address(thread_reg, in_bytes(JavaThread::tlab_slow_allocations_offset())), 1);
4466  }
4467  jmp(try_eden);
4468
4469  bind(discard_tlab);
4470  if (TLABStats) {
4471    // increment number of refills
4472    addl(Address(thread_reg, in_bytes(JavaThread::tlab_number_of_refills_offset())), 1);
4473    // accumulate wastage -- t1 is amount free in tlab
4474    addl(Address(thread_reg, in_bytes(JavaThread::tlab_fast_refill_waste_offset())), t1);
4475  }
4476
4477  // if tlab is currently allocated (top or end != null) then
4478  // fill [top, end + alignment_reserve) with array object
4479  testptr(top, top);
4480  jcc(Assembler::zero, do_refill);
4481
4482  // set up the mark word
4483  movptr(Address(top, oopDesc::mark_offset_in_bytes()), (intptr_t)markOopDesc::prototype()->copy_set_hash(0x2));
4484  // set the length to the remaining space
4485  subptr(t1, typeArrayOopDesc::header_size(T_INT));
4486  addptr(t1, (int32_t)ThreadLocalAllocBuffer::alignment_reserve());
4487  shlptr(t1, log2_intptr(HeapWordSize/sizeof(jint)));
4488  movl(Address(top, arrayOopDesc::length_offset_in_bytes()), t1);
4489  // set klass to intArrayKlass
4490  // dubious reloc why not an oop reloc?
4491  movptr(t1, ExternalAddress((address)Universe::intArrayKlassObj_addr()));
4492  // store klass last.  concurrent gcs assumes klass length is valid if
4493  // klass field is not null.
4494  store_klass(top, t1);
4495
4496  movptr(t1, top);
4497  subptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
4498  incr_allocated_bytes(thread_reg, t1, 0);
4499
4500  // refill the tlab with an eden allocation
4501  bind(do_refill);
4502  movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_size_offset())));
4503  shlptr(t1, LogHeapWordSize);
4504  // allocate new tlab, address returned in top
4505  eden_allocate(top, t1, 0, t2, slow_case);
4506
4507  // Check that t1 was preserved in eden_allocate.
4508#ifdef ASSERT
4509  if (UseTLAB) {
4510    Label ok;
4511    Register tsize = rsi;
4512    assert_different_registers(tsize, thread_reg, t1);
4513    push(tsize);
4514    movptr(tsize, Address(thread_reg, in_bytes(JavaThread::tlab_size_offset())));
4515    shlptr(tsize, LogHeapWordSize);
4516    cmpptr(t1, tsize);
4517    jcc(Assembler::equal, ok);
4518    STOP("assert(t1 != tlab size)");
4519    should_not_reach_here();
4520
4521    bind(ok);
4522    pop(tsize);
4523  }
4524#endif
4525  movptr(Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())), top);
4526  movptr(Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())), top);
4527  addptr(top, t1);
4528  subptr(top, (int32_t)ThreadLocalAllocBuffer::alignment_reserve_in_bytes());
4529  movptr(Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())), top);
4530  verify_tlab();
4531  jmp(retry);
4532
4533  return thread_reg; // for use by caller
4534}
4535
4536void MacroAssembler::incr_allocated_bytes(Register thread,
4537                                          Register var_size_in_bytes,
4538                                          int con_size_in_bytes,
4539                                          Register t1) {
4540  if (!thread->is_valid()) {
4541#ifdef _LP64
4542    thread = r15_thread;
4543#else
4544    assert(t1->is_valid(), "need temp reg");
4545    thread = t1;
4546    get_thread(thread);
4547#endif
4548  }
4549
4550#ifdef _LP64
4551  if (var_size_in_bytes->is_valid()) {
4552    addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
4553  } else {
4554    addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
4555  }
4556#else
4557  if (var_size_in_bytes->is_valid()) {
4558    addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
4559  } else {
4560    addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
4561  }
4562  adcl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())+4), 0);
4563#endif
4564}
4565
4566void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int num_fpu_regs_in_use) {
4567  pusha();
4568
4569  // if we are coming from c1, xmm registers may be live
4570  int off = 0;
4571  if (UseSSE == 1)  {
4572    subptr(rsp, sizeof(jdouble)*8);
4573    movflt(Address(rsp,off++*sizeof(jdouble)),xmm0);
4574    movflt(Address(rsp,off++*sizeof(jdouble)),xmm1);
4575    movflt(Address(rsp,off++*sizeof(jdouble)),xmm2);
4576    movflt(Address(rsp,off++*sizeof(jdouble)),xmm3);
4577    movflt(Address(rsp,off++*sizeof(jdouble)),xmm4);
4578    movflt(Address(rsp,off++*sizeof(jdouble)),xmm5);
4579    movflt(Address(rsp,off++*sizeof(jdouble)),xmm6);
4580    movflt(Address(rsp,off++*sizeof(jdouble)),xmm7);
4581  } else if (UseSSE >= 2)  {
4582#ifdef COMPILER2
4583    if (MaxVectorSize > 16) {
4584      assert(UseAVX > 0, "256bit vectors are supported only with AVX");
4585      // Save upper half of YMM registes
4586      subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
4587      vextractf128h(Address(rsp,  0),xmm0);
4588      vextractf128h(Address(rsp, 16),xmm1);
4589      vextractf128h(Address(rsp, 32),xmm2);
4590      vextractf128h(Address(rsp, 48),xmm3);
4591      vextractf128h(Address(rsp, 64),xmm4);
4592      vextractf128h(Address(rsp, 80),xmm5);
4593      vextractf128h(Address(rsp, 96),xmm6);
4594      vextractf128h(Address(rsp,112),xmm7);
4595#ifdef _LP64
4596      vextractf128h(Address(rsp,128),xmm8);
4597      vextractf128h(Address(rsp,144),xmm9);
4598      vextractf128h(Address(rsp,160),xmm10);
4599      vextractf128h(Address(rsp,176),xmm11);
4600      vextractf128h(Address(rsp,192),xmm12);
4601      vextractf128h(Address(rsp,208),xmm13);
4602      vextractf128h(Address(rsp,224),xmm14);
4603      vextractf128h(Address(rsp,240),xmm15);
4604#endif
4605    }
4606#endif
4607    // Save whole 128bit (16 bytes) XMM regiters
4608    subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
4609    movdqu(Address(rsp,off++*16),xmm0);
4610    movdqu(Address(rsp,off++*16),xmm1);
4611    movdqu(Address(rsp,off++*16),xmm2);
4612    movdqu(Address(rsp,off++*16),xmm3);
4613    movdqu(Address(rsp,off++*16),xmm4);
4614    movdqu(Address(rsp,off++*16),xmm5);
4615    movdqu(Address(rsp,off++*16),xmm6);
4616    movdqu(Address(rsp,off++*16),xmm7);
4617#ifdef _LP64
4618    movdqu(Address(rsp,off++*16),xmm8);
4619    movdqu(Address(rsp,off++*16),xmm9);
4620    movdqu(Address(rsp,off++*16),xmm10);
4621    movdqu(Address(rsp,off++*16),xmm11);
4622    movdqu(Address(rsp,off++*16),xmm12);
4623    movdqu(Address(rsp,off++*16),xmm13);
4624    movdqu(Address(rsp,off++*16),xmm14);
4625    movdqu(Address(rsp,off++*16),xmm15);
4626#endif
4627  }
4628
4629  // Preserve registers across runtime call
4630  int incoming_argument_and_return_value_offset = -1;
4631  if (num_fpu_regs_in_use > 1) {
4632    // Must preserve all other FPU regs (could alternatively convert
4633    // SharedRuntime::dsin, dcos etc. into assembly routines known not to trash
4634    // FPU state, but can not trust C compiler)
4635    NEEDS_CLEANUP;
4636    // NOTE that in this case we also push the incoming argument(s) to
4637    // the stack and restore it later; we also use this stack slot to
4638    // hold the return value from dsin, dcos etc.
4639    for (int i = 0; i < num_fpu_regs_in_use; i++) {
4640      subptr(rsp, sizeof(jdouble));
4641      fstp_d(Address(rsp, 0));
4642    }
4643    incoming_argument_and_return_value_offset = sizeof(jdouble)*(num_fpu_regs_in_use-1);
4644    for (int i = nb_args-1; i >= 0; i--) {
4645      fld_d(Address(rsp, incoming_argument_and_return_value_offset-i*sizeof(jdouble)));
4646    }
4647  }
4648
4649  subptr(rsp, nb_args*sizeof(jdouble));
4650  for (int i = 0; i < nb_args; i++) {
4651    fstp_d(Address(rsp, i*sizeof(jdouble)));
4652  }
4653
4654#ifdef _LP64
4655  if (nb_args > 0) {
4656    movdbl(xmm0, Address(rsp, 0));
4657  }
4658  if (nb_args > 1) {
4659    movdbl(xmm1, Address(rsp, sizeof(jdouble)));
4660  }
4661  assert(nb_args <= 2, "unsupported number of args");
4662#endif // _LP64
4663
4664  // NOTE: we must not use call_VM_leaf here because that requires a
4665  // complete interpreter frame in debug mode -- same bug as 4387334
4666  // MacroAssembler::call_VM_leaf_base is perfectly safe and will
4667  // do proper 64bit abi
4668
4669  NEEDS_CLEANUP;
4670  // Need to add stack banging before this runtime call if it needs to
4671  // be taken; however, there is no generic stack banging routine at
4672  // the MacroAssembler level
4673
4674  MacroAssembler::call_VM_leaf_base(runtime_entry, 0);
4675
4676#ifdef _LP64
4677  movsd(Address(rsp, 0), xmm0);
4678  fld_d(Address(rsp, 0));
4679#endif // _LP64
4680  addptr(rsp, sizeof(jdouble) * nb_args);
4681  if (num_fpu_regs_in_use > 1) {
4682    // Must save return value to stack and then restore entire FPU
4683    // stack except incoming arguments
4684    fstp_d(Address(rsp, incoming_argument_and_return_value_offset));
4685    for (int i = 0; i < num_fpu_regs_in_use - nb_args; i++) {
4686      fld_d(Address(rsp, 0));
4687      addptr(rsp, sizeof(jdouble));
4688    }
4689    fld_d(Address(rsp, (nb_args-1)*sizeof(jdouble)));
4690    addptr(rsp, sizeof(jdouble) * nb_args);
4691  }
4692
4693  off = 0;
4694  if (UseSSE == 1)  {
4695    movflt(xmm0, Address(rsp,off++*sizeof(jdouble)));
4696    movflt(xmm1, Address(rsp,off++*sizeof(jdouble)));
4697    movflt(xmm2, Address(rsp,off++*sizeof(jdouble)));
4698    movflt(xmm3, Address(rsp,off++*sizeof(jdouble)));
4699    movflt(xmm4, Address(rsp,off++*sizeof(jdouble)));
4700    movflt(xmm5, Address(rsp,off++*sizeof(jdouble)));
4701    movflt(xmm6, Address(rsp,off++*sizeof(jdouble)));
4702    movflt(xmm7, Address(rsp,off++*sizeof(jdouble)));
4703    addptr(rsp, sizeof(jdouble)*8);
4704  } else if (UseSSE >= 2)  {
4705    // Restore whole 128bit (16 bytes) XMM regiters
4706    movdqu(xmm0, Address(rsp,off++*16));
4707    movdqu(xmm1, Address(rsp,off++*16));
4708    movdqu(xmm2, Address(rsp,off++*16));
4709    movdqu(xmm3, Address(rsp,off++*16));
4710    movdqu(xmm4, Address(rsp,off++*16));
4711    movdqu(xmm5, Address(rsp,off++*16));
4712    movdqu(xmm6, Address(rsp,off++*16));
4713    movdqu(xmm7, Address(rsp,off++*16));
4714#ifdef _LP64
4715    movdqu(xmm8, Address(rsp,off++*16));
4716    movdqu(xmm9, Address(rsp,off++*16));
4717    movdqu(xmm10, Address(rsp,off++*16));
4718    movdqu(xmm11, Address(rsp,off++*16));
4719    movdqu(xmm12, Address(rsp,off++*16));
4720    movdqu(xmm13, Address(rsp,off++*16));
4721    movdqu(xmm14, Address(rsp,off++*16));
4722    movdqu(xmm15, Address(rsp,off++*16));
4723#endif
4724    addptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
4725#ifdef COMPILER2
4726    if (MaxVectorSize > 16) {
4727      // Restore upper half of YMM registes.
4728      vinsertf128h(xmm0, Address(rsp,  0));
4729      vinsertf128h(xmm1, Address(rsp, 16));
4730      vinsertf128h(xmm2, Address(rsp, 32));
4731      vinsertf128h(xmm3, Address(rsp, 48));
4732      vinsertf128h(xmm4, Address(rsp, 64));
4733      vinsertf128h(xmm5, Address(rsp, 80));
4734      vinsertf128h(xmm6, Address(rsp, 96));
4735      vinsertf128h(xmm7, Address(rsp,112));
4736#ifdef _LP64
4737      vinsertf128h(xmm8, Address(rsp,128));
4738      vinsertf128h(xmm9, Address(rsp,144));
4739      vinsertf128h(xmm10, Address(rsp,160));
4740      vinsertf128h(xmm11, Address(rsp,176));
4741      vinsertf128h(xmm12, Address(rsp,192));
4742      vinsertf128h(xmm13, Address(rsp,208));
4743      vinsertf128h(xmm14, Address(rsp,224));
4744      vinsertf128h(xmm15, Address(rsp,240));
4745#endif
4746      addptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
4747    }
4748#endif
4749  }
4750  popa();
4751}
4752
4753static const double     pi_4 =  0.7853981633974483;
4754
4755void MacroAssembler::trigfunc(char trig, int num_fpu_regs_in_use) {
4756  // A hand-coded argument reduction for values in fabs(pi/4, pi/2)
4757  // was attempted in this code; unfortunately it appears that the
4758  // switch to 80-bit precision and back causes this to be
4759  // unprofitable compared with simply performing a runtime call if
4760  // the argument is out of the (-pi/4, pi/4) range.
4761
4762  Register tmp = noreg;
4763  if (!VM_Version::supports_cmov()) {
4764    // fcmp needs a temporary so preserve rbx,
4765    tmp = rbx;
4766    push(tmp);
4767  }
4768
4769  Label slow_case, done;
4770
4771  ExternalAddress pi4_adr = (address)&pi_4;
4772  if (reachable(pi4_adr)) {
4773    // x ?<= pi/4
4774    fld_d(pi4_adr);
4775    fld_s(1);                // Stack:  X  PI/4  X
4776    fabs();                  // Stack: |X| PI/4  X
4777    fcmp(tmp);
4778    jcc(Assembler::above, slow_case);
4779
4780    // fastest case: -pi/4 <= x <= pi/4
4781    switch(trig) {
4782    case 's':
4783      fsin();
4784      break;
4785    case 'c':
4786      fcos();
4787      break;
4788    case 't':
4789      ftan();
4790      break;
4791    default:
4792      assert(false, "bad intrinsic");
4793      break;
4794    }
4795    jmp(done);
4796  }
4797
4798  // slow case: runtime call
4799  bind(slow_case);
4800
4801  switch(trig) {
4802  case 's':
4803    {
4804      fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dsin), 1, num_fpu_regs_in_use);
4805    }
4806    break;
4807  case 'c':
4808    {
4809      fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dcos), 1, num_fpu_regs_in_use);
4810    }
4811    break;
4812  case 't':
4813    {
4814      fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dtan), 1, num_fpu_regs_in_use);
4815    }
4816    break;
4817  default:
4818    assert(false, "bad intrinsic");
4819    break;
4820  }
4821
4822  // Come here with result in F-TOS
4823  bind(done);
4824
4825  if (tmp != noreg) {
4826    pop(tmp);
4827  }
4828}
4829
4830
4831// Look up the method for a megamorphic invokeinterface call.
4832// The target method is determined by <intf_klass, itable_index>.
4833// The receiver klass is in recv_klass.
4834// On success, the result will be in method_result, and execution falls through.
4835// On failure, execution transfers to the given label.
4836void MacroAssembler::lookup_interface_method(Register recv_klass,
4837                                             Register intf_klass,
4838                                             RegisterOrConstant itable_index,
4839                                             Register method_result,
4840                                             Register scan_temp,
4841                                             Label& L_no_such_interface) {
4842  assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
4843  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
4844         "caller must use same register for non-constant itable index as for method");
4845
4846  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
4847  int vtable_base = InstanceKlass::vtable_start_offset() * wordSize;
4848  int itentry_off = itableMethodEntry::method_offset_in_bytes();
4849  int scan_step   = itableOffsetEntry::size() * wordSize;
4850  int vte_size    = vtableEntry::size() * wordSize;
4851  Address::ScaleFactor times_vte_scale = Address::times_ptr;
4852  assert(vte_size == wordSize, "else adjust times_vte_scale");
4853
4854  movl(scan_temp, Address(recv_klass, InstanceKlass::vtable_length_offset() * wordSize));
4855
4856  // %%% Could store the aligned, prescaled offset in the klassoop.
4857  lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
4858  if (HeapWordsPerLong > 1) {
4859    // Round up to align_object_offset boundary
4860    // see code for InstanceKlass::start_of_itable!
4861    round_to(scan_temp, BytesPerLong);
4862  }
4863
4864  // Adjust recv_klass by scaled itable_index, so we can free itable_index.
4865  assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
4866  lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
4867
4868  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
4869  //   if (scan->interface() == intf) {
4870  //     result = (klass + scan->offset() + itable_index);
4871  //   }
4872  // }
4873  Label search, found_method;
4874
4875  for (int peel = 1; peel >= 0; peel--) {
4876    movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
4877    cmpptr(intf_klass, method_result);
4878
4879    if (peel) {
4880      jccb(Assembler::equal, found_method);
4881    } else {
4882      jccb(Assembler::notEqual, search);
4883      // (invert the test to fall through to found_method...)
4884    }
4885
4886    if (!peel)  break;
4887
4888    bind(search);
4889
4890    // Check that the previous entry is non-null.  A null entry means that
4891    // the receiver class doesn't implement the interface, and wasn't the
4892    // same as when the caller was compiled.
4893    testptr(method_result, method_result);
4894    jcc(Assembler::zero, L_no_such_interface);
4895    addptr(scan_temp, scan_step);
4896  }
4897
4898  bind(found_method);
4899
4900  // Got a hit.
4901  movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
4902  movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
4903}
4904
4905
4906// virtual method calling
4907void MacroAssembler::lookup_virtual_method(Register recv_klass,
4908                                           RegisterOrConstant vtable_index,
4909                                           Register method_result) {
4910  const int base = InstanceKlass::vtable_start_offset() * wordSize;
4911  assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
4912  Address vtable_entry_addr(recv_klass,
4913                            vtable_index, Address::times_ptr,
4914                            base + vtableEntry::method_offset_in_bytes());
4915  movptr(method_result, vtable_entry_addr);
4916}
4917
4918
4919void MacroAssembler::check_klass_subtype(Register sub_klass,
4920                           Register super_klass,
4921                           Register temp_reg,
4922                           Label& L_success) {
4923  Label L_failure;
4924  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
4925  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
4926  bind(L_failure);
4927}
4928
4929
4930void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
4931                                                   Register super_klass,
4932                                                   Register temp_reg,
4933                                                   Label* L_success,
4934                                                   Label* L_failure,
4935                                                   Label* L_slow_path,
4936                                        RegisterOrConstant super_check_offset) {
4937  assert_different_registers(sub_klass, super_klass, temp_reg);
4938  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
4939  if (super_check_offset.is_register()) {
4940    assert_different_registers(sub_klass, super_klass,
4941                               super_check_offset.as_register());
4942  } else if (must_load_sco) {
4943    assert(temp_reg != noreg, "supply either a temp or a register offset");
4944  }
4945
4946  Label L_fallthrough;
4947  int label_nulls = 0;
4948  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
4949  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
4950  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
4951  assert(label_nulls <= 1, "at most one NULL in the batch");
4952
4953  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4954  int sco_offset = in_bytes(Klass::super_check_offset_offset());
4955  Address super_check_offset_addr(super_klass, sco_offset);
4956
4957  // Hacked jcc, which "knows" that L_fallthrough, at least, is in
4958  // range of a jccb.  If this routine grows larger, reconsider at
4959  // least some of these.
4960#define local_jcc(assembler_cond, label)                                \
4961  if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);         \
4962  else                             jcc( assembler_cond, label) /*omit semi*/
4963
4964  // Hacked jmp, which may only be used just before L_fallthrough.
4965#define final_jmp(label)                                                \
4966  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
4967  else                            jmp(label)                /*omit semi*/
4968
4969  // If the pointers are equal, we are done (e.g., String[] elements).
4970  // This self-check enables sharing of secondary supertype arrays among
4971  // non-primary types such as array-of-interface.  Otherwise, each such
4972  // type would need its own customized SSA.
4973  // We move this check to the front of the fast path because many
4974  // type checks are in fact trivially successful in this manner,
4975  // so we get a nicely predicted branch right at the start of the check.
4976  cmpptr(sub_klass, super_klass);
4977  local_jcc(Assembler::equal, *L_success);
4978
4979  // Check the supertype display:
4980  if (must_load_sco) {
4981    // Positive movl does right thing on LP64.
4982    movl(temp_reg, super_check_offset_addr);
4983    super_check_offset = RegisterOrConstant(temp_reg);
4984  }
4985  Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
4986  cmpptr(super_klass, super_check_addr); // load displayed supertype
4987
4988  // This check has worked decisively for primary supers.
4989  // Secondary supers are sought in the super_cache ('super_cache_addr').
4990  // (Secondary supers are interfaces and very deeply nested subtypes.)
4991  // This works in the same check above because of a tricky aliasing
4992  // between the super_cache and the primary super display elements.
4993  // (The 'super_check_addr' can address either, as the case requires.)
4994  // Note that the cache is updated below if it does not help us find
4995  // what we need immediately.
4996  // So if it was a primary super, we can just fail immediately.
4997  // Otherwise, it's the slow path for us (no success at this point).
4998
4999  if (super_check_offset.is_register()) {
5000    local_jcc(Assembler::equal, *L_success);
5001    cmpl(super_check_offset.as_register(), sc_offset);
5002    if (L_failure == &L_fallthrough) {
5003      local_jcc(Assembler::equal, *L_slow_path);
5004    } else {
5005      local_jcc(Assembler::notEqual, *L_failure);
5006      final_jmp(*L_slow_path);
5007    }
5008  } else if (super_check_offset.as_constant() == sc_offset) {
5009    // Need a slow path; fast failure is impossible.
5010    if (L_slow_path == &L_fallthrough) {
5011      local_jcc(Assembler::equal, *L_success);
5012    } else {
5013      local_jcc(Assembler::notEqual, *L_slow_path);
5014      final_jmp(*L_success);
5015    }
5016  } else {
5017    // No slow path; it's a fast decision.
5018    if (L_failure == &L_fallthrough) {
5019      local_jcc(Assembler::equal, *L_success);
5020    } else {
5021      local_jcc(Assembler::notEqual, *L_failure);
5022      final_jmp(*L_success);
5023    }
5024  }
5025
5026  bind(L_fallthrough);
5027
5028#undef local_jcc
5029#undef final_jmp
5030}
5031
5032
5033void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
5034                                                   Register super_klass,
5035                                                   Register temp_reg,
5036                                                   Register temp2_reg,
5037                                                   Label* L_success,
5038                                                   Label* L_failure,
5039                                                   bool set_cond_codes) {
5040  assert_different_registers(sub_klass, super_klass, temp_reg);
5041  if (temp2_reg != noreg)
5042    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
5043#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
5044
5045  Label L_fallthrough;
5046  int label_nulls = 0;
5047  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
5048  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
5049  assert(label_nulls <= 1, "at most one NULL in the batch");
5050
5051  // a couple of useful fields in sub_klass:
5052  int ss_offset = in_bytes(Klass::secondary_supers_offset());
5053  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
5054  Address secondary_supers_addr(sub_klass, ss_offset);
5055  Address super_cache_addr(     sub_klass, sc_offset);
5056
5057  // Do a linear scan of the secondary super-klass chain.
5058  // This code is rarely used, so simplicity is a virtue here.
5059  // The repne_scan instruction uses fixed registers, which we must spill.
5060  // Don't worry too much about pre-existing connections with the input regs.
5061
5062  assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
5063  assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)
5064
5065  // Get super_klass value into rax (even if it was in rdi or rcx).
5066  bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
5067  if (super_klass != rax || UseCompressedOops) {
5068    if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
5069    mov(rax, super_klass);
5070  }
5071  if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
5072  if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }
5073
5074#ifndef PRODUCT
5075  int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
5076  ExternalAddress pst_counter_addr((address) pst_counter);
5077  NOT_LP64(  incrementl(pst_counter_addr) );
5078  LP64_ONLY( lea(rcx, pst_counter_addr) );
5079  LP64_ONLY( incrementl(Address(rcx, 0)) );
5080#endif //PRODUCT
5081
5082  // We will consult the secondary-super array.
5083  movptr(rdi, secondary_supers_addr);
5084  // Load the array length.  (Positive movl does right thing on LP64.)
5085  movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
5086  // Skip to start of data.
5087  addptr(rdi, Array<Klass*>::base_offset_in_bytes());
5088
5089  // Scan RCX words at [RDI] for an occurrence of RAX.
5090  // Set NZ/Z based on last compare.
5091  // Z flag value will not be set by 'repne' if RCX == 0 since 'repne' does
5092  // not change flags (only scas instruction which is repeated sets flags).
5093  // Set Z = 0 (not equal) before 'repne' to indicate that class was not found.
5094
5095    testptr(rax,rax); // Set Z = 0
5096    repne_scan();
5097
5098  // Unspill the temp. registers:
5099  if (pushed_rdi)  pop(rdi);
5100  if (pushed_rcx)  pop(rcx);
5101  if (pushed_rax)  pop(rax);
5102
5103  if (set_cond_codes) {
5104    // Special hack for the AD files:  rdi is guaranteed non-zero.
5105    assert(!pushed_rdi, "rdi must be left non-NULL");
5106    // Also, the condition codes are properly set Z/NZ on succeed/failure.
5107  }
5108
5109  if (L_failure == &L_fallthrough)
5110        jccb(Assembler::notEqual, *L_failure);
5111  else  jcc(Assembler::notEqual, *L_failure);
5112
5113  // Success.  Cache the super we found and proceed in triumph.
5114  movptr(super_cache_addr, super_klass);
5115
5116  if (L_success != &L_fallthrough) {
5117    jmp(*L_success);
5118  }
5119
5120#undef IS_A_TEMP
5121
5122  bind(L_fallthrough);
5123}
5124
5125
5126void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
5127  if (VM_Version::supports_cmov()) {
5128    cmovl(cc, dst, src);
5129  } else {
5130    Label L;
5131    jccb(negate_condition(cc), L);
5132    movl(dst, src);
5133    bind(L);
5134  }
5135}
5136
5137void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
5138  if (VM_Version::supports_cmov()) {
5139    cmovl(cc, dst, src);
5140  } else {
5141    Label L;
5142    jccb(negate_condition(cc), L);
5143    movl(dst, src);
5144    bind(L);
5145  }
5146}
5147
5148void MacroAssembler::verify_oop(Register reg, const char* s) {
5149  if (!VerifyOops) return;
5150
5151  // Pass register number to verify_oop_subroutine
5152  const char* b = NULL;
5153  {
5154    ResourceMark rm;
5155    stringStream ss;
5156    ss.print("verify_oop: %s: %s", reg->name(), s);
5157    b = code_string(ss.as_string());
5158  }
5159  BLOCK_COMMENT("verify_oop {");
5160#ifdef _LP64
5161  push(rscratch1);                    // save r10, trashed by movptr()
5162#endif
5163  push(rax);                          // save rax,
5164  push(reg);                          // pass register argument
5165  ExternalAddress buffer((address) b);
5166  // avoid using pushptr, as it modifies scratch registers
5167  // and our contract is not to modify anything
5168  movptr(rax, buffer.addr());
5169  push(rax);
5170  // call indirectly to solve generation ordering problem
5171  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
5172  call(rax);
5173  // Caller pops the arguments (oop, message) and restores rax, r10
5174  BLOCK_COMMENT("} verify_oop");
5175}
5176
5177
5178RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
5179                                                      Register tmp,
5180                                                      int offset) {
5181  intptr_t value = *delayed_value_addr;
5182  if (value != 0)
5183    return RegisterOrConstant(value + offset);
5184
5185  // load indirectly to solve generation ordering problem
5186  movptr(tmp, ExternalAddress((address) delayed_value_addr));
5187
5188#ifdef ASSERT
5189  { Label L;
5190    testptr(tmp, tmp);
5191    if (WizardMode) {
5192      const char* buf = NULL;
5193      {
5194        ResourceMark rm;
5195        stringStream ss;
5196        ss.print("DelayedValue="INTPTR_FORMAT, delayed_value_addr[1]);
5197        buf = code_string(ss.as_string());
5198      }
5199      jcc(Assembler::notZero, L);
5200      STOP(buf);
5201    } else {
5202      jccb(Assembler::notZero, L);
5203      hlt();
5204    }
5205    bind(L);
5206  }
5207#endif
5208
5209  if (offset != 0)
5210    addptr(tmp, offset);
5211
5212  return RegisterOrConstant(tmp);
5213}
5214
5215
5216Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
5217                                         int extra_slot_offset) {
5218  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
5219  int stackElementSize = Interpreter::stackElementSize;
5220  int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
5221#ifdef ASSERT
5222  int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
5223  assert(offset1 - offset == stackElementSize, "correct arithmetic");
5224#endif
5225  Register             scale_reg    = noreg;
5226  Address::ScaleFactor scale_factor = Address::no_scale;
5227  if (arg_slot.is_constant()) {
5228    offset += arg_slot.as_constant() * stackElementSize;
5229  } else {
5230    scale_reg    = arg_slot.as_register();
5231    scale_factor = Address::times(stackElementSize);
5232  }
5233  offset += wordSize;           // return PC is on stack
5234  return Address(rsp, scale_reg, scale_factor, offset);
5235}
5236
5237
5238void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
5239  if (!VerifyOops) return;
5240
5241  // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord);
5242  // Pass register number to verify_oop_subroutine
5243  const char* b = NULL;
5244  {
5245    ResourceMark rm;
5246    stringStream ss;
5247    ss.print("verify_oop_addr: %s", s);
5248    b = code_string(ss.as_string());
5249  }
5250#ifdef _LP64
5251  push(rscratch1);                    // save r10, trashed by movptr()
5252#endif
5253  push(rax);                          // save rax,
5254  // addr may contain rsp so we will have to adjust it based on the push
5255  // we just did (and on 64 bit we do two pushes)
5256  // NOTE: 64bit seemed to have had a bug in that it did movq(addr, rax); which
5257  // stores rax into addr which is backwards of what was intended.
5258  if (addr.uses(rsp)) {
5259    lea(rax, addr);
5260    pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
5261  } else {
5262    pushptr(addr);
5263  }
5264
5265  ExternalAddress buffer((address) b);
5266  // pass msg argument
5267  // avoid using pushptr, as it modifies scratch registers
5268  // and our contract is not to modify anything
5269  movptr(rax, buffer.addr());
5270  push(rax);
5271
5272  // call indirectly to solve generation ordering problem
5273  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
5274  call(rax);
5275  // Caller pops the arguments (addr, message) and restores rax, r10.
5276}
5277
5278void MacroAssembler::verify_tlab() {
5279#ifdef ASSERT
5280  if (UseTLAB && VerifyOops) {
5281    Label next, ok;
5282    Register t1 = rsi;
5283    Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);
5284
5285    push(t1);
5286    NOT_LP64(push(thread_reg));
5287    NOT_LP64(get_thread(thread_reg));
5288
5289    movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
5290    cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
5291    jcc(Assembler::aboveEqual, next);
5292    STOP("assert(top >= start)");
5293    should_not_reach_here();
5294
5295    bind(next);
5296    movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
5297    cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
5298    jcc(Assembler::aboveEqual, ok);
5299    STOP("assert(top <= end)");
5300    should_not_reach_here();
5301
5302    bind(ok);
5303    NOT_LP64(pop(thread_reg));
5304    pop(t1);
5305  }
5306#endif
5307}
5308
// Bit-field view of an FPU control word stored in the low 16 bits of
// _value: two 2-bit control fields plus six single-bit mask flags.
class ControlWord {
 public:
  int32_t _value;

  // 2-bit fields.
  int  rounding_control() const        { return field(10, 3); }
  int  precision_control() const       { return field( 8, 3); }
  // Single-bit flags.
  bool precision() const               { return flag(5); }
  bool underflow() const               { return flag(4); }
  bool overflow() const                { return flag(3); }
  bool zero_divide() const             { return flag(2); }
  bool denormalized() const            { return flag(1); }
  bool invalid() const                 { return flag(0); }

  // Prints the raw word followed by the flags (upper case = set,
  // lower case = clear), the rounding mode and the precision mode.
  void print() const {
    // Both selectors are masked to 0..3, so the table lookups are in range.
    const char* const rounding_names[4] = {
      "round near", "round down", "round up  ", "chop      "
    };
    const char* const precision_names[4] = {
      "24 bits ", "reserved", "53 bits ", "64 bits "
    };
    const char* rc = rounding_names[rounding_control()];
    const char* pc = precision_names[precision_control()];

    // flags
    const char f[9] = {
      ' ',
      ' ',
      precision   () ? 'P' : 'p',
      underflow   () ? 'U' : 'u',
      overflow    () ? 'O' : 'o',
      zero_divide () ? 'Z' : 'z',
      denormalized() ? 'D' : 'd',
      invalid     () ? 'I' : 'i',
      '\x0'
    };

    // output
    printf("%04x  masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
  }

 private:
  // Extracts (_value >> shift) & mask.
  int  field(int shift, int mask) const { return (_value >> shift) & mask; }
  // True if the given bit of _value is set.
  bool flag(int bit) const              { return field(bit, 1) != 0; }
};
5355
// Bit-field view of an FPU status word stored in the low 16 bits of
// _value: condition codes C0..C3, the 3-bit top-of-stack index and the
// individual status flags.
class StatusWord {
 public:
  int32_t _value;

  bool busy() const                    { return flag(15); }
  bool C3() const                      { return flag(14); }
  bool C2() const                      { return flag(10); }
  bool C1() const                      { return flag( 9); }
  bool C0() const                      { return flag( 8); }
  int  top() const                     { return (_value >> 11) & 7; }
  bool error_status() const            { return flag( 7); }
  bool stack_fault() const             { return flag( 6); }
  bool precision() const               { return flag( 5); }
  bool underflow() const               { return flag( 4); }
  bool overflow() const                { return flag( 3); }
  bool zero_divide() const             { return flag( 2); }
  bool denormalized() const            { return flag( 1); }
  bool invalid() const                 { return flag( 0); }

  // Prints the raw word, the flags (letter = set, '-' = clear), the
  // condition codes (digit = set, '-' = clear) and the top-of-stack index.
  void print() const {
    // condition codes
    const char c[5] = {
      C3() ? '3' : '-',
      C2() ? '2' : '-',
      C1() ? '1' : '-',
      C0() ? '0' : '-',
      '\x0'
    };
    // flags
    const char f[9] = {
      error_status() ? 'E' : '-',
      stack_fault () ? 'S' : '-',
      precision   () ? 'P' : '-',
      underflow   () ? 'U' : '-',
      overflow    () ? 'O' : '-',
      zero_divide () ? 'Z' : '-',
      denormalized() ? 'D' : '-',
      invalid     () ? 'I' : '-',
      '\x0'
    };
    // output
    printf("%04x  flags = %s, cc =  %s, top = %d", _value & 0xFFFF, f, c, top());
  }

 private:
  // True if the given bit of _value is set.
  bool flag(int bit) const             { return ((_value >> bit) & 1) != 0; }
};
5399
// FPU tag word: a sequence of 2-bit tags packed into _value, tag i
// occupying bits 2*i and 2*i+1.
class TagWord {
 public:
  int32_t _value;

  // Returns the 2-bit tag number i.
  int tag_at(int i) const {
    const int shift = i * 2;
    return (_value >> shift) & 3;
  }

  // Prints the low 16 bits of the word as four hex digits.
  void print() const {
    printf("%04x", _value & 0xFFFF);
  }

};
5411
// One FPU data register as three raw fields: two 32-bit words (_m0 is
// printed last, _m1 first) and a 16-bit word _ex whose sign is printed
// as the sign of the value.
class FPU_Register {
 public:
  int32_t _m0;
  int32_t _m1;
  int16_t _ex;

  // True for the bit pattern _ex == -1, _m1 == 0xC0000000, _m0 == 0.
  bool is_indefinite() const           {
    const bool low_word_zero   = (_m0 == 0);
    const bool high_word_match = (_m1 == (int32_t)0xC0000000);
    const bool ex_all_ones     = (_ex == -1);
    return ex_all_ones && high_word_match && low_word_zero;
  }

  // Prints sign, _ex and the two words in hex, followed by "NaN" when
  // _ex is 0x7FFF or -1 (three blanks otherwise).
  void print() const {
    const bool negative  = (_ex < 0);
    const bool nan_like  = (_ex == 0x7FFF) || (_ex == (int16_t)-1);
    printf("%c%04hx.%08x%08x  %s",
           negative ? '-' : '+', _ex, _m1, _m0,
           nan_like ? "NaN" : "   ");
  }

};
5429
5430class FPU_State {
5431 public:
5432  enum {
5433    register_size       = 10,
5434    number_of_registers =  8,
5435    register_mask       =  7
5436  };
5437
5438  ControlWord  _control_word;
5439  StatusWord   _status_word;
5440  TagWord      _tag_word;
5441  int32_t      _error_offset;
5442  int32_t      _error_selector;
5443  int32_t      _data_offset;
5444  int32_t      _data_selector;
5445  int8_t       _register[register_size * number_of_registers];
5446
5447  int tag_for_st(int i) const          { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
5448  FPU_Register* st(int i) const        { return (FPU_Register*)&_register[register_size * i]; }
5449
5450  const char* tag_as_string(int tag) const {
5451    switch (tag) {
5452      case 0: return "valid";
5453      case 1: return "zero";
5454      case 2: return "special";
5455      case 3: return "empty";
5456    }
5457    ShouldNotReachHere();
5458    return NULL;
5459  }
5460
5461  void print() const {
5462    // print computation registers
5463    { int t = _status_word.top();
5464      for (int i = 0; i < number_of_registers; i++) {
5465        int j = (i - t) & register_mask;
5466        printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
5467        st(j)->print();
5468        printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
5469      }
5470    }
5471    printf("\n");
5472    // print control registers
5473    printf("ctrl = "); _control_word.print(); printf("\n");
5474    printf("stat = "); _status_word .print(); printf("\n");
5475    printf("tags = "); _tag_word    .print(); printf("\n");
5476  }
5477
5478};
5479
// Bit-field view of a saved flags register value.
class Flag_Register {
 public:
  int32_t _value;

  bool overflow() const                { return flag(11); }
  bool direction() const               { return flag(10); }
  bool sign() const                    { return flag( 7); }
  bool zero() const                    { return flag( 6); }
  bool auxiliary_carry() const         { return flag( 4); }
  bool parity() const                  { return flag( 2); }
  bool carry() const                   { return flag( 0); }

  // Prints the raw value in hex followed by one character per flag
  // (letter = set, '-' = clear).
  void print() const {
    // flags
    const char f[8] = {
      overflow       () ? 'O' : '-',
      direction      () ? 'D' : '-',
      sign           () ? 'S' : '-',
      zero           () ? 'Z' : '-',
      auxiliary_carry() ? 'A' : '-',
      parity         () ? 'P' : '-',
      carry          () ? 'C' : '-',
      '\x0'
    };
    // output
    printf("%08x  flags = %s", _value, f);
  }

 private:
  // True if the given bit of _value is set.
  bool flag(int bit) const             { return ((_value >> bit) & 1) != 0; }
};
5508
// A saved 32-bit integer register value.
class IU_Register {
 public:
  int32_t _value;

  // Prints the value as 8 hex digits, then as a signed decimal
  // right-aligned in 11 columns.
  void print() const {
    const int32_t v = _value;
    printf("%08x  %11d", v, v);
  }

};
5518
5519class IU_State {
5520 public:
5521  Flag_Register _eflags;
5522  IU_Register   _rdi;
5523  IU_Register   _rsi;
5524  IU_Register   _rbp;
5525  IU_Register   _rsp;
5526  IU_Register   _rbx;
5527  IU_Register   _rdx;
5528  IU_Register   _rcx;
5529  IU_Register   _rax;
5530
5531  void print() const {
5532    // computation registers
5533    printf("rax,  = "); _rax.print(); printf("\n");
5534    printf("rbx,  = "); _rbx.print(); printf("\n");
5535    printf("rcx  = "); _rcx.print(); printf("\n");
5536    printf("rdx  = "); _rdx.print(); printf("\n");
5537    printf("rdi  = "); _rdi.print(); printf("\n");
5538    printf("rsi  = "); _rsi.print(); printf("\n");
5539    printf("rbp,  = "); _rbp.print(); printf("\n");
5540    printf("rsp  = "); _rsp.print(); printf("\n");
5541    printf("\n");
5542    // control registers
5543    printf("flgs = "); _eflags.print(); printf("\n");
5544  }
5545};
5546
5547
5548class CPU_State {
5549 public:
5550  FPU_State _fpu_state;
5551  IU_State  _iu_state;
5552
5553  void print() const {
5554    printf("--------------------------------------------------\n");
5555    _iu_state .print();
5556    printf("\n");
5557    _fpu_state.print();
5558    printf("--------------------------------------------------\n");
5559  }
5560
5561};
5562
5563
5564static void _print_CPU_state(CPU_State* state) {
5565  state->print();
5566};
5567
5568
5569void MacroAssembler::print_CPU_state() {
5570  push_CPU_state();
5571  push(rsp);                // pass CPU state
5572  call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
5573  addptr(rsp, wordSize);       // discard argument
5574  pop_CPU_state();
5575}
5576
5577
5578static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
5579  static int counter = 0;
5580  FPU_State* fs = &state->_fpu_state;
5581  counter++;
5582  // For leaf calls, only verify that the top few elements remain empty.
5583  // We only need 1 empty at the top for C2 code.
5584  if( stack_depth < 0 ) {
5585    if( fs->tag_for_st(7) != 3 ) {
5586      printf("FPR7 not empty\n");
5587      state->print();
5588      assert(false, "error");
5589      return false;
5590    }
5591    return true;                // All other stack states do not matter
5592  }
5593
5594  assert((fs->_control_word._value & 0xffff) == StubRoutines::_fpu_cntrl_wrd_std,
5595         "bad FPU control word");
5596
5597  // compute stack depth
5598  int i = 0;
5599  while (i < FPU_State::number_of_registers && fs->tag_for_st(i)  < 3) i++;
5600  int d = i;
5601  while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
5602  // verify findings
5603  if (i != FPU_State::number_of_registers) {
5604    // stack not contiguous
5605    printf("%s: stack not contiguous at ST%d\n", s, i);
5606    state->print();
5607    assert(false, "error");
5608    return false;
5609  }
5610  // check if computed stack depth corresponds to expected stack depth
5611  if (stack_depth < 0) {
5612    // expected stack depth is -stack_depth or less
5613    if (d > -stack_depth) {
5614      // too many elements on the stack
5615      printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
5616      state->print();
5617      assert(false, "error");
5618      return false;
5619    }
5620  } else {
5621    // expected stack depth is stack_depth
5622    if (d != stack_depth) {
5623      // wrong stack depth
5624      printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
5625      state->print();
5626      assert(false, "error");
5627      return false;
5628    }
5629  }
5630  // everything is cool
5631  return true;
5632}
5633
5634
5635void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
5636  if (!VerifyFPU) return;
5637  push_CPU_state();
5638  push(rsp);                // pass CPU state
5639  ExternalAddress msg((address) s);
5640  // pass message string s
5641  pushptr(msg.addr());
5642  push(stack_depth);        // pass stack depth
5643  call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
5644  addptr(rsp, 3 * wordSize);   // discard arguments
5645  // check for error
5646  { Label L;
5647    testl(rax, rax);
5648    jcc(Assembler::notZero, L);
5649    int3();                  // break if error condition
5650    bind(L);
5651  }
5652  pop_CPU_state();
5653}
5654
5655void MacroAssembler::restore_cpu_control_state_after_jni() {
5656  // Either restore the MXCSR register after returning from the JNI Call
5657  // or verify that it wasn't changed (with -Xcheck:jni flag).
5658  if (VM_Version::supports_sse()) {
5659    if (RestoreMXCSROnJNICalls) {
5660      ldmxcsr(ExternalAddress(StubRoutines::addr_mxcsr_std()));
5661    } else if (CheckJNICalls) {
5662      call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
5663    }
5664  }
5665  if (VM_Version::supports_avx()) {
5666    // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
5667    vzeroupper();
5668  }
5669
5670#ifndef _LP64
5671  // Either restore the x87 floating pointer control word after returning
5672  // from the JNI call or verify that it wasn't changed.
5673  if (CheckJNICalls) {
5674    call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
5675  }
5676#endif // _LP64
5677}
5678
5679
5680void MacroAssembler::load_klass(Register dst, Register src) {
5681#ifdef _LP64
5682  if (UseCompressedClassPointers) {
5683    movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
5684    decode_klass_not_null(dst);
5685  } else
5686#endif
5687    movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
5688}
5689
5690void MacroAssembler::load_prototype_header(Register dst, Register src) {
5691  load_klass(dst, src);
5692  movptr(dst, Address(dst, Klass::prototype_header_offset()));
5693}
5694
5695void MacroAssembler::store_klass(Register dst, Register src) {
5696#ifdef _LP64
5697  if (UseCompressedClassPointers) {
5698    encode_klass_not_null(src);
5699    movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
5700  } else
5701#endif
5702    movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
5703}
5704
5705void MacroAssembler::load_heap_oop(Register dst, Address src) {
5706#ifdef _LP64
5707  // FIXME: Must change all places where we try to load the klass.
5708  if (UseCompressedOops) {
5709    movl(dst, src);
5710    decode_heap_oop(dst);
5711  } else
5712#endif
5713    movptr(dst, src);
5714}
5715
5716// Doesn't do verfication, generates fixed size code
5717void MacroAssembler::load_heap_oop_not_null(Register dst, Address src) {
5718#ifdef _LP64
5719  if (UseCompressedOops) {
5720    movl(dst, src);
5721    decode_heap_oop_not_null(dst);
5722  } else
5723#endif
5724    movptr(dst, src);
5725}
5726
5727void MacroAssembler::store_heap_oop(Address dst, Register src) {
5728#ifdef _LP64
5729  if (UseCompressedOops) {
5730    assert(!dst.uses(src), "not enough registers");
5731    encode_heap_oop(src);
5732    movl(dst, src);
5733  } else
5734#endif
5735    movptr(dst, src);
5736}
5737
5738void MacroAssembler::cmp_heap_oop(Register src1, Address src2, Register tmp) {
5739  assert_different_registers(src1, tmp);
5740#ifdef _LP64
5741  if (UseCompressedOops) {
5742    bool did_push = false;
5743    if (tmp == noreg) {
5744      tmp = rax;
5745      push(tmp);
5746      did_push = true;
5747      assert(!src2.uses(rsp), "can't push");
5748    }
5749    load_heap_oop(tmp, src2);
5750    cmpptr(src1, tmp);
5751    if (did_push)  pop(tmp);
5752  } else
5753#endif
5754    cmpptr(src1, src2);
5755}
5756
5757// Used for storing NULLs.
5758void MacroAssembler::store_heap_oop_null(Address dst) {
5759#ifdef _LP64
5760  if (UseCompressedOops) {
5761    movl(dst, (int32_t)NULL_WORD);
5762  } else {
5763    movslq(dst, (int32_t)NULL_WORD);
5764  }
5765#else
5766  movl(dst, (int32_t)NULL_WORD);
5767#endif
5768}
5769
5770#ifdef _LP64
5771void MacroAssembler::store_klass_gap(Register dst, Register src) {
5772  if (UseCompressedClassPointers) {
5773    // Store to klass gap in destination
5774    movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
5775  }
5776}
5777
5778#ifdef ASSERT
5779void MacroAssembler::verify_heapbase(const char* msg) {
5780  assert (UseCompressedOops, "should be compressed");
5781  assert (Universe::heap() != NULL, "java heap should be initialized");
5782  if (CheckCompressedOops) {
5783    Label ok;
5784    push(rscratch1); // cmpptr trashes rscratch1
5785    cmpptr(r12_heapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
5786    jcc(Assembler::equal, ok);
5787    STOP(msg);
5788    bind(ok);
5789    pop(rscratch1);
5790  }
5791}
5792#endif
5793
5794// Algorithm must match oop.inline.hpp encode_heap_oop.
5795void MacroAssembler::encode_heap_oop(Register r) {
5796#ifdef ASSERT
5797  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
5798#endif
5799  verify_oop(r, "broken oop in encode_heap_oop");
5800  if (Universe::narrow_oop_base() == NULL) {
5801    if (Universe::narrow_oop_shift() != 0) {
5802      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
5803      shrq(r, LogMinObjAlignmentInBytes);
5804    }
5805    return;
5806  }
5807  testq(r, r);
5808  cmovq(Assembler::equal, r, r12_heapbase);
5809  subq(r, r12_heapbase);
5810  shrq(r, LogMinObjAlignmentInBytes);
5811}
5812
5813void MacroAssembler::encode_heap_oop_not_null(Register r) {
5814#ifdef ASSERT
5815  verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
5816  if (CheckCompressedOops) {
5817    Label ok;
5818    testq(r, r);
5819    jcc(Assembler::notEqual, ok);
5820    STOP("null oop passed to encode_heap_oop_not_null");
5821    bind(ok);
5822  }
5823#endif
5824  verify_oop(r, "broken oop in encode_heap_oop_not_null");
5825  if (Universe::narrow_oop_base() != NULL) {
5826    subq(r, r12_heapbase);
5827  }
5828  if (Universe::narrow_oop_shift() != 0) {
5829    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
5830    shrq(r, LogMinObjAlignmentInBytes);
5831  }
5832}
5833
5834void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
5835#ifdef ASSERT
5836  verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
5837  if (CheckCompressedOops) {
5838    Label ok;
5839    testq(src, src);
5840    jcc(Assembler::notEqual, ok);
5841    STOP("null oop passed to encode_heap_oop_not_null2");
5842    bind(ok);
5843  }
5844#endif
5845  verify_oop(src, "broken oop in encode_heap_oop_not_null2");
5846  if (dst != src) {
5847    movq(dst, src);
5848  }
5849  if (Universe::narrow_oop_base() != NULL) {
5850    subq(dst, r12_heapbase);
5851  }
5852  if (Universe::narrow_oop_shift() != 0) {
5853    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
5854    shrq(dst, LogMinObjAlignmentInBytes);
5855  }
5856}
5857
5858void  MacroAssembler::decode_heap_oop(Register r) {
5859#ifdef ASSERT
5860  verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
5861#endif
5862  if (Universe::narrow_oop_base() == NULL) {
5863    if (Universe::narrow_oop_shift() != 0) {
5864      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
5865      shlq(r, LogMinObjAlignmentInBytes);
5866    }
5867  } else {
5868    Label done;
5869    shlq(r, LogMinObjAlignmentInBytes);
5870    jccb(Assembler::equal, done);
5871    addq(r, r12_heapbase);
5872    bind(done);
5873  }
5874  verify_oop(r, "broken oop in decode_heap_oop");
5875}
5876
5877void  MacroAssembler::decode_heap_oop_not_null(Register r) {
5878  // Note: it will change flags
5879  assert (UseCompressedOops, "should only be used for compressed headers");
5880  assert (Universe::heap() != NULL, "java heap should be initialized");
5881  // Cannot assert, unverified entry point counts instructions (see .ad file)
5882  // vtableStubs also counts instructions in pd_code_size_limit.
5883  // Also do not verify_oop as this is called by verify_oop.
5884  if (Universe::narrow_oop_shift() != 0) {
5885    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
5886    shlq(r, LogMinObjAlignmentInBytes);
5887    if (Universe::narrow_oop_base() != NULL) {
5888      addq(r, r12_heapbase);
5889    }
5890  } else {
5891    assert (Universe::narrow_oop_base() == NULL, "sanity");
5892  }
5893}
5894
5895void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
5896  // Note: it will change flags
5897  assert (UseCompressedOops, "should only be used for compressed headers");
5898  assert (Universe::heap() != NULL, "java heap should be initialized");
5899  // Cannot assert, unverified entry point counts instructions (see .ad file)
5900  // vtableStubs also counts instructions in pd_code_size_limit.
5901  // Also do not verify_oop as this is called by verify_oop.
5902  if (Universe::narrow_oop_shift() != 0) {
5903    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
5904    if (LogMinObjAlignmentInBytes == Address::times_8) {
5905      leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
5906    } else {
5907      if (dst != src) {
5908        movq(dst, src);
5909      }
5910      shlq(dst, LogMinObjAlignmentInBytes);
5911      if (Universe::narrow_oop_base() != NULL) {
5912        addq(dst, r12_heapbase);
5913      }
5914    }
5915  } else {
5916    assert (Universe::narrow_oop_base() == NULL, "sanity");
5917    if (dst != src) {
5918      movq(dst, src);
5919    }
5920  }
5921}
5922
5923void MacroAssembler::encode_klass_not_null(Register r) {
5924  if (Universe::narrow_klass_base() != NULL) {
5925    // Use r12 as a scratch register in which to temporarily load the narrow_klass_base.
5926    assert(r != r12_heapbase, "Encoding a klass in r12");
5927    mov64(r12_heapbase, (int64_t)Universe::narrow_klass_base());
5928    subq(r, r12_heapbase);
5929  }
5930  if (Universe::narrow_klass_shift() != 0) {
5931    assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
5932    shrq(r, LogKlassAlignmentInBytes);
5933  }
5934  if (Universe::narrow_klass_base() != NULL) {
5935    reinit_heapbase();
5936  }
5937}
5938
5939void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
5940  if (dst == src) {
5941    encode_klass_not_null(src);
5942  } else {
5943    if (Universe::narrow_klass_base() != NULL) {
5944      mov64(dst, (int64_t)Universe::narrow_klass_base());
5945      negq(dst);
5946      addq(dst, src);
5947    } else {
5948      movptr(dst, src);
5949    }
5950    if (Universe::narrow_klass_shift() != 0) {
5951      assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
5952      shrq(dst, LogKlassAlignmentInBytes);
5953    }
5954  }
5955}
5956
5957// Function instr_size_for_decode_klass_not_null() counts the instructions
5958// generated by decode_klass_not_null(register r) and reinit_heapbase(),
5959// when (Universe::heap() != NULL).  Hence, if the instructions they
5960// generate change, then this method needs to be updated.
5961int MacroAssembler::instr_size_for_decode_klass_not_null() {
5962  assert (UseCompressedClassPointers, "only for compressed klass ptrs");
5963  if (Universe::narrow_klass_base() != NULL) {
5964    // mov64 + addq + shlq? + mov64  (for reinit_heapbase()).
5965    return (Universe::narrow_klass_shift() == 0 ? 20 : 24);
5966  } else {
5967    // longest load decode klass function, mov64, leaq
5968    return 16;
5969  }
5970}
5971
5972// !!! If the instructions that get generated here change then function
5973// instr_size_for_decode_klass_not_null() needs to get updated.
5974void  MacroAssembler::decode_klass_not_null(Register r) {
5975  // Note: it will change flags
5976  assert (UseCompressedClassPointers, "should only be used for compressed headers");
5977  assert(r != r12_heapbase, "Decoding a klass in r12");
5978  // Cannot assert, unverified entry point counts instructions (see .ad file)
5979  // vtableStubs also counts instructions in pd_code_size_limit.
5980  // Also do not verify_oop as this is called by verify_oop.
5981  if (Universe::narrow_klass_shift() != 0) {
5982    assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
5983    shlq(r, LogKlassAlignmentInBytes);
5984  }
5985  // Use r12 as a scratch register in which to temporarily load the narrow_klass_base.
5986  if (Universe::narrow_klass_base() != NULL) {
5987    mov64(r12_heapbase, (int64_t)Universe::narrow_klass_base());
5988    addq(r, r12_heapbase);
5989    reinit_heapbase();
5990  }
5991}
5992
5993void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
5994  // Note: it will change flags
5995  assert (UseCompressedClassPointers, "should only be used for compressed headers");
5996  if (dst == src) {
5997    decode_klass_not_null(dst);
5998  } else {
5999    // Cannot assert, unverified entry point counts instructions (see .ad file)
6000    // vtableStubs also counts instructions in pd_code_size_limit.
6001    // Also do not verify_oop as this is called by verify_oop.
6002    mov64(dst, (int64_t)Universe::narrow_klass_base());
6003    if (Universe::narrow_klass_shift() != 0) {
6004      assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
6005      assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?");
6006      leaq(dst, Address(dst, src, Address::times_8, 0));
6007    } else {
6008      addq(dst, src);
6009    }
6010  }
6011}
6012
6013void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
6014  assert (UseCompressedOops, "should only be used for compressed headers");
6015  assert (Universe::heap() != NULL, "java heap should be initialized");
6016  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
6017  int oop_index = oop_recorder()->find_index(obj);
6018  RelocationHolder rspec = oop_Relocation::spec(oop_index);
6019  mov_narrow_oop(dst, oop_index, rspec);
6020}
6021
6022void  MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
6023  assert (UseCompressedOops, "should only be used for compressed headers");
6024  assert (Universe::heap() != NULL, "java heap should be initialized");
6025  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
6026  int oop_index = oop_recorder()->find_index(obj);
6027  RelocationHolder rspec = oop_Relocation::spec(oop_index);
6028  mov_narrow_oop(dst, oop_index, rspec);
6029}
6030
6031void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
6032  assert (UseCompressedClassPointers, "should only be used for compressed headers");
6033  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
6034  int klass_index = oop_recorder()->find_index(k);
6035  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
6036  mov_narrow_oop(dst, Klass::encode_klass(k), rspec);
6037}
6038
6039void  MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
6040  assert (UseCompressedClassPointers, "should only be used for compressed headers");
6041  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
6042  int klass_index = oop_recorder()->find_index(k);
6043  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
6044  mov_narrow_oop(dst, Klass::encode_klass(k), rspec);
6045}
6046
6047void  MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
6048  assert (UseCompressedOops, "should only be used for compressed headers");
6049  assert (Universe::heap() != NULL, "java heap should be initialized");
6050  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
6051  int oop_index = oop_recorder()->find_index(obj);
6052  RelocationHolder rspec = oop_Relocation::spec(oop_index);
6053  Assembler::cmp_narrow_oop(dst, oop_index, rspec);
6054}
6055
6056void  MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
6057  assert (UseCompressedOops, "should only be used for compressed headers");
6058  assert (Universe::heap() != NULL, "java heap should be initialized");
6059  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
6060  int oop_index = oop_recorder()->find_index(obj);
6061  RelocationHolder rspec = oop_Relocation::spec(oop_index);
6062  Assembler::cmp_narrow_oop(dst, oop_index, rspec);
6063}
6064
6065void  MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
6066  assert (UseCompressedClassPointers, "should only be used for compressed headers");
6067  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
6068  int klass_index = oop_recorder()->find_index(k);
6069  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
6070  Assembler::cmp_narrow_oop(dst, Klass::encode_klass(k), rspec);
6071}
6072
6073void  MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
6074  assert (UseCompressedClassPointers, "should only be used for compressed headers");
6075  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
6076  int klass_index = oop_recorder()->find_index(k);
6077  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
6078  Assembler::cmp_narrow_oop(dst, Klass::encode_klass(k), rspec);
6079}
6080
6081void MacroAssembler::reinit_heapbase() {
6082  if (UseCompressedOops || UseCompressedClassPointers) {
6083    if (Universe::heap() != NULL) {
6084      if (Universe::narrow_oop_base() == NULL) {
6085        MacroAssembler::xorptr(r12_heapbase, r12_heapbase);
6086      } else {
6087        mov64(r12_heapbase, (int64_t)Universe::narrow_ptrs_base());
6088      }
6089    } else {
6090      movptr(r12_heapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
6091    }
6092  }
6093}
6094
6095#endif // _LP64
6096
6097
6098// C2 compiled method's prolog code.
    // Emits the frame set-up sequence: an optional stack bang, the save of rbp,
    // the frame allocation and, under VerifyStackAtCalls, a stack-depth cookie.
    //   framesize       - frame size in bytes on entry, still including the
    //                     return address word; must be a multiple of
    //                     StackAlignmentInBytes.
    //   stack_bang_size - if still positive after the return address word is
    //                     removed, a stack overflow check is generated for it;
    //                     otherwise no bang is emitted.
    //   fp_mode_24b     - 32-bit VM only: load the 24-bit FPU control word.
6099void MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b) {
6100
6101  // WARNING: Initial instruction MUST be 5 bytes or longer so that
6102  // NativeJump::patch_verified_entry will be able to patch out the entry
6103  // code safely. The push to verify stack depth is ok at 5 bytes,
6104  // the frame allocation can be either 3 or 6 bytes. So if we don't do
6105  // stack bang then we must use the 6 byte frame allocation even if
6106  // we have no frame. :-(
6107  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
6108
6109  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
6110  // Remove word for return addr
6111  framesize -= wordSize;
6112  stack_bang_size -= wordSize;
6113
6114  // Calls to C2R adapters often do not accept exceptional returns.
6115  // We require that their callers must bang for them.  But be careful, because
6116  // some VM calls (such as call site linkage) can use several kilobytes of
6117  // stack.  But the stack safety zone should account for that.
6118  // See bugs 4446381, 4468289, 4497237.
6119  if (stack_bang_size > 0) {
6120    generate_stack_overflow_check(stack_bang_size);
6121
6122    // We always push rbp, so that on return to interpreter rbp, will be
6123    // restored correctly and we can correct the stack.
6124    push(rbp);
6125    // Remove word for ebp
6126    framesize -= wordSize;
6127
6128    // Create frame
6129    if (framesize) {
6130      subptr(rsp, framesize);
6131    }
6132  } else {
6133    // Create frame (force generation of a 4 byte immediate value)
6134    subptr_imm32(rsp, framesize);
6135
6136    // Save RBP register now.
6137    // It is stored in the topmost word of the frame just allocated.
6138    framesize -= wordSize;
6139    movptr(Address(rsp, framesize), rbp);
6140  }
6141
6142  // From here on framesize is the rsp-relative offset of the slot below the
    // saved rbp.
6141  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
6142    framesize -= wordSize;
6143    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
6144  }
6145
6146#ifndef _LP64
6147  // If method sets FPU control word do it now
6148  if (fp_mode_24b) {
6149    fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
6150  }
6151  if (UseSSE >= 2 && VerifyFPU) {
6152    verify_FPU(0, "FPU stack must be clean on entry");
6153  }
6154#endif
6155
6156#ifdef ASSERT
6157  // Debug builds: emit a run-time check that, after the prolog, rsp modulo
    // StackAlignmentInBytes equals StackAlignmentInBytes-wordSize (measured
    // with rax temporarily pushed); stop otherwise.
6157  if (VerifyStackAtCalls) {
6158    Label L;
6159    push(rax);
6160    mov(rax, rsp);
6161    andptr(rax, StackAlignmentInBytes-1);
6162    cmpptr(rax, StackAlignmentInBytes-wordSize);
6163    pop(rax);
6164    jcc(Assembler::equal, L);
6165    STOP("Stack is not properly aligned!");
6166    bind(L);
6167  }
6168#endif
6169
6170}
6171
6172void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp) {
6173  // cnt - number of qwords (8-byte words).
6174  // base - start address, qword aligned.
6175  assert(base==rdi, "base register must be edi for rep stos");
6176  assert(tmp==rax,   "tmp register must be eax for rep stos");
6177  assert(cnt==rcx,   "cnt register must be ecx for rep stos");
6178
6179  xorptr(tmp, tmp);
6180  if (UseFastStosb) {
6181    shlptr(cnt,3); // convert to number of bytes
6182    rep_stosb();
6183  } else {
6184    NOT_LP64(shlptr(cnt,1);) // convert to number of dwords for 32-bit VM
6185    rep_stos();
6186  }
6187}
6188
6189// IndexOf for constant substrings with size >= 8 chars
6190// which don't need to be loaded through stack.
    //
    // Emits code that searches the string at str1 for the substring at str2
    // using pcmpestri on 8-element (16-byte) vectors of unsigned shorts.
    //   str1     - address of the scanned string
    //   str2     - address of the substring
    //   cnt1     - string length in elements; must be rdx
    //   cnt2     - receives the substring length; must be rax
    //   int_cnt2 - compile-time substring length, >= 8
    //   result   - on exit the element index of the match, or -1 if none
    //   vec      - XMM register holding the current substring vector
    //   tmp      - must be rcx; pcmpestri leaves the matched index here
6191void MacroAssembler::string_indexofC8(Register str1, Register str2,
6192                                      Register cnt1, Register cnt2,
6193                                      int int_cnt2,  Register result,
6194                                      XMMRegister vec, Register tmp) {
6195  ShortBranchVerifier sbv(this);
6196  assert(UseSSE42Intrinsics, "SSE4.2 is required");
6197
6198  // This method uses pcmpestri instruction with bound registers
6199  //   inputs:
6200  //     xmm - substring
6201  //     rax - substring length (elements count)
6202  //     mem - scanned string
6203  //     rdx - string length (elements count)
6204  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
6205  //   outputs:
6206  //     rcx - matched index in string
6207  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
6208
6209  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
6210        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
6211        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
6212
6213  // Note, inline_string_indexOf() generates checks:
6214  // if (substr.count > string.count) return -1;
6215  // if (substr.count == 0) return 0;
6216  assert(int_cnt2 >= 8, "this code isused only for cnt2 >= 8 chars");
6217
6218  // Load substring.
6219  movdqu(vec, Address(str2, 0));
6220  movl(cnt2, int_cnt2);
6221  movptr(result, str1); // string addr
6222
6223  if (int_cnt2 > 8) {
6224    jmpb(SCAN_TO_SUBSTR);
6225
6226    // Reload substr for rescan, this code
6227    // is executed only for large substrings (> 8 chars)
6228    bind(RELOAD_SUBSTR);
6229    movdqu(vec, Address(str2, 0));
6230    negptr(cnt2); // Jumped here with negative cnt2, convert to positive
6231
6232    bind(RELOAD_STR);
6233    // We came here after the beginning of the substring was
6234    // matched but the rest of it was not so we need to search
6235    // again. Start from the next element after the previous match.
6236
6237    // cnt2 is number of substring remaining elements and
6238    // cnt1 is number of string remaining elements when cmp failed.
6239    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
6240    subl(cnt1, cnt2);
6241    addl(cnt1, int_cnt2);
6242    movl(cnt2, int_cnt2); // Now restore cnt2
6243
6244    decrementl(cnt1);     // Shift to next element
6245    cmpl(cnt1, cnt2);
6246    jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
6247
6248    addptr(result, 2);
6249
6250  } // (int_cnt2 > 8)
6251
6252  // Scan string for start of substr in 16-byte vectors
6253  bind(SCAN_TO_SUBSTR);
6254  pcmpestri(vec, Address(result, 0), 0x0d);
6255  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
6256  subl(cnt1, 8);
6257  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
6258  cmpl(cnt1, cnt2);
6259  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
6260  addptr(result, 16);
6261  jmpb(SCAN_TO_SUBSTR);
6262
6263  // Found a potential substr
6264  bind(FOUND_CANDIDATE);
6265  // Matched whole vector if first element matched (tmp(rcx) == 0).
6266  if (int_cnt2 == 8) {
6267    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
6268  } else { // int_cnt2 > 8
6269    jccb(Assembler::overflow, FOUND_SUBSTR);
6270  }
6271  // After pcmpestri tmp(rcx) contains matched element index
6272  // Compute start addr of substr
6273  lea(result, Address(result, tmp, Address::times_2));
6274
6275  // Make sure string is still long enough
6276  subl(cnt1, tmp);
6277  cmpl(cnt1, cnt2);
6278  if (int_cnt2 == 8) {
6279    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
6280  } else { // int_cnt2 > 8
6281    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
6282  }
6283  // Left less than substring.
6284
6285  bind(RET_NOT_FOUND);
6286  movl(result, -1);
6287  jmpb(EXIT);
6288
6289  if (int_cnt2 > 8) {
6290    // This code is optimized for the case when whole substring
6291    // is matched if its head is matched.
6292    bind(MATCH_SUBSTR_HEAD);
6293    pcmpestri(vec, Address(result, 0), 0x0d);
6294    // Reload only string if does not match
6295    jccb(Assembler::noOverflow, RELOAD_STR); // OF == 0
6296
6297    Label CONT_SCAN_SUBSTR;
6298    // Compare the rest of substring (> 8 chars).
6299    bind(FOUND_SUBSTR);
6300    // First 8 chars are already matched.
6301    negptr(cnt2);
6302    addptr(cnt2, 8);
6303
6304    bind(SCAN_SUBSTR);
6305    subl(cnt1, 8);
6306    cmpl(cnt2, -8); // Do not read beyond substring
6307    jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
6308    // Back-up strings to avoid reading beyond substring:
6309    // cnt1 = cnt1 - cnt2 + 8
6310    addl(cnt1, cnt2); // cnt2 is negative
6311    addl(cnt1, 8);
6312    movl(cnt2, 8); negptr(cnt2);
6313    bind(CONT_SCAN_SUBSTR);
6314    if (int_cnt2 < (int)G) {
6315      movdqu(vec, Address(str2, cnt2, Address::times_2, int_cnt2*2));
6316      pcmpestri(vec, Address(result, cnt2, Address::times_2, int_cnt2*2), 0x0d);
6317    } else {
6318      // calculate index in register to avoid integer overflow (int_cnt2*2)
6319      movl(tmp, int_cnt2);
6320      addptr(tmp, cnt2);
6321      movdqu(vec, Address(str2, tmp, Address::times_2, 0));
6322      pcmpestri(vec, Address(result, tmp, Address::times_2, 0), 0x0d);
6323    }
6324    // Need to reload strings pointers if not matched whole vector
6325    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
6326    addptr(cnt2, 8);
6327    jcc(Assembler::negative, SCAN_SUBSTR);
6328    // Fall through if found full substring
6329
6330  } // (int_cnt2 > 8)
6331
6332  bind(RET_FOUND);
6333  // Found result if we matched full small substring.
6334  // Compute substr offset
6335  subptr(result, str1);
6336  shrl(result, 1); // index
6337  bind(EXIT);
6338
6339} // string_indexofC8
6340
6341// Small strings are loaded through stack if they cross page boundary.
    //
    // Emits code that searches the string at str1 for the substring at str2
    // using pcmpestri on 8-element (16-byte) vectors of unsigned shorts.
    //   str1     - address of the scanned string
    //   str2     - address of the substring
    //   cnt1     - string length in elements; must be rdx
    //   cnt2     - substring length in elements; must be rax
    //   int_cnt2 - length of a small (< 8 chars) constant substring, or -1 when
    //              the length is only known at run time (then it is in cnt2)
    //   result   - on exit the element index of the match, or -1 if none
    //   vec      - XMM register holding the current substring vector
    //   tmp      - must be rcx; first used to save the incoming rsp, later
    //              receives the matched index from pcmpestri
    // The incoming rsp is pushed once loading is finished and popped back into
    // rsp at CLEANUP, which releases everything this code put on the stack.
6342void MacroAssembler::string_indexof(Register str1, Register str2,
6343                                    Register cnt1, Register cnt2,
6344                                    int int_cnt2,  Register result,
6345                                    XMMRegister vec, Register tmp) {
6346  ShortBranchVerifier sbv(this);
6347  assert(UseSSE42Intrinsics, "SSE4.2 is required");
6348  //
6349  // int_cnt2 is length of small (< 8 chars) constant substring
6350  // or (-1) for non constant substring in which case its length
6351  // is in cnt2 register.
6352  //
6353  // Note, inline_string_indexOf() generates checks:
6354  // if (substr.count > string.count) return -1;
6355  // if (substr.count == 0) return 0;
6356  //
6357  assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < 8), "should be != 0");
6358
6359  // This method uses pcmpestri instruction with bound registers
6360  //   inputs:
6361  //     xmm - substring
6362  //     rax - substring length (elements count)
6363  //     mem - scanned string
6364  //     rdx - string length (elements count)
6365  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
6366  //   outputs:
6367  //     rcx - matched index in string
6368  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
6369
6370  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
6371        RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
6372        FOUND_CANDIDATE;
6373
6374  { //========================================================
6375    // We don't know where these strings are located
6376    // and we can't read beyond them. Load them through stack.
6377    Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
6378
6379    movptr(tmp, rsp); // save old SP
6380
6381    if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
6382      if (int_cnt2 == 1) {  // One char
6383        load_unsigned_short(result, Address(str2, 0));
6384        movdl(vec, result); // move 32 bits
6385      } else if (int_cnt2 == 2) { // Two chars
6386        movdl(vec, Address(str2, 0)); // move 32 bits
6387      } else if (int_cnt2 == 4) { // Four chars
6388        movq(vec, Address(str2, 0));  // move 64 bits
6389      } else { // cnt2 = { 3, 5, 6, 7 }
6390        // Array header size is 12 bytes in 32-bit VM
6391        // + 6 bytes for 3 chars == 18 bytes,
6392        // enough space to load vec and shift.
6393        assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
6394        movdqu(vec, Address(str2, (int_cnt2*2)-16));
6395        psrldq(vec, 16-(int_cnt2*2));
6396      }
6397    } else { // not constant substring
6398      cmpl(cnt2, 8);
6399      jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
6400
6401      // We can read beyond string if str+16 does not cross page boundary
6402      // since heaps are aligned and mapped by pages.
6403      assert(os::vm_page_size() < (int)G, "default page should be small");
6404      movl(result, str2); // We need only low 32 bits
6405      andl(result, (os::vm_page_size()-1));
6406      cmpl(result, (os::vm_page_size()-16));
6407      jccb(Assembler::belowEqual, CHECK_STR);
6408
6409      // Move small strings to stack to allow load 16 bytes into vec.
6410      subptr(rsp, 16);
6411      int stk_offset = wordSize-2;
6412      push(cnt2);
6413
6414      bind(COPY_SUBSTR);
6415      load_unsigned_short(result, Address(str2, cnt2, Address::times_2, -2));
6416      movw(Address(rsp, cnt2, Address::times_2, stk_offset), result);
6417      decrement(cnt2);
6418      jccb(Assembler::notZero, COPY_SUBSTR);
6419
6420      pop(cnt2);
6421      movptr(str2, rsp);  // New substring address
6422    } // non constant
6423
6424    bind(CHECK_STR);
6425    cmpl(cnt1, 8);
6426    jccb(Assembler::aboveEqual, BIG_STRINGS);
6427
6428    // Check cross page boundary.
6429    movl(result, str1); // We need only low 32 bits
6430    andl(result, (os::vm_page_size()-1));
6431    cmpl(result, (os::vm_page_size()-16));
6432    jccb(Assembler::belowEqual, BIG_STRINGS);
6433
6434    subptr(rsp, 16);
6435    int stk_offset = -2;
6436    if (int_cnt2 < 0) { // not constant
6437      push(cnt2);
6438      stk_offset += wordSize;
6439    }
6440    movl(cnt2, cnt1);
6441
6442    bind(COPY_STR);
6443    load_unsigned_short(result, Address(str1, cnt2, Address::times_2, -2));
6444    movw(Address(rsp, cnt2, Address::times_2, stk_offset), result);
6445    decrement(cnt2);
6446    jccb(Assembler::notZero, COPY_STR);
6447
6448    if (int_cnt2 < 0) { // not constant
6449      pop(cnt2);
6450    }
6451    movptr(str1, rsp);  // New string address
6452
6453    bind(BIG_STRINGS);
6454    // Load substring.
6455    if (int_cnt2 < 0) { // -1
6456      movdqu(vec, Address(str2, 0));
6457      push(cnt2);       // substr count
6458      push(str2);       // substr addr
6459      push(str1);       // string addr
6460    } else {
6461      // Small (< 8 chars) constant substrings are loaded already.
6462      movl(cnt2, int_cnt2);
6463    }
6464    push(tmp);  // original SP
6465
6466  } // Finished loading
6467
6468  //========================================================
6469  // Start search
6470  //
6471
6472  movptr(result, str1); // string addr
6473
6474  if (int_cnt2  < 0) {  // Only for non constant substring
6475    jmpb(SCAN_TO_SUBSTR);
6476
6477    // SP saved at sp+0
6478    // String saved at sp+1*wordSize
6479    // Substr saved at sp+2*wordSize
6480    // Substr count saved at sp+3*wordSize
6481
6482    // Reload substr for rescan, this code
6483    // is executed only for large substrings (> 8 chars)
6484    bind(RELOAD_SUBSTR);
6485    movptr(str2, Address(rsp, 2*wordSize));
6486    movl(cnt2, Address(rsp, 3*wordSize));
6487    movdqu(vec, Address(str2, 0));
6488    // We came here after the beginning of the substring was
6489    // matched but the rest of it was not so we need to search
6490    // again. Start from the next element after the previous match.
6491    subptr(str1, result); // Restore counter
6492    shrl(str1, 1);
6493    addl(cnt1, str1);
6494    decrementl(cnt1);   // Shift to next element
6495    cmpl(cnt1, cnt2);
6496    jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
6497
6498    addptr(result, 2);
6499  } // non constant
6500
6501  // Scan string for start of substr in 16-byte vectors
6502  bind(SCAN_TO_SUBSTR);
6503  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
6504  pcmpestri(vec, Address(result, 0), 0x0d);
6505  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
6506  subl(cnt1, 8);
6507  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
6508  cmpl(cnt1, cnt2);
6509  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
6510  addptr(result, 16);
6511
6512  bind(ADJUST_STR);
6513  cmpl(cnt1, 8); // Do not read beyond string
6514  jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
6515  // Back-up string to avoid reading beyond string.
6516  lea(result, Address(result, cnt1, Address::times_2, -16));
6517  movl(cnt1, 8);
6518  jmpb(SCAN_TO_SUBSTR);
6519
6520  // Found a potential substr
6521  bind(FOUND_CANDIDATE);
6522  // After pcmpestri tmp(rcx) contains matched element index
6523
6524  // Make sure string is still long enough
6525  subl(cnt1, tmp);
6526  cmpl(cnt1, cnt2);
6527  jccb(Assembler::greaterEqual, FOUND_SUBSTR);
6528  // Left less than substring.
6529
6530  bind(RET_NOT_FOUND);
6531  movl(result, -1);
6532  jmpb(CLEANUP);
6533
6534  bind(FOUND_SUBSTR);
6535  // Compute start addr of substr
6536  lea(result, Address(result, tmp, Address::times_2));
6537
6538  if (int_cnt2 > 0) { // Constant substring
6539    // Repeat search for small substring (< 8 chars)
6540    // from new point without reloading substring.
6541    // Have to check that we don't read beyond string.
6542    cmpl(tmp, 8-int_cnt2);
6543    jccb(Assembler::greater, ADJUST_STR);
6544    // Fall through if matched whole substring.
6545  } else { // non constant
6546    assert(int_cnt2 == -1, "should be != 0");
6547
6548    addl(tmp, cnt2);
6549    // Found result if we matched whole substring.
6550    cmpl(tmp, 8);
6551    jccb(Assembler::lessEqual, RET_FOUND);
6552
6553    // Repeat search for small substring (<= 8 chars)
6554    // from new point 'str1' without reloading substring.
6555    cmpl(cnt2, 8);
6556    // Have to check that we don't read beyond string.
6557    jccb(Assembler::lessEqual, ADJUST_STR);
6558
6559    Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
6560    // Compare the rest of substring (> 8 chars).
6561    movptr(str1, result);
6562
6563    cmpl(tmp, cnt2);
6564    // First 8 chars are already matched.
6565    jccb(Assembler::equal, CHECK_NEXT);
6566
6567    bind(SCAN_SUBSTR);
6568    pcmpestri(vec, Address(str1, 0), 0x0d);
6569    // Need to reload strings pointers if not matched whole vector
6570    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
6571
6572    bind(CHECK_NEXT);
6573    subl(cnt2, 8);
6574    jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
6575    addptr(str1, 16);
6576    addptr(str2, 16);
6577    subl(cnt1, 8);
6578    cmpl(cnt2, 8); // Do not read beyond substring
6579    jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
6580    // Back-up strings to avoid reading beyond substring.
6581    lea(str2, Address(str2, cnt2, Address::times_2, -16));
6582    lea(str1, Address(str1, cnt2, Address::times_2, -16));
6583    subl(cnt1, cnt2);
6584    movl(cnt2, 8);
6585    addl(cnt1, 8);
6586    bind(CONT_SCAN_SUBSTR);
6587    movdqu(vec, Address(str2, 0));
6588    jmpb(SCAN_SUBSTR);
6589
6590    bind(RET_FOUND_LONG);
6591    // Reload the string address saved at sp+1*wordSize for the offset below.
6591    movptr(str1, Address(rsp, wordSize));
6592  } // non constant
6593
6594  bind(RET_FOUND);
6595  // Compute substr offset
6596  subptr(result, str1);
6597  shrl(result, 1); // index
6598
6599  bind(CLEANUP);
6600  pop(rsp); // restore SP
6601
6602} // string_indexof
6603
6604// Compare strings.
6605void MacroAssembler::string_compare(Register str1, Register str2,
6606                                    Register cnt1, Register cnt2, Register result,
6607                                    XMMRegister vec1) {
6608  ShortBranchVerifier sbv(this);
6609  Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
6610
6611  // Compute the minimum of the string lengths and the
6612  // difference of the string lengths (stack).
6613  // Do the conditional move stuff
6614  movl(result, cnt1);
6615  subl(cnt1, cnt2);
6616  push(cnt1);
6617  cmov32(Assembler::lessEqual, cnt2, result);
6618
6619  // Is the minimum length zero?
6620  testl(cnt2, cnt2);
6621  jcc(Assembler::zero, LENGTH_DIFF_LABEL);
6622
6623  // Compare first characters
6624  load_unsigned_short(result, Address(str1, 0));
6625  load_unsigned_short(cnt1, Address(str2, 0));
6626  subl(result, cnt1);
6627  jcc(Assembler::notZero,  POP_LABEL);
6628  cmpl(cnt2, 1);
6629  jcc(Assembler::equal, LENGTH_DIFF_LABEL);
6630
6631  // Check if the strings start at the same location.
6632  cmpptr(str1, str2);
6633  jcc(Assembler::equal, LENGTH_DIFF_LABEL);
6634
6635  Address::ScaleFactor scale = Address::times_2;
6636  int stride = 8;
6637
6638  if (UseAVX >= 2 && UseSSE42Intrinsics) {
6639    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
6640    Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
6641    Label COMPARE_TAIL_LONG;
6642    int pcmpmask = 0x19;
6643
6644    // Setup to compare 16-chars (32-bytes) vectors,
6645    // start from first character again because it has aligned address.
6646    int stride2 = 16;
6647    int adr_stride  = stride  << scale;
6648    int adr_stride2 = stride2 << scale;
6649
6650    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
6651    // rax and rdx are used by pcmpestri as elements counters
6652    movl(result, cnt2);
6653    andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
6654    jcc(Assembler::zero, COMPARE_TAIL_LONG);
6655
6656    // fast path : compare first 2 8-char vectors.
6657    bind(COMPARE_16_CHARS);
6658    movdqu(vec1, Address(str1, 0));
6659    pcmpestri(vec1, Address(str2, 0), pcmpmask);
6660    jccb(Assembler::below, COMPARE_INDEX_CHAR);
6661
6662    movdqu(vec1, Address(str1, adr_stride));
6663    pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
6664    jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
6665    addl(cnt1, stride);
6666
6667    // Compare the characters at index in cnt1
6668    bind(COMPARE_INDEX_CHAR); //cnt1 has the offset of the mismatching character
6669    load_unsigned_short(result, Address(str1, cnt1, scale));
6670    load_unsigned_short(cnt2, Address(str2, cnt1, scale));
6671    subl(result, cnt2);
6672    jmp(POP_LABEL);
6673
6674    // Setup the registers to start vector comparison loop
6675    bind(COMPARE_WIDE_VECTORS);
6676    lea(str1, Address(str1, result, scale));
6677    lea(str2, Address(str2, result, scale));
6678    subl(result, stride2);
6679    subl(cnt2, stride2);
6680    jccb(Assembler::zero, COMPARE_WIDE_TAIL);
6681    negptr(result);
6682
6683    //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
6684    bind(COMPARE_WIDE_VECTORS_LOOP);
6685    vmovdqu(vec1, Address(str1, result, scale));
6686    vpxor(vec1, Address(str2, result, scale));
6687    vptest(vec1, vec1);
6688    jccb(Assembler::notZero, VECTOR_NOT_EQUAL);
6689    addptr(result, stride2);
6690    subl(cnt2, stride2);
6691    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
6692    // clean upper bits of YMM registers
6693    vzeroupper();
6694
6695    // compare wide vectors tail
6696    bind(COMPARE_WIDE_TAIL);
6697    testptr(result, result);
6698    jccb(Assembler::zero, LENGTH_DIFF_LABEL);
6699
6700    movl(result, stride2);
6701    movl(cnt2, result);
6702    negptr(result);
6703    jmpb(COMPARE_WIDE_VECTORS_LOOP);
6704
6705    // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
6706    bind(VECTOR_NOT_EQUAL);
6707    // clean upper bits of YMM registers
6708    vzeroupper();
6709    lea(str1, Address(str1, result, scale));
6710    lea(str2, Address(str2, result, scale));
6711    jmp(COMPARE_16_CHARS);
6712
6713    // Compare tail chars, length between 1 to 15 chars
6714    bind(COMPARE_TAIL_LONG);
6715    movl(cnt2, result);
6716    cmpl(cnt2, stride);
6717    jccb(Assembler::less, COMPARE_SMALL_STR);
6718
6719    movdqu(vec1, Address(str1, 0));
6720    pcmpestri(vec1, Address(str2, 0), pcmpmask);
6721    jcc(Assembler::below, COMPARE_INDEX_CHAR);
6722    subptr(cnt2, stride);
6723    jccb(Assembler::zero, LENGTH_DIFF_LABEL);
6724    lea(str1, Address(str1, result, scale));
6725    lea(str2, Address(str2, result, scale));
6726    negptr(cnt2);
6727    jmpb(WHILE_HEAD_LABEL);
6728
6729    bind(COMPARE_SMALL_STR);
6730  } else if (UseSSE42Intrinsics) {
6731    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
6732    int pcmpmask = 0x19;
6733    // Setup to compare 8-char (16-byte) vectors,
6734    // start from first character again because it has aligned address.
6735    movl(result, cnt2);
6736    andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
6737    jccb(Assembler::zero, COMPARE_TAIL);
6738
6739    lea(str1, Address(str1, result, scale));
6740    lea(str2, Address(str2, result, scale));
6741    negptr(result);
6742
6743    // pcmpestri
6744    //   inputs:
6745    //     vec1- substring
6746    //     rax - negative string length (elements count)
6747    //     mem - scaned string
6748    //     rdx - string length (elements count)
6749    //     pcmpmask - cmp mode: 11000 (string compare with negated result)
6750    //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
6751    //   outputs:
6752    //     rcx - first mismatched element index
6753    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
6754
6755    bind(COMPARE_WIDE_VECTORS);
6756    movdqu(vec1, Address(str1, result, scale));
6757    pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
6758    // After pcmpestri cnt1(rcx) contains mismatched element index
6759
6760    jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
6761    addptr(result, stride);
6762    subptr(cnt2, stride);
6763    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
6764
6765    // compare wide vectors tail
6766    testptr(result, result);
6767    jccb(Assembler::zero, LENGTH_DIFF_LABEL);
6768
6769    movl(cnt2, stride);
6770    movl(result, stride);
6771    negptr(result);
6772    movdqu(vec1, Address(str1, result, scale));
6773    pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
6774    jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
6775
6776    // Mismatched characters in the vectors
6777    bind(VECTOR_NOT_EQUAL);
6778    addptr(cnt1, result);
6779    load_unsigned_short(result, Address(str1, cnt1, scale));
6780    load_unsigned_short(cnt2, Address(str2, cnt1, scale));
6781    subl(result, cnt2);
6782    jmpb(POP_LABEL);
6783
6784    bind(COMPARE_TAIL); // limit is zero
6785    movl(cnt2, result);
6786    // Fallthru to tail compare
6787  }
6788  // Shift str2 and str1 to the end of the arrays, negate min
6789  lea(str1, Address(str1, cnt2, scale));
6790  lea(str2, Address(str2, cnt2, scale));
6791  decrementl(cnt2);  // first character was compared already
6792  negptr(cnt2);
6793
6794  // Compare the rest of the elements
6795  bind(WHILE_HEAD_LABEL);
6796  load_unsigned_short(result, Address(str1, cnt2, scale, 0));
6797  load_unsigned_short(cnt1, Address(str2, cnt2, scale, 0));
6798  subl(result, cnt1);
6799  jccb(Assembler::notZero, POP_LABEL);
6800  increment(cnt2);
6801  jccb(Assembler::notZero, WHILE_HEAD_LABEL);
6802
6803  // Strings are equal up to min length.  Return the length difference.
6804  bind(LENGTH_DIFF_LABEL);
6805  pop(result);
6806  jmpb(DONE_LABEL);
6807
6808  // Discard the stored length difference
6809  bind(POP_LABEL);
6810  pop(cnt1);
6811
6812  // That's it
6813  bind(DONE_LABEL);
6814}
6815
6816// Compare char[] arrays aligned to 4 bytes or substrings.
6817void MacroAssembler::char_arrays_equals(bool is_array_equ, Register ary1, Register ary2,
6818                                        Register limit, Register result, Register chr,
6819                                        XMMRegister vec1, XMMRegister vec2) {
6820  ShortBranchVerifier sbv(this);
6821  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR;
6822
6823  int length_offset  = arrayOopDesc::length_offset_in_bytes();
6824  int base_offset    = arrayOopDesc::base_offset_in_bytes(T_CHAR);
6825
6826  // Check the input args
6827  cmpptr(ary1, ary2);
6828  jcc(Assembler::equal, TRUE_LABEL);
6829
6830  if (is_array_equ) {
6831    // Need additional checks for arrays_equals.
6832    testptr(ary1, ary1);
6833    jcc(Assembler::zero, FALSE_LABEL);
6834    testptr(ary2, ary2);
6835    jcc(Assembler::zero, FALSE_LABEL);
6836
6837    // Check the lengths
6838    movl(limit, Address(ary1, length_offset));
6839    cmpl(limit, Address(ary2, length_offset));
6840    jcc(Assembler::notEqual, FALSE_LABEL);
6841  }
6842
6843  // count == 0
6844  testl(limit, limit);
6845  jcc(Assembler::zero, TRUE_LABEL);
6846
6847  if (is_array_equ) {
6848    // Load array address
6849    lea(ary1, Address(ary1, base_offset));
6850    lea(ary2, Address(ary2, base_offset));
6851  }
6852
6853  shll(limit, 1);      // byte count != 0
6854  movl(result, limit); // copy
6855
6856  if (UseAVX >= 2) {
6857    // With AVX2, use 32-byte vector compare
6858    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
6859
6860    // Compare 32-byte vectors
6861    andl(result, 0x0000001e);  //   tail count (in bytes)
6862    andl(limit, 0xffffffe0);   // vector count (in bytes)
6863    jccb(Assembler::zero, COMPARE_TAIL);
6864
6865    lea(ary1, Address(ary1, limit, Address::times_1));
6866    lea(ary2, Address(ary2, limit, Address::times_1));
6867    negptr(limit);
6868
6869    bind(COMPARE_WIDE_VECTORS);
6870    vmovdqu(vec1, Address(ary1, limit, Address::times_1));
6871    vmovdqu(vec2, Address(ary2, limit, Address::times_1));
6872    vpxor(vec1, vec2);
6873
6874    vptest(vec1, vec1);
6875    jccb(Assembler::notZero, FALSE_LABEL);
6876    addptr(limit, 32);
6877    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
6878
6879    testl(result, result);
6880    jccb(Assembler::zero, TRUE_LABEL);
6881
6882    vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
6883    vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
6884    vpxor(vec1, vec2);
6885
6886    vptest(vec1, vec1);
6887    jccb(Assembler::notZero, FALSE_LABEL);
6888    jmpb(TRUE_LABEL);
6889
6890    bind(COMPARE_TAIL); // limit is zero
6891    movl(limit, result);
6892    // Fallthru to tail compare
6893  } else if (UseSSE42Intrinsics) {
6894    // With SSE4.2, use double quad vector compare
6895    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
6896
6897    // Compare 16-byte vectors
6898    andl(result, 0x0000000e);  //   tail count (in bytes)
6899    andl(limit, 0xfffffff0);   // vector count (in bytes)
6900    jccb(Assembler::zero, COMPARE_TAIL);
6901
6902    lea(ary1, Address(ary1, limit, Address::times_1));
6903    lea(ary2, Address(ary2, limit, Address::times_1));
6904    negptr(limit);
6905
6906    bind(COMPARE_WIDE_VECTORS);
6907    movdqu(vec1, Address(ary1, limit, Address::times_1));
6908    movdqu(vec2, Address(ary2, limit, Address::times_1));
6909    pxor(vec1, vec2);
6910
6911    ptest(vec1, vec1);
6912    jccb(Assembler::notZero, FALSE_LABEL);
6913    addptr(limit, 16);
6914    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
6915
6916    testl(result, result);
6917    jccb(Assembler::zero, TRUE_LABEL);
6918
6919    movdqu(vec1, Address(ary1, result, Address::times_1, -16));
6920    movdqu(vec2, Address(ary2, result, Address::times_1, -16));
6921    pxor(vec1, vec2);
6922
6923    ptest(vec1, vec1);
6924    jccb(Assembler::notZero, FALSE_LABEL);
6925    jmpb(TRUE_LABEL);
6926
6927    bind(COMPARE_TAIL); // limit is zero
6928    movl(limit, result);
6929    // Fallthru to tail compare
6930  }
6931
6932  // Compare 4-byte vectors
6933  andl(limit, 0xfffffffc); // vector count (in bytes)
6934  jccb(Assembler::zero, COMPARE_CHAR);
6935
6936  lea(ary1, Address(ary1, limit, Address::times_1));
6937  lea(ary2, Address(ary2, limit, Address::times_1));
6938  negptr(limit);
6939
6940  bind(COMPARE_VECTORS);
6941  movl(chr, Address(ary1, limit, Address::times_1));
6942  cmpl(chr, Address(ary2, limit, Address::times_1));
6943  jccb(Assembler::notEqual, FALSE_LABEL);
6944  addptr(limit, 4);
6945  jcc(Assembler::notZero, COMPARE_VECTORS);
6946
6947  // Compare trailing char (final 2 bytes), if any
6948  bind(COMPARE_CHAR);
6949  testl(result, 0x2);   // tail  char
6950  jccb(Assembler::zero, TRUE_LABEL);
6951  load_unsigned_short(chr, Address(ary1, 0));
6952  load_unsigned_short(limit, Address(ary2, 0));
6953  cmpl(chr, limit);
6954  jccb(Assembler::notEqual, FALSE_LABEL);
6955
6956  bind(TRUE_LABEL);
6957  movl(result, 1);   // return true
6958  jmpb(DONE);
6959
6960  bind(FALSE_LABEL);
6961  xorl(result, result); // return false
6962
6963  // That's it
6964  bind(DONE);
6965  if (UseAVX >= 2) {
6966    // clean upper bits of YMM registers
6967    vzeroupper();
6968  }
6969}
6970
6971void MacroAssembler::generate_fill(BasicType t, bool aligned,
6972                                   Register to, Register value, Register count,
6973                                   Register rtmp, XMMRegister xtmp) {
6974  ShortBranchVerifier sbv(this);
6975  assert_different_registers(to, value, count, rtmp);
6976  Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
6977  Label L_fill_2_bytes, L_fill_4_bytes;
6978
6979  int shift = -1;
6980  switch (t) {
6981    case T_BYTE:
6982      shift = 2;
6983      break;
6984    case T_SHORT:
6985      shift = 1;
6986      break;
6987    case T_INT:
6988      shift = 0;
6989      break;
6990    default: ShouldNotReachHere();
6991  }
6992
6993  if (t == T_BYTE) {
6994    andl(value, 0xff);
6995    movl(rtmp, value);
6996    shll(rtmp, 8);
6997    orl(value, rtmp);
6998  }
6999  if (t == T_SHORT) {
7000    andl(value, 0xffff);
7001  }
7002  if (t == T_BYTE || t == T_SHORT) {
7003    movl(rtmp, value);
7004    shll(rtmp, 16);
7005    orl(value, rtmp);
7006  }
7007
7008  cmpl(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
7009  jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
7010  if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
7011    // align source address at 4 bytes address boundary
7012    if (t == T_BYTE) {
7013      // One byte misalignment happens only for byte arrays
7014      testptr(to, 1);
7015      jccb(Assembler::zero, L_skip_align1);
7016      movb(Address(to, 0), value);
7017      increment(to);
7018      decrement(count);
7019      BIND(L_skip_align1);
7020    }
7021    // Two bytes misalignment happens only for byte and short (char) arrays
7022    testptr(to, 2);
7023    jccb(Assembler::zero, L_skip_align2);
7024    movw(Address(to, 0), value);
7025    addptr(to, 2);
7026    subl(count, 1<<(shift-1));
7027    BIND(L_skip_align2);
7028  }
7029  if (UseSSE < 2) {
7030    Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
7031    // Fill 32-byte chunks
7032    subl(count, 8 << shift);
7033    jcc(Assembler::less, L_check_fill_8_bytes);
7034    align(16);
7035
7036    BIND(L_fill_32_bytes_loop);
7037
7038    for (int i = 0; i < 32; i += 4) {
7039      movl(Address(to, i), value);
7040    }
7041
7042    addptr(to, 32);
7043    subl(count, 8 << shift);
7044    jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
7045    BIND(L_check_fill_8_bytes);
7046    addl(count, 8 << shift);
7047    jccb(Assembler::zero, L_exit);
7048    jmpb(L_fill_8_bytes);
7049
7050    //
7051    // length is too short, just fill qwords
7052    //
7053    BIND(L_fill_8_bytes_loop);
7054    movl(Address(to, 0), value);
7055    movl(Address(to, 4), value);
7056    addptr(to, 8);
7057    BIND(L_fill_8_bytes);
7058    subl(count, 1 << (shift + 1));
7059    jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
7060    // fall through to fill 4 bytes
7061  } else {
7062    Label L_fill_32_bytes;
7063    if (!UseUnalignedLoadStores) {
7064      // align to 8 bytes, we know we are 4 byte aligned to start
7065      testptr(to, 4);
7066      jccb(Assembler::zero, L_fill_32_bytes);
7067      movl(Address(to, 0), value);
7068      addptr(to, 4);
7069      subl(count, 1<<shift);
7070    }
7071    BIND(L_fill_32_bytes);
7072    {
7073      assert( UseSSE >= 2, "supported cpu only" );
7074      Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
7075      movdl(xtmp, value);
7076      if (UseAVX >= 2 && UseUnalignedLoadStores) {
7077        // Fill 64-byte chunks
7078        Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
7079        vpbroadcastd(xtmp, xtmp);
7080
7081        subl(count, 16 << shift);
7082        jcc(Assembler::less, L_check_fill_32_bytes);
7083        align(16);
7084
7085        BIND(L_fill_64_bytes_loop);
7086        vmovdqu(Address(to, 0), xtmp);
7087        vmovdqu(Address(to, 32), xtmp);
7088        addptr(to, 64);
7089        subl(count, 16 << shift);
7090        jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
7091
7092        BIND(L_check_fill_32_bytes);
7093        addl(count, 8 << shift);
7094        jccb(Assembler::less, L_check_fill_8_bytes);
7095        vmovdqu(Address(to, 0), xtmp);
7096        addptr(to, 32);
7097        subl(count, 8 << shift);
7098
7099        BIND(L_check_fill_8_bytes);
7100        // clean upper bits of YMM registers
7101        vzeroupper();
7102      } else {
7103        // Fill 32-byte chunks
7104        pshufd(xtmp, xtmp, 0);
7105
7106        subl(count, 8 << shift);
7107        jcc(Assembler::less, L_check_fill_8_bytes);
7108        align(16);
7109
7110        BIND(L_fill_32_bytes_loop);
7111
7112        if (UseUnalignedLoadStores) {
7113          movdqu(Address(to, 0), xtmp);
7114          movdqu(Address(to, 16), xtmp);
7115        } else {
7116          movq(Address(to, 0), xtmp);
7117          movq(Address(to, 8), xtmp);
7118          movq(Address(to, 16), xtmp);
7119          movq(Address(to, 24), xtmp);
7120        }
7121
7122        addptr(to, 32);
7123        subl(count, 8 << shift);
7124        jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
7125
7126        BIND(L_check_fill_8_bytes);
7127      }
7128      addl(count, 8 << shift);
7129      jccb(Assembler::zero, L_exit);
7130      jmpb(L_fill_8_bytes);
7131
7132      //
7133      // length is too short, just fill qwords
7134      //
7135      BIND(L_fill_8_bytes_loop);
7136      movq(Address(to, 0), xtmp);
7137      addptr(to, 8);
7138      BIND(L_fill_8_bytes);
7139      subl(count, 1 << (shift + 1));
7140      jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
7141    }
7142  }
7143  // fill trailing 4 bytes
7144  BIND(L_fill_4_bytes);
7145  testl(count, 1<<shift);
7146  jccb(Assembler::zero, L_fill_2_bytes);
7147  movl(Address(to, 0), value);
7148  if (t == T_BYTE || t == T_SHORT) {
7149    addptr(to, 4);
7150    BIND(L_fill_2_bytes);
7151    // fill trailing 2 bytes
7152    testl(count, 1<<(shift-1));
7153    jccb(Assembler::zero, L_fill_byte);
7154    movw(Address(to, 0), value);
7155    if (t == T_BYTE) {
7156      addptr(to, 2);
7157      BIND(L_fill_byte);
7158      // fill trailing byte
7159      testl(count, 1);
7160      jccb(Assembler::zero, L_exit);
7161      movb(Address(to, 0), value);
7162    } else {
7163      BIND(L_fill_byte);
7164    }
7165  } else {
7166    BIND(L_fill_2_bytes);
7167  }
7168  BIND(L_exit);
7169}
7170
7171// encode char[] to byte[] in ISO_8859_1
7172void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
7173                                      XMMRegister tmp1Reg, XMMRegister tmp2Reg,
7174                                      XMMRegister tmp3Reg, XMMRegister tmp4Reg,
7175                                      Register tmp5, Register result) {
7176  // rsi: src
7177  // rdi: dst
7178  // rdx: len
7179  // rcx: tmp5
7180  // rax: result
7181  ShortBranchVerifier sbv(this);
7182  assert_different_registers(src, dst, len, tmp5, result);
7183  Label L_done, L_copy_1_char, L_copy_1_char_exit;
7184
7185  // set result
7186  xorl(result, result);
7187  // check for zero length
7188  testl(len, len);
7189  jcc(Assembler::zero, L_done);
7190  movl(result, len);
7191
7192  // Setup pointers
7193  lea(src, Address(src, len, Address::times_2)); // char[]
7194  lea(dst, Address(dst, len, Address::times_1)); // byte[]
7195  negptr(len);
7196
7197  if (UseSSE42Intrinsics || UseAVX >= 2) {
7198    Label L_chars_8_check, L_copy_8_chars, L_copy_8_chars_exit;
7199    Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
7200
7201    if (UseAVX >= 2) {
7202      Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
7203      movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
7204      movdl(tmp1Reg, tmp5);
7205      vpbroadcastd(tmp1Reg, tmp1Reg);
7206      jmpb(L_chars_32_check);
7207
7208      bind(L_copy_32_chars);
7209      vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
7210      vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
7211      vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector256 */ true);
7212      vptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in  vector
7213      jccb(Assembler::notZero, L_copy_32_chars_exit);
7214      vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector256 */ true);
7215      vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector256 */ true);
7216      vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
7217
7218      bind(L_chars_32_check);
7219      addptr(len, 32);
7220      jccb(Assembler::lessEqual, L_copy_32_chars);
7221
7222      bind(L_copy_32_chars_exit);
7223      subptr(len, 16);
7224      jccb(Assembler::greater, L_copy_16_chars_exit);
7225
7226    } else if (UseSSE42Intrinsics) {
7227      movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
7228      movdl(tmp1Reg, tmp5);
7229      pshufd(tmp1Reg, tmp1Reg, 0);
7230      jmpb(L_chars_16_check);
7231    }
7232
7233    bind(L_copy_16_chars);
7234    if (UseAVX >= 2) {
7235      vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
7236      vptest(tmp2Reg, tmp1Reg);
7237      jccb(Assembler::notZero, L_copy_16_chars_exit);
7238      vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector256 */ true);
7239      vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector256 */ true);
7240    } else {
7241      if (UseAVX > 0) {
7242        movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
7243        movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
7244        vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector256 */ false);
7245      } else {
7246        movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
7247        por(tmp2Reg, tmp3Reg);
7248        movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
7249        por(tmp2Reg, tmp4Reg);
7250      }
7251      ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in  vector
7252      jccb(Assembler::notZero, L_copy_16_chars_exit);
7253      packuswb(tmp3Reg, tmp4Reg);
7254    }
7255    movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);
7256
7257    bind(L_chars_16_check);
7258    addptr(len, 16);
7259    jccb(Assembler::lessEqual, L_copy_16_chars);
7260
7261    bind(L_copy_16_chars_exit);
7262    if (UseAVX >= 2) {
7263      // clean upper bits of YMM registers
7264      vzeroupper();
7265    }
7266    subptr(len, 8);
7267    jccb(Assembler::greater, L_copy_8_chars_exit);
7268
7269    bind(L_copy_8_chars);
7270    movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
7271    ptest(tmp3Reg, tmp1Reg);
7272    jccb(Assembler::notZero, L_copy_8_chars_exit);
7273    packuswb(tmp3Reg, tmp1Reg);
7274    movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
7275    addptr(len, 8);
7276    jccb(Assembler::lessEqual, L_copy_8_chars);
7277
7278    bind(L_copy_8_chars_exit);
7279    subptr(len, 8);
7280    jccb(Assembler::zero, L_done);
7281  }
7282
7283  bind(L_copy_1_char);
7284  load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
7285  testl(tmp5, 0xff00);      // check if Unicode char
7286  jccb(Assembler::notZero, L_copy_1_char_exit);
7287  movb(Address(dst, len, Address::times_1, 0), tmp5);
7288  addptr(len, 1);
7289  jccb(Assembler::less, L_copy_1_char);
7290
7291  bind(L_copy_1_char_exit);
7292  addptr(result, len); // len is negative count of not processed elements
7293  bind(L_done);
7294}
7295
7296/**
7297 * Emits code to update CRC-32 with a byte value according to constants in table
7298 *
7299 * @param [in,out]crc   Register containing the crc.
7300 * @param [in]val       Register containing the byte to fold into the CRC.
7301 * @param [in]table     Register containing the table of crc constants.
7302 *
7303 * uint32_t crc;
7304 * val = crc_table[(val ^ crc) & 0xFF];
7305 * crc = val ^ (crc >> 8);
7306 *
7307 */
7308void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
7309  xorl(val, crc);
7310  andl(val, 0xFF);
7311  shrl(crc, 8); // unsigned shift
7312  xorl(crc, Address(table, val, Address::times_4, 0));
7313}
7314
7315/**
7316 * Fold 128-bit data chunk
7317 */
7318void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
7319  vpclmulhdq(xtmp, xK, xcrc); // [123:64]
7320  vpclmulldq(xcrc, xK, xcrc); // [63:0]
7321  vpxor(xcrc, xcrc, Address(buf, offset), false /* vector256 */);
7322  pxor(xcrc, xtmp);
7323}
7324
7325void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
7326  vpclmulhdq(xtmp, xK, xcrc);
7327  vpclmulldq(xcrc, xK, xcrc);
7328  pxor(xcrc, xbuf);
7329  pxor(xcrc, xtmp);
7330}
7331
7332/**
7333 * 8-bit folds to compute 32-bit CRC
7334 *
7335 * uint64_t xcrc;
7336 * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);
7337 */
7338void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
7339  movdl(tmp, xcrc);
7340  andl(tmp, 0xFF);
7341  movdl(xtmp, Address(table, tmp, Address::times_4, 0));
7342  psrldq(xcrc, 1); // unsigned shift one byte
7343  pxor(xcrc, xtmp);
7344}
7345
7346/**
7347 * uint32_t crc;
7348 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
7349 */
7350void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
7351  movl(tmp, crc);
7352  andl(tmp, 0xFF);
7353  shrl(crc, 8);
7354  xorl(crc, Address(table, tmp, Address::times_4, 0));
7355}
7356
7357/**
7358 * @param crc   register containing existing CRC (32-bit)
7359 * @param buf   register pointing to input byte buffer (byte*)
7360 * @param len   register containing number of bytes
7361 * @param table register that will contain address of CRC table
7362 * @param tmp   scratch register
7363 */
7364void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
7365  assert_different_registers(crc, buf, len, table, tmp, rax);
7366
7367  Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
7368  Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
7369
7370  lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
7371  notl(crc); // ~crc
7372  cmpl(len, 16);
7373  jcc(Assembler::less, L_tail);
7374
7375  // Align buffer to 16 bytes
7376  movl(tmp, buf);
7377  andl(tmp, 0xF);
7378  jccb(Assembler::zero, L_aligned);
7379  subl(tmp,  16);
7380  addl(len, tmp);
7381
7382  align(4);
7383  BIND(L_align_loop);
7384  movsbl(rax, Address(buf, 0)); // load byte with sign extension
7385  update_byte_crc32(crc, rax, table);
7386  increment(buf);
7387  incrementl(tmp);
7388  jccb(Assembler::less, L_align_loop);
7389
7390  BIND(L_aligned);
7391  movl(tmp, len); // save
7392  shrl(len, 4);
7393  jcc(Assembler::zero, L_tail_restore);
7394
7395  // Fold crc into first bytes of vector
7396  movdqa(xmm1, Address(buf, 0));
7397  movdl(rax, xmm1);
7398  xorl(crc, rax);
7399  pinsrd(xmm1, crc, 0);
7400  addptr(buf, 16);
7401  subl(len, 4); // len > 0
7402  jcc(Assembler::less, L_fold_tail);
7403
7404  movdqa(xmm2, Address(buf,  0));
7405  movdqa(xmm3, Address(buf, 16));
7406  movdqa(xmm4, Address(buf, 32));
7407  addptr(buf, 48);
7408  subl(len, 3);
7409  jcc(Assembler::lessEqual, L_fold_512b);
7410
7411  // Fold total 512 bits of polynomial on each iteration,
7412  // 128 bits per each of 4 parallel streams.
7413  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32));
7414
7415  align(32);
7416  BIND(L_fold_512b_loop);
7417  fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
7418  fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
7419  fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
7420  fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
7421  addptr(buf, 64);
7422  subl(len, 4);
7423  jcc(Assembler::greater, L_fold_512b_loop);
7424
7425  // Fold 512 bits to 128 bits.
7426  BIND(L_fold_512b);
7427  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
7428  fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2);
7429  fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
7430  fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
7431
7432  // Fold the rest of 128 bits data chunks
7433  BIND(L_fold_tail);
7434  addl(len, 3);
7435  jccb(Assembler::lessEqual, L_fold_128b);
7436  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
7437
7438  BIND(L_fold_tail_loop);
7439  fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
7440  addptr(buf, 16);
7441  decrementl(len);
7442  jccb(Assembler::greater, L_fold_tail_loop);
7443
7444  // Fold 128 bits in xmm1 down into 32 bits in crc register.
7445  BIND(L_fold_128b);
7446  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
7447  vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
7448  vpand(xmm3, xmm0, xmm2, false /* vector256 */);
7449  vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
7450  psrldq(xmm1, 8);
7451  psrldq(xmm2, 4);
7452  pxor(xmm0, xmm1);
7453  pxor(xmm0, xmm2);
7454
7455  // 8 8-bit folds to compute 32-bit CRC.
7456  for (int j = 0; j < 4; j++) {
7457    fold_8bit_crc32(xmm0, table, xmm1, rax);
7458  }
7459  movdl(crc, xmm0); // mov 32 bits to general register
7460  for (int j = 0; j < 4; j++) {
7461    fold_8bit_crc32(crc, table, rax);
7462  }
7463
7464  BIND(L_tail_restore);
7465  movl(len, tmp); // restore
7466  BIND(L_tail);
7467  andl(len, 0xf);
7468  jccb(Assembler::zero, L_exit);
7469
7470  // Fold the rest of bytes
7471  align(4);
7472  BIND(L_tail_loop);
7473  movsbl(rax, Address(buf, 0)); // load byte with sign extension
7474  update_byte_crc32(crc, rax, table);
7475  increment(buf);
7476  decrementl(len);
7477  jccb(Assembler::greater, L_tail_loop);
7478
7479  BIND(L_exit);
7480  notl(crc); // ~c
7481}
7482
7483#undef BIND
7484#undef BLOCK_COMMENT
7485
7486
7487Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
7488  switch (cond) {
7489    // Note some conditions are synonyms for others
7490    case Assembler::zero:         return Assembler::notZero;
7491    case Assembler::notZero:      return Assembler::zero;
7492    case Assembler::less:         return Assembler::greaterEqual;
7493    case Assembler::lessEqual:    return Assembler::greater;
7494    case Assembler::greater:      return Assembler::lessEqual;
7495    case Assembler::greaterEqual: return Assembler::less;
7496    case Assembler::below:        return Assembler::aboveEqual;
7497    case Assembler::belowEqual:   return Assembler::above;
7498    case Assembler::above:        return Assembler::belowEqual;
7499    case Assembler::aboveEqual:   return Assembler::below;
7500    case Assembler::overflow:     return Assembler::noOverflow;
7501    case Assembler::noOverflow:   return Assembler::overflow;
7502    case Assembler::negative:     return Assembler::positive;
7503    case Assembler::positive:     return Assembler::negative;
7504    case Assembler::parity:       return Assembler::noParity;
7505    case Assembler::noParity:     return Assembler::parity;
7506  }
7507  ShouldNotReachHere(); return Assembler::overflow;
7508}
7509
7510SkipIfEqual::SkipIfEqual(
7511    MacroAssembler* masm, const bool* flag_addr, bool value) {
7512  _masm = masm;
7513  _masm->cmp8(ExternalAddress((address)flag_addr), value);
7514  _masm->jcc(Assembler::equal, _label);
7515}
7516
7517SkipIfEqual::~SkipIfEqual() {
7518  _masm->bind(_label);
7519}
7520