/*
 * Copyright (c) 2003, 2012, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_x86.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/top.hpp"
#ifdef TARGET_OS_FAMILY_linux
# include "thread_linux.inline.hpp"
#endif
#ifdef TARGET_OS_FAMILY_solaris
# include "thread_solaris.inline.hpp"
#endif
#ifdef TARGET_OS_FAMILY_windows
# include "thread_windows.inline.hpp"
#endif
#ifdef TARGET_OS_FAMILY_bsd
# include "thread_bsd.inline.hpp"
#endif
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#define __ _masm->
#define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
#define a__ ((Assembler*)_masm)->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
const int MXCSR_MASK = 0xFFC0;  // Mask out any pending exceptions
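// Note: 0xFFC0 keeps MXCSR bits 6-15 (DAZ, the six exception-mask bits,
// rounding control and FZ) and clears bits 0-5, the sticky exception
// status flags, so the checks below compare control state only.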

// Stub Code definitions

static address handle_unsafe_access() {
  JavaThread* thread = JavaThread::current();
  address pc = thread->saved_exception_pc();
  // pc is the instruction which we must emulate
  // doing a no-op is fine:  return garbage from the load
  // therefore, compute npc
  address npc = Assembler::locate_next_instruction(pc);

  // request an async exception
  thread->set_pending_unsafe_access_error();

  // return address of next instruction to execute
  return npc;
}

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) (0)
#else
  void inc_counter_np_(int& counter) {
    // This can destroy rscratch1 if counter is far from the code cache
    __ incrementl(ExternalAddress((address)&counter));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Linux Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    16(rbp): parameter size (in words)              int
  //    24(rbp): thread                                 Thread*
  //
  //     [ return_from_Java     ] <--- rsp
  //     [ argument word n      ]
  //      ...
  // -12 [ argument word 1      ]
  // -11 [ saved r15            ] <--- rsp_after_call
  // -10 [ saved r14            ]
  //  -9 [ saved r13            ]
  //  -8 [ saved r12            ]
  //  -7 [ saved rbx            ]
  //  -6 [ call wrapper         ]
  //  -5 [ result               ]
  //  -4 [ result type          ]
  //  -3 [ method               ]
  //  -2 [ entry point          ]
  //  -1 [ parameters           ]
  //   0 [ saved rbp            ] <--- rbp
  //   1 [ return address       ]
  //   2 [ parameter size       ]
  //   3 [ thread               ]
  //
  // Windows Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    48(rbp): (interpreter) entry point              address
  //    56(rbp): parameters                             intptr_t*
  //    64(rbp): parameter size (in words)              int
  //    72(rbp): thread                                 Thread*
  //
  //     [ return_from_Java     ] <--- rsp
  //     [ argument word n      ]
  //      ...
  // -28 [ argument word 1      ]
  // -27 [ saved xmm15          ] <--- rsp_after_call
  //     [ saved xmm7-xmm14     ]
  //  -9 [ saved xmm6           ] (each xmm register takes 2 slots)
  //  -7 [ saved r15            ]
  //  -6 [ saved r14            ]
  //  -5 [ saved r13            ]
  //  -4 [ saved r12            ]
  //  -3 [ saved rdi            ]
  //  -2 [ saved rsi            ]
  //  -1 [ saved rbx            ]
  //   0 [ saved rbp            ] <--- rbp
  //   1 [ return address       ]
  //   2 [ call wrapper         ]
  //   3 [ result               ]
  //   4 [ result type          ]
  //   5 [ method               ]
  //   6 [ entry point          ]
  //   7 [ parameters           ]
  //   8 [ parameter size       ]
  //   9 [ thread               ]
  //
  //    Windows reserves the caller's stack space for arguments 1-4.
  //    We spill c_rarg0-c_rarg3 to this space.

  // Call stub stack layout word offsets from rbp
  enum call_stub_layout {
#ifdef _WIN64
    xmm_save_first     = 6,  // save from xmm6
    xmm_save_last      = 15, // to xmm15
    xmm_save_base      = -9,
    rsp_after_call_off = xmm_save_base - 2 * (xmm_save_last - xmm_save_first), // -27
    r15_off            = -7,
    r14_off            = -6,
    r13_off            = -5,
    r12_off            = -4,
    rdi_off            = -3,
    rsi_off            = -2,
    rbx_off            = -1,
    rbp_off            =  0,
    retaddr_off        =  1,
    call_wrapper_off   =  2,
    result_off         =  3,
    result_type_off    =  4,
    method_off         =  5,
    entry_point_off    =  6,
    parameters_off     =  7,
    parameter_size_off =  8,
    thread_off         =  9
#else
    rsp_after_call_off = -12,
    mxcsr_off          = rsp_after_call_off,
    r15_off            = -11,
    r14_off            = -10,
    r13_off            = -9,
    r12_off            = -8,
    rbx_off            = -7,
    call_wrapper_off   = -6,
    result_off         = -5,
    result_type_off    = -4,
    method_off         = -3,
    entry_point_off    = -2,
    parameters_off     = -1,
    rbp_off            =  0,
    retaddr_off        =  1,
    parameter_size_off =  2,
    thread_off         =  3
#endif
  };

#ifdef _WIN64
  Address xmm_save(int reg) {
    assert(reg >= xmm_save_first && reg <= xmm_save_last, "XMM register number out of range");
    return Address(rbp, (xmm_save_base - (reg - xmm_save_first) * 2) * wordSize);
  }
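
  // For example, xmm_save(6) resolves to Address(rbp, -9 * wordSize) and
  // xmm_save(15) to Address(rbp, -27 * wordSize), matching the layout
  // comment above (each xmm register takes two word-sized slots).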
#endif

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)rsp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");
    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    // same as in generate_catch_exception()!
    const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);

    const Address call_wrapper  (rbp, call_wrapper_off   * wordSize);
    const Address result        (rbp, result_off         * wordSize);
    const Address result_type   (rbp, result_type_off    * wordSize);
    const Address method        (rbp, method_off         * wordSize);
    const Address entry_point   (rbp, entry_point_off    * wordSize);
    const Address parameters    (rbp, parameters_off     * wordSize);
    const Address parameter_size(rbp, parameter_size_off * wordSize);

    // same as in generate_catch_exception()!
    const Address thread        (rbp, thread_off         * wordSize);

    const Address r15_save(rbp, r15_off * wordSize);
    const Address r14_save(rbp, r14_off * wordSize);
    const Address r13_save(rbp, r13_off * wordSize);
    const Address r12_save(rbp, r12_off * wordSize);
    const Address rbx_save(rbp, rbx_off * wordSize);

    // stub code
    __ enter();
    __ subptr(rsp, -rsp_after_call_off * wordSize);

    // save register parameters
#ifndef _WIN64
    __ movptr(parameters,   c_rarg5); // parameters
    __ movptr(entry_point,  c_rarg4); // entry_point
#endif

    __ movptr(method,       c_rarg3); // method
    __ movl(result_type,    c_rarg2); // result type
    __ movptr(result,       c_rarg1); // result
    __ movptr(call_wrapper, c_rarg0); // call wrapper

    // save regs belonging to calling function
    __ movptr(rbx_save, rbx);
    __ movptr(r12_save, r12);
    __ movptr(r13_save, r13);
    __ movptr(r14_save, r14);
    __ movptr(r15_save, r15);
#ifdef _WIN64
    for (int i = 6; i <= 15; i++) {
      __ movdqu(xmm_save(i), as_XMMRegister(i));
    }

    const Address rdi_save(rbp, rdi_off * wordSize);
    const Address rsi_save(rbp, rsi_off * wordSize);

    __ movptr(rsi_save, rsi);
    __ movptr(rdi_save, rdi);
#else
    const Address mxcsr_save(rbp, mxcsr_off * wordSize);
    {
      Label skip_ldmx;
      __ stmxcsr(mxcsr_save);
      __ movl(rax, mxcsr_save);
      __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
      ExternalAddress mxcsr_std(StubRoutines::x86::mxcsr_std());
      __ cmp32(rax, mxcsr_std);
      __ jcc(Assembler::equal, skip_ldmx);
      __ ldmxcsr(mxcsr_std);
      __ bind(skip_ldmx);
    }
#endif

    // Load up thread register
    __ movptr(r15_thread, thread);
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
      __ jcc(Assembler::equal, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ bind(L);
    }
#endif

    // pass parameters if any
    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    __ movl(c_rarg3, parameter_size);
    __ testl(c_rarg3, c_rarg3);
    __ jcc(Assembler::zero, parameters_done);

    Label loop;
    __ movptr(c_rarg2, parameters);      // parameter pointer
    __ movl(c_rarg1, c_rarg3);           // parameter counter is in c_rarg1
    __ BIND(loop);
    __ movptr(rax, Address(c_rarg2, 0)); // get parameter
    __ addptr(c_rarg2, wordSize);        // advance to next parameter
    __ decrementl(c_rarg1);              // decrement counter
    __ push(rax);                        // pass parameter
    __ jcc(Assembler::notZero, loop);

    // call Java function
    __ BIND(parameters_done);
    __ movptr(rbx, method);             // get Method*
    __ movptr(c_rarg1, entry_point);    // get entry_point
    __ mov(r13, rsp);                   // set sender sp
    BLOCK_COMMENT("call Java function");
    __ call(c_rarg1);

    BLOCK_COMMENT("call_stub_return_address:");
    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    __ movptr(c_rarg0, result);
    Label is_long, is_float, is_double, exit;
    __ movl(c_rarg1, result_type);
    __ cmpl(c_rarg1, T_OBJECT);
    __ jcc(Assembler::equal, is_long);
    __ cmpl(c_rarg1, T_LONG);
    __ jcc(Assembler::equal, is_long);
    __ cmpl(c_rarg1, T_FLOAT);
    __ jcc(Assembler::equal, is_float);
    __ cmpl(c_rarg1, T_DOUBLE);
    __ jcc(Assembler::equal, is_double);

    // handle T_INT case
    __ movl(Address(c_rarg0, 0), rax);

    __ BIND(exit);

    // pop parameters
    __ lea(rsp, rsp_after_call);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ cmpptr(r15_thread, thread);
      __ jcc(Assembler::notEqual, S);
      __ get_thread(rbx);
      __ cmpptr(r15_thread, rbx);
      __ jcc(Assembler::equal, L);
      __ bind(S);
      __ jcc(Assembler::equal, L);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ bind(L);
    }
#endif

    // restore regs belonging to calling function
#ifdef _WIN64
    for (int i = 15; i >= 6; i--) {
      __ movdqu(as_XMMRegister(i), xmm_save(i));
    }
#endif
    __ movptr(r15, r15_save);
    __ movptr(r14, r14_save);
    __ movptr(r13, r13_save);
    __ movptr(r12, r12_save);
    __ movptr(rbx, rbx_save);

#ifdef _WIN64
    __ movptr(rdi, rdi_save);
    __ movptr(rsi, rsi_save);
#else
    __ ldmxcsr(mxcsr_save);
#endif

    // restore rsp
    __ addptr(rsp, -rsp_after_call_off * wordSize);

    // return
    __ pop(rbp);
    __ ret(0);

    // handle return types different from T_INT
    __ BIND(is_long);
    __ movq(Address(c_rarg0, 0), rax);
    __ jmp(exit);

    __ BIND(is_float);
    __ movflt(Address(c_rarg0, 0), xmm0);
    __ jmp(exit);

    __ BIND(is_double);
    __ movdbl(Address(c_rarg0, 0), xmm0);
    __ jmp(exit);

    return start;
  }
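
  // From C++, this stub is entered through the CallStub function pointer
  // type declared in stubRoutines.hpp, whose parameters line up with the
  // layouts documented above; roughly:
  //
  //   typedef void (*CallStub)(address   link,            // call wrapper
  //                            intptr_t* result,
  //                            BasicType result_type,
  //                            Method*   method,
  //                            address   entry_point,
  //                            intptr_t* parameters,
  //                            int       size_of_parameters,
  //                            TRAPS);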

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // rax: exception oop

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
    const Address thread        (rbp, thread_off         * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ cmpptr(r15_thread, thread);
      __ jcc(Assembler::notEqual, S);
      __ get_thread(rbx);
      __ cmpptr(r15_thread, rbx);
      __ jcc(Assembler::equal, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(rax);

    __ movptr(Address(r15_thread, Thread::pending_exception_offset()), rax);
    __ lea(rscratch1, ExternalAddress((address)__FILE__));
    __ movptr(Address(r15_thread, Thread::exception_file_offset()), rscratch1);
    __ movl(Address(r15_thread, Thread::exception_line_offset()), (int) __LINE__);

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ jump(RuntimeAddress(StubRoutines::_call_stub_return_address));

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // rax: exception
  // rdx: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be on stack !!

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, the sp points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
      __ jcc(Assembler::notEqual, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into rbx
    __ movptr(c_rarg0, Address(rsp, 0));
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    r15_thread, c_rarg0);
    __ mov(rbx, rax);

    // setup rax & rdx, remove return address & clear pending exception
    __ pop(rdx);
    __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
    __ movptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ testptr(rax, rax);
      __ jcc(Assembler::notEqual, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler (return address removed)
    // rax: exception
    // rbx: exception handler
    // rdx: throwing pc
    __ verify_oop(rax);
    __ jmp(rbx);

    return start;
  }

  // Support for jint atomic::xchg(jint exchange_value, volatile jint* dest)
  //
  // Arguments :
  //    c_rarg0: exchange_value
  //    c_rarg1: dest
  //
  // Result:
  //    *dest <- ex, return (orig *dest)
  address generate_atomic_xchg() {
    StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
    address start = __ pc();

    __ movl(rax, c_rarg0); // Copy to eax; we need a return value anyhow
    __ xchgl(rax, Address(c_rarg1, 0)); // automatic LOCK
    __ ret(0);

    return start;
  }
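
  // Illustrative C semantics of the stub above; XCHG with a memory operand
  // asserts the processor's lock signal implicitly, so no LOCK prefix is
  // needed:
  //
  //   jint atomic_xchg(jint exchange_value, volatile jint* dest) {
  //     jint old = *dest;           // both steps execute as one atomic XCHG
  //     *dest = exchange_value;
  //     return old;
  //   }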

  // Support for intptr_t atomic::xchg_ptr(intptr_t exchange_value, volatile intptr_t* dest)
  //
  // Arguments :
  //    c_rarg0: exchange_value
  //    c_rarg1: dest
  //
  // Result:
  //    *dest <- ex, return (orig *dest)
  address generate_atomic_xchg_ptr() {
    StubCodeMark mark(this, "StubRoutines", "atomic_xchg_ptr");
    address start = __ pc();

    __ movptr(rax, c_rarg0); // Copy to rax; we need a return value anyhow
    __ xchgptr(rax, Address(c_rarg1, 0)); // automatic LOCK
    __ ret(0);

    return start;
  }

  // Support for jint atomic::atomic_cmpxchg(jint exchange_value, volatile jint* dest,
  //                                         jint compare_value)
  //
  // Arguments :
  //    c_rarg0: exchange_value
  //    c_rarg1: dest
  //    c_rarg2: compare_value
  //
  // Result:
  //    if (compare_value == *dest) {
  //       *dest = exchange_value;
  //       return compare_value;
  //    } else {
  //       return *dest;
  //    }
  address generate_atomic_cmpxchg() {
    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
    address start = __ pc();

    __ movl(rax, c_rarg2);
    if (os::is_MP()) __ lock();
    __ cmpxchgl(c_rarg0, Address(c_rarg1, 0));
    __ ret(0);

    return start;
  }
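
  // CMPXCHG compares rax with *dest: if equal it stores c_rarg0 into *dest,
  // otherwise it loads *dest into rax.  Either way rax ends up holding the
  // original *dest, which is exactly the return value the contract above
  // requires (compare_value was pre-loaded into rax).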

  // Support for jlong atomic::atomic_cmpxchg_long(jlong exchange_value,
  //                                               volatile jlong* dest,
  //                                               jlong compare_value)
  // Arguments :
  //    c_rarg0: exchange_value
  //    c_rarg1: dest
  //    c_rarg2: compare_value
  //
  // Result:
  //    if (compare_value == *dest) {
  //       *dest = exchange_value;
  //       return compare_value;
  //    } else {
  //       return *dest;
  //    }
  address generate_atomic_cmpxchg_long() {
    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
    address start = __ pc();

    __ movq(rax, c_rarg2);
    if (os::is_MP()) __ lock();
    __ cmpxchgq(c_rarg0, Address(c_rarg1, 0));
    __ ret(0);

    return start;
  }

  // Support for jint atomic::add(jint add_value, volatile jint* dest)
  //
  // Arguments :
  //    c_rarg0: add_value
  //    c_rarg1: dest
  //
  // Result:
  //    *dest += add_value
  //    return *dest;
  address generate_atomic_add() {
    StubCodeMark mark(this, "StubRoutines", "atomic_add");
    address start = __ pc();

    __ movl(rax, c_rarg0);
    if (os::is_MP()) __ lock();
    __ xaddl(Address(c_rarg1, 0), c_rarg0);
    __ addl(rax, c_rarg0);
    __ ret(0);

    return start;
  }
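
  // XADD atomically exchanges c_rarg0 with *dest while summing them, so
  // afterwards c_rarg0 holds the old *dest; adding the saved add_value in
  // rax gives the new *dest, which is the stub's return value.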

  // Support for intptr_t atomic::add_ptr(intptr_t add_value, volatile intptr_t* dest)
  //
  // Arguments :
  //    c_rarg0: add_value
  //    c_rarg1: dest
  //
  // Result:
  //    *dest += add_value
  //    return *dest;
  address generate_atomic_add_ptr() {
    StubCodeMark mark(this, "StubRoutines", "atomic_add_ptr");
    address start = __ pc();

    __ movptr(rax, c_rarg0); // Copy to rax; we need a return value anyhow
    if (os::is_MP()) __ lock();
    __ xaddptr(Address(c_rarg1, 0), c_rarg0);
    __ addptr(rax, c_rarg0);
    __ ret(0);

    return start;
  }

  // Support for intptr_t OrderAccess::fence()
  //
  // Arguments :
  //
  // Result:
  address generate_orderaccess_fence() {
    StubCodeMark mark(this, "StubRoutines", "orderaccess_fence");
    address start = __ pc();
    __ membar(Assembler::StoreLoad);
    __ ret(0);

    return start;
  }
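
  // On x86_64 only StoreLoad reordering is visible to other processors, so
  // this is the one barrier that needs code; membar(StoreLoad) typically
  // lowers to a locked read-modify-write of a stack word (or MFENCE),
  // either of which drains the store buffer before the next load.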

  // Support for intptr_t get_previous_fp()
  //
  // This routine is used to find the previous frame pointer for the
  // caller (current_frame_guess). This is used as part of debugging
  // when ps() is seemingly lost trying to find frames.
  // This code assumes that the caller (current_frame_guess) has a frame.
  address generate_get_previous_fp() {
    StubCodeMark mark(this, "StubRoutines", "get_previous_fp");
    const Address old_fp(rbp, 0);
    const Address older_fp(rax, 0);
    address start = __ pc();

    __ enter();
    __ movptr(rax, old_fp);   // caller's fp
    __ movptr(rax, older_fp); // the frame for ps()
    __ pop(rbp);
    __ ret(0);

    return start;
  }

  // Support for intptr_t get_previous_sp()
  //
  // This routine is used to find the previous stack pointer for the
  // caller.
  address generate_get_previous_sp() {
    StubCodeMark mark(this, "StubRoutines", "get_previous_sp");
    address start = __ pc();

    __ movptr(rax, rsp);
    __ addptr(rax, 8); // return address is at the top of the stack.
    __ ret(0);

    return start;
  }
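
  // On entry rsp points at this stub's return address, so rsp + 8 is the
  // value the caller's rsp had immediately before its call instruction.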

  //----------------------------------------------------------------------------------------------------
  // Support for void verify_mxcsr()
  //
  // This routine is used with -Xcheck:jni to verify that native
  // JNI code does not return to Java code without restoring the
  // MXCSR register to our expected state.

  address generate_verify_mxcsr() {
    StubCodeMark mark(this, "StubRoutines", "verify_mxcsr");
    address start = __ pc();

    const Address mxcsr_save(rsp, 0);

    if (CheckJNICalls) {
      Label ok_ret;
      __ push(rax);
      __ subptr(rsp, wordSize);      // allocate a temp location
      __ stmxcsr(mxcsr_save);
      __ movl(rax, mxcsr_save);
      __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
      __ cmpl(rax, *(int *)(StubRoutines::x86::mxcsr_std()));
      __ jcc(Assembler::equal, ok_ret);

      __ warn("MXCSR changed by native JNI code, use -XX:+RestoreMXCSROnJNICall");

      __ ldmxcsr(ExternalAddress(StubRoutines::x86::mxcsr_std()));

      __ bind(ok_ret);
      __ addptr(rsp, wordSize);
      __ pop(rax);
    }

    __ ret(0);

    return start;
  }

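  // The four fixup stubs below correct float/double -> int/long conversion
  // results.  CVTTSS2SI and friends return the "integer indefinite" value
  // (min_jint or min_jlong) for NaN and out-of-range inputs; when compiled
  // code sees that value it calls one of these stubs with the original
  // operand on the stack (at 'inout'), and the stub rewrites the result to
  // the Java-mandated one: 0 for NaN, min/max of the target type for
  // out-of-range inputs, selected by the operand's sign.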
  address generate_f2i_fixup() {
    StubCodeMark mark(this, "StubRoutines", "f2i_fixup");
    Address inout(rsp, 5 * wordSize); // return address + 4 saves

    address start = __ pc();

    Label L;

    __ push(rax);
    __ push(c_rarg3);
    __ push(c_rarg2);
    __ push(c_rarg1);

    __ movl(rax, 0x7f800000);
    __ xorl(c_rarg3, c_rarg3);
    __ movl(c_rarg2, inout);
    __ movl(c_rarg1, c_rarg2);
    __ andl(c_rarg1, 0x7fffffff);
    __ cmpl(rax, c_rarg1); // NaN? -> 0
    __ jcc(Assembler::negative, L);
    __ testl(c_rarg2, c_rarg2); // signed ? min_jint : max_jint
    __ movl(c_rarg3, 0x80000000);
    __ movl(rax, 0x7fffffff);
    __ cmovl(Assembler::positive, c_rarg3, rax);

    __ bind(L);
    __ movptr(inout, c_rarg3);

    __ pop(c_rarg1);
    __ pop(c_rarg2);
    __ pop(c_rarg3);
    __ pop(rax);

    __ ret(0);

    return start;
  }

  address generate_f2l_fixup() {
    StubCodeMark mark(this, "StubRoutines", "f2l_fixup");
    Address inout(rsp, 5 * wordSize); // return address + 4 saves
    address start = __ pc();

    Label L;

    __ push(rax);
    __ push(c_rarg3);
    __ push(c_rarg2);
    __ push(c_rarg1);

    __ movl(rax, 0x7f800000);
    __ xorl(c_rarg3, c_rarg3);
    __ movl(c_rarg2, inout);
    __ movl(c_rarg1, c_rarg2);
    __ andl(c_rarg1, 0x7fffffff);
    __ cmpl(rax, c_rarg1); // NaN? -> 0
    __ jcc(Assembler::negative, L);
    __ testl(c_rarg2, c_rarg2); // signed ? min_jlong : max_jlong
    __ mov64(c_rarg3, 0x8000000000000000);
    __ mov64(rax, 0x7fffffffffffffff);
    __ cmov(Assembler::positive, c_rarg3, rax);

    __ bind(L);
    __ movptr(inout, c_rarg3);

    __ pop(c_rarg1);
    __ pop(c_rarg2);
    __ pop(c_rarg3);
    __ pop(rax);

    __ ret(0);

    return start;
  }

  address generate_d2i_fixup() {
    StubCodeMark mark(this, "StubRoutines", "d2i_fixup");
    Address inout(rsp, 6 * wordSize); // return address + 5 saves

    address start = __ pc();

    Label L;

    __ push(rax);
    __ push(c_rarg3);
    __ push(c_rarg2);
    __ push(c_rarg1);
    __ push(c_rarg0);

    __ movl(rax, 0x7ff00000);
    __ movq(c_rarg2, inout);
    __ movl(c_rarg3, c_rarg2);
    __ mov(c_rarg1, c_rarg2);
    __ mov(c_rarg0, c_rarg2);
    __ negl(c_rarg3);
    __ shrptr(c_rarg1, 0x20);
    __ orl(c_rarg3, c_rarg2);
    __ andl(c_rarg1, 0x7fffffff);
    __ xorl(c_rarg2, c_rarg2);
    __ shrl(c_rarg3, 0x1f);
    __ orl(c_rarg1, c_rarg3);
    __ cmpl(rax, c_rarg1);
    __ jcc(Assembler::negative, L); // NaN -> 0
    __ testptr(c_rarg0, c_rarg0); // signed ? min_jint : max_jint
    __ movl(c_rarg2, 0x80000000);
    __ movl(rax, 0x7fffffff);
    __ cmov(Assembler::positive, c_rarg2, rax);

    __ bind(L);
    __ movptr(inout, c_rarg2);

    __ pop(c_rarg0);
    __ pop(c_rarg1);
    __ pop(c_rarg2);
    __ pop(c_rarg3);
    __ pop(rax);

    __ ret(0);

    return start;
  }

  address generate_d2l_fixup() {
    StubCodeMark mark(this, "StubRoutines", "d2l_fixup");
    Address inout(rsp, 6 * wordSize); // return address + 5 saves

    address start = __ pc();

    Label L;

    __ push(rax);
    __ push(c_rarg3);
    __ push(c_rarg2);
    __ push(c_rarg1);
    __ push(c_rarg0);

    __ movl(rax, 0x7ff00000);
    __ movq(c_rarg2, inout);
    __ movl(c_rarg3, c_rarg2);
    __ mov(c_rarg1, c_rarg2);
    __ mov(c_rarg0, c_rarg2);
    __ negl(c_rarg3);
    __ shrptr(c_rarg1, 0x20);
    __ orl(c_rarg3, c_rarg2);
    __ andl(c_rarg1, 0x7fffffff);
    __ xorl(c_rarg2, c_rarg2);
    __ shrl(c_rarg3, 0x1f);
    __ orl(c_rarg1, c_rarg3);
    __ cmpl(rax, c_rarg1);
    __ jcc(Assembler::negative, L); // NaN -> 0
    __ testq(c_rarg0, c_rarg0); // signed ? min_jlong : max_jlong
    __ mov64(c_rarg2, 0x8000000000000000);
    __ mov64(rax, 0x7fffffffffffffff);
    __ cmovq(Assembler::positive, c_rarg2, rax);

    __ bind(L);
    __ movq(inout, c_rarg2);

    __ pop(c_rarg0);
    __ pop(c_rarg1);
    __ pop(c_rarg2);
    __ pop(c_rarg3);
    __ pop(rax);

    __ ret(0);

    return start;
  }

  address generate_fp_mask(const char *stub_name, int64_t mask) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();

    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);

    return start;
  }
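
  // The "stub" is just a 16-byte constant in the code cache, used as a
  // memory operand for SSE sign manipulation: e.g. ANDPS with
  // 0x7FFFFFFF7FFFFFFF yields a float abs(), XORPS with
  // 0x8000000080000000 a negation.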

  // The following routine generates a subroutine to throw an
  // asynchronous UnknownError when an unsafe access gets a fault that
  // could not be reasonably prevented by the programmer.  (Example:
  // SIGBUS/OBJERR.)
  address generate_handler_for_unsafe_access() {
    StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
    address start = __ pc();

    __ push(0);                       // hole for return address-to-be
    __ pusha();                       // push registers
    Address next_pc(rsp, RegisterImpl::number_of_registers * BytesPerWord);

    // FIXME: this probably needs alignment logic

    __ subptr(rsp, frame::arg_reg_save_area_bytes);
    BLOCK_COMMENT("call handle_unsafe_access");
    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, handle_unsafe_access)));
    __ addptr(rsp, frame::arg_reg_save_area_bytes);

    __ movptr(next_pc, rax);          // stuff next address
    __ popa();
    __ ret(0);                        // jump to next address

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    all args on stack!
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved r12 (several TemplateTable methods use it)
  //    [tos + 3]: saved flags
  //    [tos + 4]: return address
  //  * [tos + 5]: error message (char*)
  //  * [tos + 6]: object to verify (oop)
  //  * [tos + 7]: saved rax - saved by caller and bashed
  //  * [tos + 8]: saved r10 (rscratch1) - saved by caller
  //  * = popped on exit
  address generate_verify_oop() {
    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    __ pushf();
    __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));

    __ push(r12);

    // save c_rarg2 and c_rarg3
    __ push(c_rarg2);
    __ push(c_rarg3);

    enum {
           // After previous pushes.
           oop_to_verify = 6 * wordSize,
           saved_rax     = 7 * wordSize,
           saved_r10     = 8 * wordSize,

           // Before the call to MacroAssembler::debug(), see below.
           return_addr   = 16 * wordSize,
           error_msg     = 17 * wordSize
    };

    // get object
    __ movptr(rax, Address(rsp, oop_to_verify));

    // make sure object is 'reasonable'
    __ testptr(rax, rax);
    __ jcc(Assembler::zero, exit); // if obj is NULL it is OK
    // Check if the oop is in the right area of memory
    __ movptr(c_rarg2, rax);
    __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andptr(c_rarg2, c_rarg3);
    __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_bits());
    __ cmpptr(c_rarg2, c_rarg3);
    __ jcc(Assembler::notZero, error);

    // set r12 to heapbase for load_klass()
    __ reinit_heapbase();

    // make sure klass is 'reasonable' (i.e., not zero)
    __ load_klass(rax, rax);  // get klass
    __ testptr(rax, rax);
    __ jcc(Assembler::zero, error); // if klass is NULL it is broken
    // TODO: Future assert that klass is lower 4g memory for UseCompressedKlassPointers

    // return if everything seems ok
    __ bind(exit);
    __ movptr(rax, Address(rsp, saved_rax));       // get saved rax back
    __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
    __ pop(c_rarg3);                               // restore c_rarg3
    __ pop(c_rarg2);                               // restore c_rarg2
    __ pop(r12);                                   // restore r12
    __ popf();                                     // restore flags
    __ ret(4 * wordSize);                          // pop caller saved stuff

    // handle errors
    __ bind(error);
    __ movptr(rax, Address(rsp, saved_rax));       // get saved rax back
    __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
    __ pop(c_rarg3);                               // get saved c_rarg3 back
    __ pop(c_rarg2);                               // get saved c_rarg2 back
    __ pop(r12);                                   // get saved r12 back
    __ popf();                                     // get saved flags off stack --
                                                   // will be ignored
    __ pusha();                                    // push registers
                                                   // (rip is already pushed)
    // debug(char* msg, int64_t pc, int64_t regs[])
    // We've popped the registers we'd saved (c_rarg3, c_rarg2 and flags), and
    // pushed all the registers, so now the stack looks like:
    //     [tos +  0] 16 saved registers
    //     [tos + 16] return address
    //   * [tos + 17] error message (char*)
    //   * [tos + 18] object to verify (oop)
    //   * [tos + 19] saved rax - saved by caller and bashed
    //   * [tos + 20] saved r10 (rscratch1) - saved by caller
    //   * = popped on exit

    __ movptr(c_rarg0, Address(rsp, error_msg));    // pass address of error message
    __ movptr(c_rarg1, Address(rsp, return_addr));  // pass return address
    __ movq(c_rarg2, rsp);                          // pass address of regs on stack
    __ mov(r12, rsp);                               // remember rsp
    __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
    __ andptr(rsp, -16);                            // align stack as required by ABI
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
    __ mov(rsp, r12);                               // restore rsp
    __ popa();                                      // pop registers (includes r12)
    __ ret(4 * wordSize);                           // pop caller saved stuff

    return start;
  }

  //
  // Verify that a register contains a clean 32-bit positive value
  // (high 32 bits are 0) so it can be used in 64-bit shifts.
  //
  //  Input:
  //    Rint  -  32-bit value
  //    Rtmp  -  scratch
  //
  void assert_clean_int(Register Rint, Register Rtmp) {
#ifdef ASSERT
    Label L;
    assert_different_registers(Rtmp, Rint);
    __ movslq(Rtmp, Rint);
    __ cmpq(Rtmp, Rint);
    __ jcc(Assembler::equal, L);
    __ stop("high 32-bits of int value are not 0");
    __ bind(L);
#endif
  }

  //  Generate overlap test for array copy stubs
  //
  //  Input:
  //     c_rarg0 - from
  //     c_rarg1 - to
  //     c_rarg2 - element count
  //
  //  Output:
  //     rax   - &from[element count - 1]
  //
  void array_overlap_test(address no_overlap_target, Address::ScaleFactor sf) {
    assert(no_overlap_target != NULL, "must be generated");
    array_overlap_test(no_overlap_target, NULL, sf);
  }
  void array_overlap_test(Label& L_no_overlap, Address::ScaleFactor sf) {
    array_overlap_test(NULL, &L_no_overlap, sf);
  }
  void array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
    const Register from     = c_rarg0;
    const Register to       = c_rarg1;
    const Register count    = c_rarg2;
    const Register end_from = rax;

    __ cmpptr(to, from);
    __ lea(end_from, Address(from, count, sf, 0));
    if (NOLp == NULL) {
      ExternalAddress no_overlap(no_overlap_target);
      __ jump_cc(Assembler::belowEqual, no_overlap);
      __ cmpptr(to, end_from);
      __ jump_cc(Assembler::aboveEqual, no_overlap);
    } else {
      __ jcc(Assembler::belowEqual, (*NOLp));
      __ cmpptr(to, end_from);
      __ jcc(Assembler::aboveEqual, (*NOLp));
    }
  }
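
  // A conjoint copy must run backwards only when the destination overlaps
  // the source at a higher address (from < to < from + count), e.g.
  // System.arraycopy(a, 0, a, 1, n); all other cases branch to the
  // disjoint (forward) copy code.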

  // Shuffle first three arg regs on Windows into Linux/Solaris locations.
  //
  // Outputs:
  //    rdi - rcx
  //    rsi - rdx
  //    rdx - r8
  //    rcx - r9
  //
  // Registers r9 and r10 are used to save rdi and rsi, which are
  // non-volatile on Windows.  r9 and r10 should not be used by the caller.
  //
  void setup_arg_regs(int nargs = 3) {
    const Register saved_rdi = r9;
    const Register saved_rsi = r10;
    assert(nargs == 3 || nargs == 4, "else fix");
#ifdef _WIN64
    assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
           "unexpected argument registers");
    if (nargs >= 4)
      __ mov(rax, r9);  // r9 is also saved_rdi
    __ movptr(saved_rdi, rdi);
    __ movptr(saved_rsi, rsi);
    __ mov(rdi, rcx); // c_rarg0
    __ mov(rsi, rdx); // c_rarg1
    __ mov(rdx, r8);  // c_rarg2
    if (nargs >= 4)
      __ mov(rcx, rax); // c_rarg3 (via rax)
#else
    assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
           "unexpected argument registers");
#endif
  }

  void restore_arg_regs() {
    const Register saved_rdi = r9;
    const Register saved_rsi = r10;
#ifdef _WIN64
    __ movptr(rdi, saved_rdi);
    __ movptr(rsi, saved_rsi);
#endif
  }

  // Generate code for an array write pre barrier
  //
  //     addr               -  starting address
  //     count              -  element count
  //     dest_uninitialized -  true iff the destination is known to be uninitialized
  //
  //     Destroy no registers!
  //
  void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
      case BarrierSet::G1SATBCT:
      case BarrierSet::G1SATBCTLogging:
        // With G1, don't generate the call if we statically know that the target is uninitialized
        if (!dest_uninitialized) {
          __ pusha();                      // push registers
          if (count == c_rarg0) {
            if (addr == c_rarg1) {
              // exactly backwards!!
              __ xchgptr(c_rarg1, c_rarg0);
            } else {
              __ movptr(c_rarg1, count);
              __ movptr(c_rarg0, addr);
            }
          } else {
            __ movptr(c_rarg0, addr);
            __ movptr(c_rarg1, count);
          }
          __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
          __ popa();
        }
        break;
      case BarrierSet::CardTableModRef:
      case BarrierSet::CardTableExtension:
      case BarrierSet::ModRef:
        break;
      default:
        ShouldNotReachHere();

    }
  }
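
  // For G1 the pre-barrier records the destination range's previous
  // contents so the concurrent marker's snapshot-at-the-beginning invariant
  // still holds; the card-table collectors need no pre-barrier, only the
  // post barrier below.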

  //
  // Generate code for an array write post barrier
  //
  //  Input:
  //     start    - register containing starting address of destination array
  //     end      - register containing ending address of destination array
  //     scratch  - scratch register
  //
  //  The input registers are overwritten.
  //  The ending address is inclusive.
  void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) {
    assert_different_registers(start, end, scratch);
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
      case BarrierSet::G1SATBCT:
      case BarrierSet::G1SATBCTLogging:

        {
          __ pusha();                      // push registers (overkill)
          // must compute element count unless barrier set interface is changed (other platforms supply count)
          assert_different_registers(start, end, scratch);
          __ lea(scratch, Address(end, BytesPerHeapOop));
          __ subptr(scratch, start);               // subtract start to get #bytes
          __ shrptr(scratch, LogBytesPerHeapOop);  // convert to element count
          __ mov(c_rarg0, start);
          __ mov(c_rarg1, scratch);
          __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
          __ popa();
        }
        break;
      case BarrierSet::CardTableModRef:
      case BarrierSet::CardTableExtension:
        {
          CardTableModRefBS* ct = (CardTableModRefBS*)bs;
          assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");

          Label L_loop;

          __ shrptr(start, CardTableModRefBS::card_shift);
          __ addptr(end, BytesPerHeapOop);
          __ shrptr(end, CardTableModRefBS::card_shift);
          __ subptr(end, start); // number of bytes to copy

          intptr_t disp = (intptr_t) ct->byte_map_base;
          if (Assembler::is_simm32(disp)) {
            Address cardtable(noreg, noreg, Address::no_scale, disp);
            __ lea(scratch, cardtable);
          } else {
            ExternalAddress cardtable((address)disp);
            __ lea(scratch, cardtable);
          }

          const Register count = end; // 'end' register contains bytes count now
          __ addptr(start, scratch);
        __ BIND(L_loop);
          __ movb(Address(start, count, Address::times_1), 0);
          __ decrement(count);
          __ jcc(Assembler::greaterEqual, L_loop);
        }
        break;
      default:
        ShouldNotReachHere();

    }
  }
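
  // The card-table loop above is the out-of-line equivalent of
  //
  //   for (uintptr_t addr = start; addr <= end; addr += card_size)
  //     ct->byte_map_base[addr >> card_shift] = 0;  // 0 == dirty card
  //
  // every card spanned by [start, end] is dirtied so the next GC scans the
  // covered words for old-to-young pointers.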


  // Copy big chunks forward
  //
  // Inputs:
  //   end_from     - source array end address
  //   end_to       - destination array end address
  //   qword_count  - 64-bit element count, negative
  //   to           - scratch
  //   L_copy_32_bytes - entry label
  //   L_copy_8_bytes  - exit  label
  //
  void copy_32_bytes_forward(Register end_from, Register end_to,
                             Register qword_count, Register to,
                             Label& L_copy_32_bytes, Label& L_copy_8_bytes) {
    DEBUG_ONLY(__ stop("enter at entry label, not here"));
    Label L_loop;
    __ align(OptoLoopAlignment);
  __ BIND(L_loop);
    if (UseUnalignedLoadStores) {
      __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
      __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
      __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
      __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
    } else {
      __ movq(to, Address(end_from, qword_count, Address::times_8, -24));
      __ movq(Address(end_to, qword_count, Address::times_8, -24), to);
      __ movq(to, Address(end_from, qword_count, Address::times_8, -16));
      __ movq(Address(end_to, qword_count, Address::times_8, -16), to);
      __ movq(to, Address(end_from, qword_count, Address::times_8, - 8));
      __ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
      __ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
      __ movq(Address(end_to, qword_count, Address::times_8, - 0), to);
    }
  __ BIND(L_copy_32_bytes);
    __ addptr(qword_count, 4);
    __ jcc(Assembler::lessEqual, L_loop);
    __ subptr(qword_count, 4);
    __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
  }
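
  // The qword count arrives negative (arranged by the callers) so the loop
  // can count up toward zero: the addptr at the entry label both advances
  // the count by four qwords and sets the flags that decide whether another
  // 32-byte chunk remains, avoiding a separate compare.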


  // Copy big chunks backward
  //
  // Inputs:
  //   from         - source array address
  //   dest         - destination array address
  //   qword_count  - 64-bit element count
  //   to           - scratch
  //   L_copy_32_bytes - entry label
  //   L_copy_8_bytes  - exit  label
  //
  void copy_32_bytes_backward(Register from, Register dest,
                              Register qword_count, Register to,
                              Label& L_copy_32_bytes, Label& L_copy_8_bytes) {
    DEBUG_ONLY(__ stop("enter at entry label, not here"));
    Label L_loop;
    __ align(OptoLoopAlignment);
  __ BIND(L_loop);
    if (UseUnalignedLoadStores) {
      __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
      __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
      __ movdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
      __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
    } else {
      __ movq(to, Address(from, qword_count, Address::times_8, 24));
      __ movq(Address(dest, qword_count, Address::times_8, 24), to);
      __ movq(to, Address(from, qword_count, Address::times_8, 16));
      __ movq(Address(dest, qword_count, Address::times_8, 16), to);
      __ movq(to, Address(from, qword_count, Address::times_8,  8));
      __ movq(Address(dest, qword_count, Address::times_8,  8), to);
      __ movq(to, Address(from, qword_count, Address::times_8,  0));
      __ movq(Address(dest, qword_count, Address::times_8,  0), to);
    }
  __ BIND(L_copy_32_bytes);
    __ subptr(qword_count, 4);
    __ jcc(Assembler::greaterEqual, L_loop);
    __ addptr(qword_count, 4);
    __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
  }


  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
    Label L_copy_byte, L_exit;
    const Register from        = rdi;  // source array address
    const Register to          = rsi;  // destination array address
    const Register count       = rdx;  // elements count
    const Register byte_count  = rcx;
    const Register qword_count = count;
    const Register end_from    = from; // source array end address
    const Register end_to      = to;   // destination array end address
    // End pointers are inclusive, and if count is not zero they point
    // to the last unit copied:  end_to[0] := end_from[0]

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                      // r9 and r10 may be used to save non-volatile registers

    // 'from', 'to' and 'count' are now valid
    __ movptr(byte_count, count);
    __ shrptr(count, 3); // count => qword_count
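    // The copy decomposes into 8 * qword_count bytes plus trailing pieces
    // selected by the low bits of byte_count: e.g. for 23 bytes,
    // qword_count == 2 and bits 2, 1 and 0 select the trailing dword, word
    // and byte copies respectively (16 + 4 + 2 + 1 == 23).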

    // Copy from low to high addresses.  Use 'to' as scratch.
    __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
    __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
    __ negptr(qword_count); // make the count negative
    __ jmp(L_copy_32_bytes);

    // Copy trailing qwords
  __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
    __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
    __ increment(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);

    // Check for and copy trailing dword
  __ BIND(L_copy_4_bytes);
    __ testl(byte_count, 4);
    __ jccb(Assembler::zero, L_copy_2_bytes);
    __ movl(rax, Address(end_from, 8));
    __ movl(Address(end_to, 8), rax);

    __ addptr(end_from, 4);
    __ addptr(end_to, 4);

    // Check for and copy trailing word
  __ BIND(L_copy_2_bytes);
    __ testl(byte_count, 2);
    __ jccb(Assembler::zero, L_copy_byte);
    __ movw(rax, Address(end_from, 8));
    __ movw(Address(end_to, 8), rax);

    __ addptr(end_from, 2);
    __ addptr(end_to, 2);

    // Check for and copy trailing byte
  __ BIND(L_copy_byte);
    __ testl(byte_count, 1);
    __ jccb(Assembler::zero, L_exit);
    __ movb(rax, Address(end_from, 8));
    __ movb(Address(end_to, 8), rax);

  __ BIND(L_exit);
    restore_arg_regs();
    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
    __ xorptr(rax, rax); // return 0
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    // Copy in 32-byte chunks
    copy_32_bytes_forward(end_from, end_to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
    __ jmp(L_copy_4_bytes);

    return start;
  }
1480
1481  // Arguments:
1482  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1483  //             ignored
1484  //   name    - stub name string
1485  //
1486  // Inputs:
1487  //   c_rarg0   - source array address
1488  //   c_rarg1   - destination array address
1489  //   c_rarg2   - element count, treated as ssize_t, can be zero
1490  //
1491  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1492  // we let the hardware handle it.  The one to eight bytes within words,
1493  // dwords or qwords that span cache line boundaries will still be loaded
1494  // and stored atomically.
1495  //
1496  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1497                                      address* entry, const char *name) {
1498    __ align(CodeEntryAlignment);
1499    StubCodeMark mark(this, "StubRoutines", name);
1500    address start = __ pc();
1501
1502    Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1503    const Register from        = rdi;  // source array address
1504    const Register to          = rsi;  // destination array address
1505    const Register count       = rdx;  // elements count
1506    const Register byte_count  = rcx;
1507    const Register qword_count = count;
1508
1509    __ enter(); // required for proper stackwalking of RuntimeStub frame
1510    assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1511
1512    if (entry != NULL) {
1513      *entry = __ pc();
1514      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1515      BLOCK_COMMENT("Entry:");
1516    }
1517
1518    array_overlap_test(nooverlap_target, Address::times_1);
1519    setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1520                      // r9 and r10 may be used to save non-volatile registers
1521
1522    // 'from', 'to' and 'count' are now valid
1523    __ movptr(byte_count, count);
1524    __ shrptr(count, 3);   // count => qword_count
1525
1526    // Copy from high to low addresses.
1527
1528    // Check for and copy trailing byte
1529    __ testl(byte_count, 1);
1530    __ jcc(Assembler::zero, L_copy_2_bytes);
1531    __ movb(rax, Address(from, byte_count, Address::times_1, -1));
1532    __ movb(Address(to, byte_count, Address::times_1, -1), rax);
1533    __ decrement(byte_count); // Adjust for possible trailing word
1534
1535    // Check for and copy trailing word
1536  __ BIND(L_copy_2_bytes);
1537    __ testl(byte_count, 2);
1538    __ jcc(Assembler::zero, L_copy_4_bytes);
1539    __ movw(rax, Address(from, byte_count, Address::times_1, -2));
1540    __ movw(Address(to, byte_count, Address::times_1, -2), rax);
1541
1542    // Check for and copy trailing dword
1543  __ BIND(L_copy_4_bytes);
1544    __ testl(byte_count, 4);
1545    __ jcc(Assembler::zero, L_copy_32_bytes);
1546    __ movl(rax, Address(from, qword_count, Address::times_8));
1547    __ movl(Address(to, qword_count, Address::times_8), rax);
1548    __ jmp(L_copy_32_bytes);
1549
1550    // Copy trailing qwords
1551  __ BIND(L_copy_8_bytes);
1552    __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1553    __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1554    __ decrement(qword_count);
1555    __ jcc(Assembler::notZero, L_copy_8_bytes);
1556
1557    restore_arg_regs();
1558    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1559    __ xorptr(rax, rax); // return 0
1560    __ leave(); // required for proper stackwalking of RuntimeStub frame
1561    __ ret(0);
1562
1563    // Copy in 32-byte chunks
1564    copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
1565
1566    restore_arg_regs();
1567    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1568    __ xorptr(rax, rax); // return 0
1569    __ leave(); // required for proper stackwalking of RuntimeStub frame
1570    __ ret(0);
1571
1572    return start;
1573  }
1574
1575  // Arguments:
1576  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1577  //             ignored
1578  //   name    - stub name string
1579  //
1580  // Inputs:
1581  //   c_rarg0   - source array address
1582  //   c_rarg1   - destination array address
1583  //   c_rarg2   - element count, treated as ssize_t, can be zero
1584  //
1585  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1586  // let the hardware handle it.  The two or four words within dwords
1587  // or qwords that span cache line boundaries will still be loaded
1588  // and stored atomically.
1589  //
1590  // Side Effects:
1591  //   disjoint_short_copy_entry is set to the no-overlap entry point
1592  //   used by generate_conjoint_short_copy().
1593  //
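  // The disjoint stubs copy forward using a negative-index idiom: 'end_from'
  // and 'end_to' are pointed at the last qword, qword_count is negated, and
  // the loop counts up toward zero so a single increment+jcc both advances
  // and terminates it. A minimal sketch of the idiom (not the emitted code;
  // the real stub runs the 32-byte-chunk loop first):
  //
  //   void forward_qwords_sketch(jlong* from, jlong* to, intptr_t q) {
  //     if (q <= 0) return;
  //     jlong* end_from = from + q - 1;   // inclusive end pointers
  //     jlong* end_to   = to   + q - 1;
  //     for (intptr_t i = -q; i != 0; i++)
  //       end_to[i + 1] = end_from[i + 1];
  //   }
  //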
1594  address generate_disjoint_short_copy(bool aligned, address *entry, const char *name) {
1595    __ align(CodeEntryAlignment);
1596    StubCodeMark mark(this, "StubRoutines", name);
1597    address start = __ pc();
1598
1599    Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes, L_exit;
1600    const Register from        = rdi;  // source array address
1601    const Register to          = rsi;  // destination array address
1602    const Register count       = rdx;  // elements count
1603    const Register word_count  = rcx;
1604    const Register qword_count = count;
1605    const Register end_from    = from; // source array end address
1606    const Register end_to      = to;   // destination array end address
1607    // End pointers are inclusive, and if count is not zero they point
1608    // to the last unit copied:  end_to[0] := end_from[0]
1609
1610    __ enter(); // required for proper stackwalking of RuntimeStub frame
1611    assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1612
1613    if (entry != NULL) {
1614      *entry = __ pc();
1615      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1616      BLOCK_COMMENT("Entry:");
1617    }
1618
1619    setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1620                      // r9 and r10 may be used to save non-volatile registers
1621
1622    // 'from', 'to' and 'count' are now valid
1623    __ movptr(word_count, count);
1624    __ shrptr(count, 2); // count => qword_count
1625
1626    // Copy from low to high addresses.  Use 'to' as scratch.
1627    __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1628    __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1629    __ negptr(qword_count);
1630    __ jmp(L_copy_32_bytes);
1631
1632    // Copy trailing qwords
1633  __ BIND(L_copy_8_bytes);
1634    __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1635    __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1636    __ increment(qword_count);
1637    __ jcc(Assembler::notZero, L_copy_8_bytes);
1638
1639    // Original 'dest' is trashed, so we can't use it as a
1640    // base register for a possible trailing word copy
1641
1642    // Check for and copy trailing dword
1643  __ BIND(L_copy_4_bytes);
1644    __ testl(word_count, 2);
1645    __ jccb(Assembler::zero, L_copy_2_bytes);
1646    __ movl(rax, Address(end_from, 8));
1647    __ movl(Address(end_to, 8), rax);
1648
1649    __ addptr(end_from, 4);
1650    __ addptr(end_to, 4);
1651
1652    // Check for and copy trailing word
1653  __ BIND(L_copy_2_bytes);
1654    __ testl(word_count, 1);
1655    __ jccb(Assembler::zero, L_exit);
1656    __ movw(rax, Address(end_from, 8));
1657    __ movw(Address(end_to, 8), rax);
1658
1659  __ BIND(L_exit);
1660    restore_arg_regs();
1661    inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
1662    __ xorptr(rax, rax); // return 0
1663    __ leave(); // required for proper stackwalking of RuntimeStub frame
1664    __ ret(0);
1665
1666    // Copy in 32-byte chunks
1667    copy_32_bytes_forward(end_from, end_to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
1668    __ jmp(L_copy_4_bytes);
1669
1670    return start;
1671  }
1672
1673  address generate_fill(BasicType t, bool aligned, const char *name) {
1674    __ align(CodeEntryAlignment);
1675    StubCodeMark mark(this, "StubRoutines", name);
1676    address start = __ pc();
1677
1678    BLOCK_COMMENT("Entry:");
1679
1680    const Register to       = c_rarg0;  // destination array address
1681    const Register value    = c_rarg1;  // value
1682    const Register count    = c_rarg2;  // elements count
1683
1684    __ enter(); // required for proper stackwalking of RuntimeStub frame
1685
1686    __ generate_fill(t, aligned, to, value, count, rax, xmm0);
1687
1688    __ leave(); // required for proper stackwalking of RuntimeStub frame
1689    __ ret(0);
1690    return start;
1691  }
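
  // Behaviorally, the routine generated above is equivalent to the C loop
  // below (a sketch only; the actual work, including alignment handling and
  // wide SSE stores, lives in MacroAssembler::generate_fill, and the element
  // width follows the BasicType 't'):
  //
  //   void fill_sketch(jint* to, jint value, size_t count) {
  //     for (size_t i = 0; i < count; i++)
  //       to[i] = value;
  //   }
  //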
1692
1693  // Arguments:
1694  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1695  //             ignored
1696  //   name    - stub name string
1697  //
1698  // Inputs:
1699  //   c_rarg0   - source array address
1700  //   c_rarg1   - destination array address
1701  //   c_rarg2   - element count, treated as ssize_t, can be zero
1702  //
1703  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1704  // let the hardware handle it.  The two or four words within dwords
1705  // or qwords that span cache line boundaries will still be loaded
1706  // and stored atomically.
1707  //
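  // As in the other conjoint stubs, array_overlap_test() below first checks
  // whether a forward copy would actually clobber unread source data; if
  // not, it tail-jumps to 'nooverlap_target'. In effect (sketch only):
  //
  //   if (to <= from || to >= from + count * element_size)
  //     goto nooverlap_target;   // forward (disjoint) copy is safe
  //   // otherwise fall through to the backward copy generated below
  //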
1708  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1709                                       address *entry, const char *name) {
1710    __ align(CodeEntryAlignment);
1711    StubCodeMark mark(this, "StubRoutines", name);
1712    address start = __ pc();
1713
1714    Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes;
1715    const Register from        = rdi;  // source array address
1716    const Register to          = rsi;  // destination array address
1717    const Register count       = rdx;  // elements count
1718    const Register word_count  = rcx;
1719    const Register qword_count = count;
1720
1721    __ enter(); // required for proper stackwalking of RuntimeStub frame
1722    assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1723
1724    if (entry != NULL) {
1725      *entry = __ pc();
1726      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1727      BLOCK_COMMENT("Entry:");
1728    }
1729
1730    array_overlap_test(nooverlap_target, Address::times_2);
1731    setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1732                      // r9 and r10 may be used to save non-volatile registers
1733
1734    // 'from', 'to' and 'count' are now valid
1735    __ movptr(word_count, count);
1736    __ shrptr(count, 2); // count => qword_count
1737
1738    // Copy from high to low addresses.  Use 'to' as scratch.
1739
1740    // Check for and copy trailing word
1741    __ testl(word_count, 1);
1742    __ jccb(Assembler::zero, L_copy_4_bytes);
1743    __ movw(rax, Address(from, word_count, Address::times_2, -2));
1744    __ movw(Address(to, word_count, Address::times_2, -2), rax);
1745
1746    // Check for and copy trailing dword
1747  __ BIND(L_copy_4_bytes);
1748    __ testl(word_count, 2);
1749    __ jcc(Assembler::zero, L_copy_32_bytes);
1750    __ movl(rax, Address(from, qword_count, Address::times_8));
1751    __ movl(Address(to, qword_count, Address::times_8), rax);
1752    __ jmp(L_copy_32_bytes);
1753
1754    // Copy trailing qwords
1755  __ BIND(L_copy_8_bytes);
1756    __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1757    __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1758    __ decrement(qword_count);
1759    __ jcc(Assembler::notZero, L_copy_8_bytes);
1760
1761    restore_arg_regs();
1762    inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
1763    __ xorptr(rax, rax); // return 0
1764    __ leave(); // required for proper stackwalking of RuntimeStub frame
1765    __ ret(0);
1766
1767    // Copy in 32-byte chunks
1768    copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
1769
1770    restore_arg_regs();
1771    inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
1772    __ xorptr(rax, rax); // return 0
1773    __ leave(); // required for proper stackwalking of RuntimeStub frame
1774    __ ret(0);
1775
1776    return start;
1777  }
1778
1779  // Arguments:
1780  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1781  //             ignored
1782  //   is_oop  - true => oop array, so generate store check code
1783  //   name    - stub name string
1784  //
1785  // Inputs:
1786  //   c_rarg0   - source array address
1787  //   c_rarg1   - destination array address
1788  //   c_rarg2   - element count, treated as ssize_t, can be zero
1789  //
1790  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1791  // the hardware handle it.  The two dwords within qwords that span
1792  // cache line boundaries will still be loaded and stored atomically.
1793  //
1794  // Side Effects:
1795  //   disjoint_int_copy_entry is set to the no-overlap entry point
1796  //   used by generate_conjoint_int_oop_copy().
1797  //
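  // For is_oop copies the raw loop is bracketed by GC barriers:
  // gen_write_ref_array_pre_barrier() runs before any stores (needed by
  // concurrent collectors such as G1 unless dest_uninitialized), and
  // gen_write_ref_array_post_barrier() afterwards dirties the card table for
  // the written range. Schematically (a sketch, not the emitted code):
  //
  //   pre_barrier(to, count);                           // may be a no-op
  //   ... copy the elements as in the non-oop case ...
  //   post_barrier(to, to + (count - 1) * heapOopSize); // inclusive end
  //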
1798  address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
1799                                         const char *name, bool dest_uninitialized = false) {
1800    __ align(CodeEntryAlignment);
1801    StubCodeMark mark(this, "StubRoutines", name);
1802    address start = __ pc();
1803
1804    Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
1805    const Register from        = rdi;  // source array address
1806    const Register to          = rsi;  // destination array address
1807    const Register count       = rdx;  // elements count
1808    const Register dword_count = rcx;
1809    const Register qword_count = count;
1810    const Register end_from    = from; // source array end address
1811    const Register end_to      = to;   // destination array end address
1812    const Register saved_to    = r11;  // saved destination array address
1813    // End pointers are inclusive, and if count is not zero they point
1814    // to the last unit copied:  end_to[0] := end_from[0]
1815
1816    __ enter(); // required for proper stackwalking of RuntimeStub frame
1817    assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1818
1819    if (entry != NULL) {
1820      *entry = __ pc();
1821      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1822      BLOCK_COMMENT("Entry:");
1823    }
1824
1825    setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1826                      // r9 and r10 may be used to save non-volatile registers
1827    if (is_oop) {
1828      __ movq(saved_to, to);
1829      gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
1830    }
1831
1832    // 'from', 'to' and 'count' are now valid
1833    __ movptr(dword_count, count);
1834    __ shrptr(count, 1); // count => qword_count
1835
1836    // Copy from low to high addresses.  Use 'to' as scratch.
1837    __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1838    __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1839    __ negptr(qword_count);
1840    __ jmp(L_copy_32_bytes);
1841
1842    // Copy trailing qwords
1843  __ BIND(L_copy_8_bytes);
1844    __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1845    __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1846    __ increment(qword_count);
1847    __ jcc(Assembler::notZero, L_copy_8_bytes);
1848
1849    // Check for and copy trailing dword
1850  __ BIND(L_copy_4_bytes);
1851    __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
1852    __ jccb(Assembler::zero, L_exit);
1853    __ movl(rax, Address(end_from, 8));
1854    __ movl(Address(end_to, 8), rax);
1855
1856  __ BIND(L_exit);
1857    if (is_oop) {
1858      __ leaq(end_to, Address(saved_to, dword_count, Address::times_4, -4));
1859      gen_write_ref_array_post_barrier(saved_to, end_to, rax);
1860    }
1861    restore_arg_regs();
1862    inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
1863    __ xorptr(rax, rax); // return 0
1864    __ leave(); // required for proper stackwalking of RuntimeStub frame
1865    __ ret(0);
1866
1867    // Copy in 32-byte chunks
1868    copy_32_bytes_forward(end_from, end_to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
1869    __ jmp(L_copy_4_bytes);
1870
1871    return start;
1872  }
1873
1874  // Arguments:
1875  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1876  //             ignored
1877  //   is_oop  - true => oop array, so generate store check code
1878  //   name    - stub name string
1879  //
1880  // Inputs:
1881  //   c_rarg0   - source array address
1882  //   c_rarg1   - destination array address
1883  //   c_rarg2   - element count, treated as ssize_t, can be zero
1884  //
1885  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1886  // the hardware handle it.  The two dwords within qwords that span
1887  // cache line boundaries will still be loaded and stored atomically.
1888  //
1889  address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
1890                                         address *entry, const char *name,
1891                                         bool dest_uninitialized = false) {
1892    __ align(CodeEntryAlignment);
1893    StubCodeMark mark(this, "StubRoutines", name);
1894    address start = __ pc();
1895
1896    Label L_copy_32_bytes, L_copy_8_bytes, L_copy_2_bytes, L_exit;
1897    const Register from        = rdi;  // source array address
1898    const Register to          = rsi;  // destination array address
1899    const Register count       = rdx;  // elements count
1900    const Register dword_count = rcx;
1901    const Register qword_count = count;
1902
1903    __ enter(); // required for proper stackwalking of RuntimeStub frame
1904    assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1905
1906    if (entry != NULL) {
1907      *entry = __ pc();
1908      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1909      BLOCK_COMMENT("Entry:");
1910    }
1911
1912    array_overlap_test(nooverlap_target, Address::times_4);
1913    setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1914                      // r9 and r10 may be used to save non-volatile registers
1915
1916    if (is_oop) {
1917      // no registers are destroyed by this call
1918      gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
1919    }
1920
1921    assert_clean_int(count, rax); // Make sure 'count' is clean int.
1922    // 'from', 'to' and 'count' are now valid
1923    __ movptr(dword_count, count);
1924    __ shrptr(count, 1); // count => qword_count
1925
1926    // Copy from high to low addresses.  Use 'to' as scratch.
1927
1928    // Check for and copy trailing dword
1929    __ testl(dword_count, 1);
1930    __ jcc(Assembler::zero, L_copy_32_bytes);
1931    __ movl(rax, Address(from, dword_count, Address::times_4, -4));
1932    __ movl(Address(to, dword_count, Address::times_4, -4), rax);
1933    __ jmp(L_copy_32_bytes);
1934
1935    // Copy trailing qwords
1936  __ BIND(L_copy_8_bytes);
1937    __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1938    __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1939    __ decrement(qword_count);
1940    __ jcc(Assembler::notZero, L_copy_8_bytes);
1941
1942    if (is_oop) {
1943      __ jmp(L_exit);
1944    }
1945    restore_arg_regs();
1946    inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
1947    __ xorptr(rax, rax); // return 0
1948    __ leave(); // required for proper stackwalking of RuntimeStub frame
1949    __ ret(0);
1950
1951    // Copy in 32-byte chunks
1952    copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
1953
1954  __ bind(L_exit);
1955    if (is_oop) {
1956      Register end_to = rdx;
1957      __ leaq(end_to, Address(to, dword_count, Address::times_4, -4));
1958      gen_write_ref_array_post_barrier(to, end_to, rax);
1959    }
1960    restore_arg_regs();
1961    inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
1962    __ xorptr(rax, rax); // return 0
1963    __ leave(); // required for proper stackwalking of RuntimeStub frame
1964    __ ret(0);
1965
1966    return start;
1967  }
1968
1969  // Arguments:
1970  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1971  //             ignored
1972  //   is_oop  - true => oop array, so generate store check code
1973  //   name    - stub name string
1974  //
1975  // Inputs:
1976  //   c_rarg0   - source array address
1977  //   c_rarg1   - destination array address
1978  //   c_rarg2   - element count, treated as ssize_t, can be zero
1979  //
1980  // Side Effects:
1981  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1982  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1983  //
1984  address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
1985                                          const char *name, bool dest_uninitialized = false) {
1986    __ align(CodeEntryAlignment);
1987    StubCodeMark mark(this, "StubRoutines", name);
1988    address start = __ pc();
1989
1990    Label L_copy_32_bytes, L_copy_8_bytes, L_exit;
1991    const Register from        = rdi;  // source array address
1992    const Register to          = rsi;  // destination array address
1993    const Register qword_count = rdx;  // elements count
1994    const Register end_from    = from; // source array end address
1995    const Register end_to      = rcx;  // destination array end address
1996    const Register saved_to    = to;
1997    // End pointers are inclusive, and if count is not zero they point
1998    // to the last unit copied:  end_to[0] := end_from[0]
1999
2000    __ enter(); // required for proper stackwalking of RuntimeStub frame
2001    // Save no-overlap entry point for generate_conjoint_long_oop_copy()
2002    assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2003
2004    if (entry != NULL) {
2005      *entry = __ pc();
2006      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2007      BLOCK_COMMENT("Entry:");
2008    }
2009
2010    setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2011                      // r9 and r10 may be used to save non-volatile registers
2012    // 'from', 'to' and 'qword_count' are now valid
2013    if (is_oop) {
2014      // no registers are destroyed by this call
2015      gen_write_ref_array_pre_barrier(to, qword_count, dest_uninitialized);
2016    }
2017
2018    // Copy from low to high addresses.  Use 'to' as scratch.
2019    __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2020    __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2021    __ negptr(qword_count);
2022    __ jmp(L_copy_32_bytes);
2023
2024    // Copy trailing qwords
2025  __ BIND(L_copy_8_bytes);
2026    __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2027    __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2028    __ increment(qword_count);
2029    __ jcc(Assembler::notZero, L_copy_8_bytes);
2030
2031    if (is_oop) {
2032      __ jmp(L_exit);
2033    } else {
2034      restore_arg_regs();
2035      inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2036      __ xorptr(rax, rax); // return 0
2037      __ leave(); // required for proper stackwalking of RuntimeStub frame
2038      __ ret(0);
2039    }
2040
2041    // Copy in 32-byte chunks
2042    copy_32_bytes_forward(end_from, end_to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
2043
2044    if (is_oop) {
2045    __ BIND(L_exit);
2046      gen_write_ref_array_post_barrier(saved_to, end_to, rax);
2047    }
2048    restore_arg_regs();
2049    if (is_oop) {
2050      inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2051    } else {
2052      inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2053    }
2054    __ xorptr(rax, rax); // return 0
2055    __ leave(); // required for proper stackwalking of RuntimeStub frame
2056    __ ret(0);
2057
2058    return start;
2059  }
2060
2061  // Arguments:
2062  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2063  //             ignored
2064  //   is_oop  - true => oop array, so generate store check code
2065  //   name    - stub name string
2066  //
2067  // Inputs:
2068  //   c_rarg0   - source array address
2069  //   c_rarg1   - destination array address
2070  //   c_rarg2   - element count, treated as ssize_t, can be zero
2071  //
2072  address generate_conjoint_long_oop_copy(bool aligned, bool is_oop,
2073                                          address nooverlap_target, address *entry,
2074                                          const char *name, bool dest_uninitialized = false) {
2075    __ align(CodeEntryAlignment);
2076    StubCodeMark mark(this, "StubRoutines", name);
2077    address start = __ pc();
2078
2079    Label L_copy_32_bytes, L_copy_8_bytes, L_exit;
2080    const Register from        = rdi;  // source array address
2081    const Register to          = rsi;  // destination array address
2082    const Register qword_count = rdx;  // elements count
2083    const Register saved_count = rcx;
2084
2085    __ enter(); // required for proper stackwalking of RuntimeStub frame
2086    assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2087
2088    if (entry != NULL) {
2089      *entry = __ pc();
2090      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2091      BLOCK_COMMENT("Entry:");
2092    }
2093
2094    array_overlap_test(nooverlap_target, Address::times_8);
2095    setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2096                      // r9 and r10 may be used to save non-volatile registers
2097    // 'from', 'to' and 'qword_count' are now valid
2098    if (is_oop) {
2099      // Save to and count for store barrier
2100      __ movptr(saved_count, qword_count);
2101      // No registers are destroyed by this call
2102      gen_write_ref_array_pre_barrier(to, saved_count, dest_uninitialized);
2103    }
2104
2105    __ jmp(L_copy_32_bytes);
2106
2107    // Copy trailing qwords
2108  __ BIND(L_copy_8_bytes);
2109    __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2110    __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2111    __ decrement(qword_count);
2112    __ jcc(Assembler::notZero, L_copy_8_bytes);
2113
2114    if (is_oop) {
2115      __ jmp(L_exit);
2116    } else {
2117      restore_arg_regs();
2118      inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2119      __ xorptr(rax, rax); // return 0
2120      __ leave(); // required for proper stackwalking of RuntimeStub frame
2121      __ ret(0);
2122    }
2123
2124    // Copy in 32-byte chunks
2125    copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
2126
2127    if (is_oop) {
2128    __ BIND(L_exit);
2129      __ lea(rcx, Address(to, saved_count, Address::times_8, -8));
2130      gen_write_ref_array_post_barrier(to, rcx, rax);
2131    }
2132    restore_arg_regs();
2133    if (is_oop) {
2134      inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2135    } else {
2136      inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2137    }
2138    __ xorptr(rax, rax); // return 0
2139    __ leave(); // required for proper stackwalking of RuntimeStub frame
2140    __ ret(0);
2141
2142    return start;
2143  }
2144
2145
2146  // Helper for generating a dynamic type check.
2147  // Smashes no registers.
2148  void generate_type_check(Register sub_klass,
2149                           Register super_check_offset,
2150                           Register super_klass,
2151                           Label& L_success) {
2152    assert_different_registers(sub_klass, super_check_offset, super_klass);
2153
2154    BLOCK_COMMENT("type_check:");
2155
2156    Label L_miss;
2157
2158    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
2159                                     super_check_offset);
2160    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
2161
2162    // Fall through on failure!
2163    __ BIND(L_miss);
2164  }
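
  // The fast path above is HotSpot's standard subtype test: compare the
  // klasses directly, then compare the word at sub_klass + super_check_offset
  // (either the primary supertype slot or the secondary super cache); only a
  // cache-offset miss falls into the slow linear scan of the secondary
  // supers. Roughly (sketch; not the emitted code):
  //
  //   if (sub == super) goto L_success;
  //   if (*(Klass**)((address)sub + super_check_offset) == super)
  //     goto L_success;
  //   if (super_check_offset != in_bytes(Klass::secondary_super_cache_offset()))
  //     goto L_miss;
  //   // slow path: scan secondary supers, updating the cache on a hit
  //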
2165
2166  //
2167  //  Generate checkcasting array copy stub
2168  //
2169  //  Input:
2170  //    c_rarg0   - source array address
2171  //    c_rarg1   - destination array address
2172  //    c_rarg2   - element count, treated as ssize_t, can be zero
2173  //    c_rarg3   - size_t ckoff (super_check_offset)
2174  // not Win64
2175  //    c_rarg4   - oop ckval (super_klass)
2176  // Win64
2177  //    rsp+40    - oop ckval (super_klass)
2178  //
2179  //  Output:
2180  //    rax ==  0  -  success
2181  //    rax == -1^K - failure, where K is partial transfer count
2182  //
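  //  A caller-side view of that protocol (sketch): zero means every element
  //  was copied; otherwise rax holds the bitwise complement of the number of
  //  elements stored before the element type check failed:
  //
  //    intptr_t r = checkcast_copy(from, to, count /*, ckoff, ckval */);
  //    if (r != 0) {
  //      size_t copied = ~(size_t)r;   // undoes the -1 ^ K encoding
  //      // finish or report the failure starting at element 'copied'
  //    }
  //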
2183  address generate_checkcast_copy(const char *name, address *entry,
2184                                  bool dest_uninitialized = false) {
2185
2186    Label L_load_element, L_store_element, L_do_card_marks, L_done;
2187
2188    // Input registers (after setup_arg_regs)
2189    const Register from        = rdi;   // source array address
2190    const Register to          = rsi;   // destination array address
2191    const Register length      = rdx;   // elements count
2192    const Register ckoff       = rcx;   // super_check_offset
2193    const Register ckval       = r8;    // super_klass
2194
2195    // Registers used as temps (r13, r14 are save-on-entry)
2196    const Register end_from    = from;  // source array end address
2197    const Register end_to      = r13;   // destination array end address
2198    const Register count       = rdx;   // -(count_remaining)
2199    const Register r14_length  = r14;   // saved copy of length
2200    // End pointers are inclusive, and if length is not zero they point
2201    // to the last unit copied:  end_to[0] := end_from[0]
2202
2203    const Register rax_oop    = rax;    // actual oop copied
2204    const Register r11_klass  = r11;    // oop._klass
2205
2206    //---------------------------------------------------------------
2207    // Assembler stub will be used for this call to arraycopy
2208    // if the two arrays are subtypes of Object[] but the
2209    // destination array type is not equal to or a supertype
2210    // of the source type.  Each element must be separately
2211    // checked.
2212
2213    __ align(CodeEntryAlignment);
2214    StubCodeMark mark(this, "StubRoutines", name);
2215    address start = __ pc();
2216
2217    __ enter(); // required for proper stackwalking of RuntimeStub frame
2218
2219#ifdef ASSERT
2220    // caller guarantees that the arrays really are different
2221    // otherwise, we would have to make conjoint checks
2222    { Label L;
2223      array_overlap_test(L, TIMES_OOP);
2224      __ stop("checkcast_copy within a single array");
2225      __ bind(L);
2226    }
2227#endif //ASSERT
2228
2229    setup_arg_regs(4); // from => rdi, to => rsi, length => rdx
2230                       // ckoff => rcx, ckval => r8
2231                       // r9 and r10 may be used to save non-volatile registers
2232#ifdef _WIN64
2233    // last argument (#4) is on stack on Win64
2234    __ movptr(ckval, Address(rsp, 6 * wordSize));
2235#endif
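    // Why 6 * wordSize: after the enter() above, the saved rbp, the return
    // address and the four Win64 register-shadow slots (4 * wordSize) sit
    // between rsp and the first stack-passed argument.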
2236
2237    // Caller of this entry point must set up the argument registers.
2238    if (entry != NULL) {
2239      *entry = __ pc();
2240      BLOCK_COMMENT("Entry:");
2241    }
2242
2243    // allocate spill slots for r13, r14
2244    enum {
2245      saved_r13_offset,
2246      saved_r14_offset,
2247      saved_rbp_offset
2248    };
2249    __ subptr(rsp, saved_rbp_offset * wordSize);
2250    __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
2251    __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
2252
2253    // check that int operands are properly extended to size_t
2254    assert_clean_int(length, rax);
2255    assert_clean_int(ckoff, rax);
2256
2257#ifdef ASSERT
2258    BLOCK_COMMENT("assert consistent ckoff/ckval");
2259    // The ckoff and ckval must be mutually consistent,
2260    // even though caller generates both.
2261    { Label L;
2262      int sco_offset = in_bytes(Klass::super_check_offset_offset());
2263      __ cmpl(ckoff, Address(ckval, sco_offset));
2264      __ jcc(Assembler::equal, L);
2265      __ stop("super_check_offset inconsistent");
2266      __ bind(L);
2267    }
2268#endif //ASSERT
2269
2270    // Loop-invariant addresses.  They are exclusive end pointers.
2271    Address end_from_addr(from, length, TIMES_OOP, 0);
2272    Address   end_to_addr(to,   length, TIMES_OOP, 0);
2273    // Loop-variant addresses.  They assume post-incremented count < 0.
2274    Address from_element_addr(end_from, count, TIMES_OOP, 0);
2275    Address   to_element_addr(end_to,   count, TIMES_OOP, 0);
2276
2277    gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
2278
2279    // Copy from low to high addresses, indexed from the end of each array.
2280    __ lea(end_from, end_from_addr);
2281    __ lea(end_to,   end_to_addr);
2282    __ movptr(r14_length, length);        // save a copy of the length
2283    assert(length == count, "");          // else fix next line:
2284    __ negptr(count);                     // negate and test the length
2285    __ jcc(Assembler::notZero, L_load_element);
2286
2287    // Empty array:  Nothing to do.
2288    __ xorptr(rax, rax);                  // return 0 on (trivial) success
2289    __ jmp(L_done);
2290
2291    // ======== begin loop ========
2292    // (Loop is rotated; its entry is L_load_element.)
2293    // Loop control:
2294    //   for (count = -count; count != 0; count++)
2295    // Base pointers src, dst are biased by 8*(count-1), to the last element.
2296    __ align(OptoLoopAlignment);
2297
2298    __ BIND(L_store_element);
2299    __ store_heap_oop(to_element_addr, rax_oop);  // store the oop
2300    __ increment(count);               // increment the count toward zero
2301    __ jcc(Assembler::zero, L_do_card_marks);
2302
2303    // ======== loop entry is here ========
2304    __ BIND(L_load_element);
2305    __ load_heap_oop(rax_oop, from_element_addr); // load the oop
2306    __ testptr(rax_oop, rax_oop);
2307    __ jcc(Assembler::zero, L_store_element);
2308
2309    __ load_klass(r11_klass, rax_oop); // query the object klass
2310    generate_type_check(r11_klass, ckoff, ckval, L_store_element);
2311    // ======== end loop ========
2312
2313    // It was a real error; we must depend on the caller to finish the job.
2314    // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
2315    // Emit GC store barriers for the oops we have copied (r14 + rdx),
2316    // and report their number to the caller.
2317    assert_different_registers(rax, r14_length, count, to, end_to, rcx);
2318    __ lea(end_to, to_element_addr);
2319    __ addptr(end_to, -heapOopSize);      // make an inclusive end pointer
2320    gen_write_ref_array_post_barrier(to, end_to, rscratch1);
2321    __ movptr(rax, r14_length);           // original oops
2322    __ addptr(rax, count);                // K = (original - remaining) oops
2323    __ notptr(rax);                       // report (-1^K) to caller
2324    __ jmp(L_done);
2325
2326    // Come here on success only.
2327    __ BIND(L_do_card_marks);
2328    __ addptr(end_to, -heapOopSize);         // make an inclusive end pointer
2329    gen_write_ref_array_post_barrier(to, end_to, rscratch1);
2330    __ xorptr(rax, rax);                  // return 0 on success
2331
2332    // Common exit point (success or failure).
2333    __ BIND(L_done);
2334    __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
2335    __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
2336    restore_arg_regs();
2337    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); // Update counter after rscratch1 is free
2338    __ leave(); // required for proper stackwalking of RuntimeStub frame
2339    __ ret(0);
2340
2341    return start;
2342  }
2343
2344  //
2345  //  Generate 'unsafe' array copy stub
2346  //  Though just as safe as the other stubs, it takes an unscaled
2347  //  size_t argument instead of an element count.
2348  //
2349  //  Input:
2350  //    c_rarg0   - source array address
2351  //    c_rarg1   - destination array address
2352  //    c_rarg2   - byte count, treated as ssize_t, can be zero
2353  //
2354  // Examines the alignment of the operands and dispatches
2355  // to a long, int, short, or byte copy loop.
2356  //
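  // Dispatch sketch: OR-ing the two addresses and the size folds their low
  // bits together, so a single test per granularity decides the widest safe
  // element size (illustrative only):
  //
  //   uintptr_t bits = (uintptr_t)from | (uintptr_t)to | size;
  //   if ((bits & 7) == 0) return long_copy (from, to, size >> 3);
  //   if ((bits & 3) == 0) return int_copy  (from, to, size >> 2);
  //   if ((bits & 1) == 0) return short_copy(from, to, size >> 1);
  //   return byte_copy(from, to, size);
  //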
2357  address generate_unsafe_copy(const char *name,
2358                               address byte_copy_entry, address short_copy_entry,
2359                               address int_copy_entry, address long_copy_entry) {
2360
2361    Label L_long_aligned, L_int_aligned, L_short_aligned;
2362
2363    // Input registers (before setup_arg_regs)
2364    const Register from        = c_rarg0;  // source array address
2365    const Register to          = c_rarg1;  // destination array address
2366    const Register size        = c_rarg2;  // byte count (size_t)
2367
2368    // Register used as a temp
2369    const Register bits        = rax;      // test copy of low bits
2370
2371    __ align(CodeEntryAlignment);
2372    StubCodeMark mark(this, "StubRoutines", name);
2373    address start = __ pc();
2374
2375    __ enter(); // required for proper stackwalking of RuntimeStub frame
2376
2377    // bump this on entry, not on exit:
2378    inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2379
2380    __ mov(bits, from);
2381    __ orptr(bits, to);
2382    __ orptr(bits, size);
2383
2384    __ testb(bits, BytesPerLong-1);
2385    __ jccb(Assembler::zero, L_long_aligned);
2386
2387    __ testb(bits, BytesPerInt-1);
2388    __ jccb(Assembler::zero, L_int_aligned);
2389
2390    __ testb(bits, BytesPerShort-1);
2391    __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
2392
2393    __ BIND(L_short_aligned);
2394    __ shrptr(size, LogBytesPerShort); // size => short_count
2395    __ jump(RuntimeAddress(short_copy_entry));
2396
2397    __ BIND(L_int_aligned);
2398    __ shrptr(size, LogBytesPerInt); // size => int_count
2399    __ jump(RuntimeAddress(int_copy_entry));
2400
2401    __ BIND(L_long_aligned);
2402    __ shrptr(size, LogBytesPerLong); // size => qword_count
2403    __ jump(RuntimeAddress(long_copy_entry));
2404
2405    return start;
2406  }
2407
2408  // Perform range checks on the proposed arraycopy.
2409  // Kills temp, but nothing else.
2410  // Also, clean the sign bits of src_pos and dst_pos.
2411  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
2412                              Register src_pos, // source position (c_rarg1)
2413                              Register dst,     // destination array oop (c_rarg2)
2414                              Register dst_pos, // destination position (c_rarg3)
2415                              Register length,
2416                              Register temp,
2417                              Label& L_failed) {
2418    BLOCK_COMMENT("arraycopy_range_checks:");
2419
2420    //  if (src_pos + length > arrayOop(src)->length())  FAIL;
2421    __ movl(temp, length);
2422    __ addl(temp, src_pos);             // src_pos + length
2423    __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
2424    __ jcc(Assembler::above, L_failed);
2425
2426    //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
2427    __ movl(temp, length);
2428    __ addl(temp, dst_pos);             // dst_pos + length
2429    __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2430    __ jcc(Assembler::above, L_failed);
2431
2432    // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
2433    // Move with sign extension can be used since they are positive.
2434    __ movslq(src_pos, src_pos);
2435    __ movslq(dst_pos, dst_pos);
2436
2437    BLOCK_COMMENT("arraycopy_range_checks done");
2438  }
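
  // The 'above' (unsigned) compares do double duty: src_pos, dst_pos and
  // length are already known to be non-negative 32-bit values, so pos +
  // length cannot wrap a 32-bit unsigned sum, and one compare per array
  // covers the whole range check. Sketch of a single check:
  //
  //   if ((juint)(src_pos + length) > (juint)arrayOop(src)->length())
  //     goto L_failed;
  //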
2439
2440  //
2441  //  Generate generic array copy stubs
2442  //
2443  //  Input:
2444  //    c_rarg0    -  src oop
2445  //    c_rarg1    -  src_pos (32-bits)
2446  //    c_rarg2    -  dst oop
2447  //    c_rarg3    -  dst_pos (32-bits)
2448  // not Win64
2449  //    c_rarg4    -  element count (32-bits)
2450  // Win64
2451  //    rsp+40     -  element count (32-bits)
2452  //
2453  //  Output:
2454  //    rax ==  0  -  success
2455  //    rax == -1^K - failure, where K is partial transfer count
2456  //
2457  address generate_generic_copy(const char *name,
2458                                address byte_copy_entry, address short_copy_entry,
2459                                address int_copy_entry, address oop_copy_entry,
2460                                address long_copy_entry, address checkcast_copy_entry) {
2461
2462    Label L_failed, L_failed_0, L_objArray;
2463    Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2464
2465    // Input registers
2466    const Register src        = c_rarg0;  // source array oop
2467    const Register src_pos    = c_rarg1;  // source position
2468    const Register dst        = c_rarg2;  // destination array oop
2469    const Register dst_pos    = c_rarg3;  // destination position
2470#ifndef _WIN64
2471    const Register length     = c_rarg4;
2472#else
2473    const Address  length(rsp, 6 * wordSize);  // elements count is on stack on Win64
2474#endif
2475
2476    { int modulus = CodeEntryAlignment;
2477      int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
2478      int advance = target - (__ offset() % modulus);
2479      if (advance < 0)  advance += modulus;
2480      if (advance > 0)  __ nop(advance);
2481    }
2482    StubCodeMark mark(this, "StubRoutines", name);
2483
2484    // Short-hop target to L_failed.  Makes for denser prologue code.
2485    __ BIND(L_failed_0);
2486    __ jmp(L_failed);
2487    assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");
2488
2489    __ align(CodeEntryAlignment);
2490    address start = __ pc();
2491
2492    __ enter(); // required for proper stackwalking of RuntimeStub frame
2493
2494    // bump this on entry, not on exit:
2495    inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2496
2497    //-----------------------------------------------------------------------
2498    // Assembler stub will be used for this call to arraycopy
2499    // if the following conditions are met:
2500    //
2501    // (1) src and dst must not be null.
2502    // (2) src_pos must not be negative.
2503    // (3) dst_pos must not be negative.
2504    // (4) length  must not be negative.
2505    // (5) src klass and dst klass should be the same and not NULL.
2506    // (6) src and dst should be arrays.
2507    // (7) src_pos + length must not exceed length of src.
2508    // (8) dst_pos + length must not exceed length of dst.
2509    //
2510
2511    //  if (src == NULL) return -1;
2512    __ testptr(src, src);         // src oop
2513    size_t j1off = __ offset();
2514    __ jccb(Assembler::zero, L_failed_0);
2515
2516    //  if (src_pos < 0) return -1;
2517    __ testl(src_pos, src_pos); // src_pos (32-bits)
2518    __ jccb(Assembler::negative, L_failed_0);
2519
2520    //  if (dst == NULL) return -1;
2521    __ testptr(dst, dst);         // dst oop
2522    __ jccb(Assembler::zero, L_failed_0);
2523
2524    //  if (dst_pos < 0) return -1;
2525    __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
2526    size_t j4off = __ offset();
2527    __ jccb(Assembler::negative, L_failed_0);
2528
2529    // The first four tests are very dense code,
2530    // but not quite dense enough to put four
2531    // jumps in a 16-byte instruction fetch buffer.
2532    // That's good, because some branch predictors
2533    // do not like jumps so close together.
2534    // Make sure of this.
2535    guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");
2536
2537    // registers used as temp
2538    const Register r11_length    = r11; // elements count to copy
2539    const Register r10_src_klass = r10; // array klass
2540
2541    //  if (length < 0) return -1;
2542    __ movl(r11_length, length);        // length (elements count, 32-bits value)
2543    __ testl(r11_length, r11_length);
2544    __ jccb(Assembler::negative, L_failed_0);
2545
2546    __ load_klass(r10_src_klass, src);
2547#ifdef ASSERT
2548    //  assert(src->klass() != NULL);
2549    {
2550      BLOCK_COMMENT("assert klasses not null {");
2551      Label L1, L2;
2552      __ testptr(r10_src_klass, r10_src_klass);
2553      __ jcc(Assembler::notZero, L2);   // it is broken if klass is NULL
2554      __ bind(L1);
2555      __ stop("broken null klass");
2556      __ bind(L2);
2557      __ load_klass(rax, dst);
2558      __ cmpq(rax, 0);
2559      __ jcc(Assembler::equal, L1);     // this would be broken also
2560      BLOCK_COMMENT("} assert klasses not null done");
2561    }
2562#endif
2563
2564    // Load layout helper (32-bits)
2565    //
2566    //  |array_tag|     | header_size | element_type |     |log2_element_size|
2567    // 32        30    24            16              8     2                 0
2568    //
2569    //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2570    //
2571
2572    const int lh_offset = in_bytes(Klass::layout_helper_offset());
2573
2574    // Handle objArrays completely differently...
2575    const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2576    __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh);
2577    __ jcc(Assembler::equal, L_objArray);
2578
2579    //  if (src->klass() != dst->klass()) return -1;
2580    __ load_klass(rax, dst);
2581    __ cmpq(r10_src_klass, rax);
2582    __ jcc(Assembler::notEqual, L_failed);
2583
2584    const Register rax_lh = rax;  // layout helper
2585    __ movl(rax_lh, Address(r10_src_klass, lh_offset));
2586
2587    //  if (!src->is_Array()) return -1;
2588    __ cmpl(rax_lh, Klass::_lh_neutral_value);
2589    __ jcc(Assembler::greaterEqual, L_failed);
2590
2591    // At this point, it is known to be a typeArray (array_tag 0x3).
2592#ifdef ASSERT
2593    {
2594      BLOCK_COMMENT("assert primitive array {");
2595      Label L;
2596      __ cmpl(rax_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
2597      __ jcc(Assembler::greaterEqual, L);
2598      __ stop("must be a primitive array");
2599      __ bind(L);
2600      BLOCK_COMMENT("} assert primitive array done");
2601    }
2602#endif
2603
2604    arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
2605                           r10, L_failed);
2606
2607    // TypeArrayKlass
2608    //
2609    // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2610    // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2611    //
2612
2613    const Register r10_offset = r10;    // array offset
2614    const Register rax_elsize = rax_lh; // element size
2615
2616    __ movl(r10_offset, rax_lh);
2617    __ shrl(r10_offset, Klass::_lh_header_size_shift);
2618    __ andptr(r10_offset, Klass::_lh_header_size_mask);   // array_offset
2619    __ addptr(src, r10_offset);           // src array offset
2620    __ addptr(dst, r10_offset);           // dst array offset
2621    BLOCK_COMMENT("choose copy loop based on element size");
2622    __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize
2623
2624    // The next registers must be set before the jump to the corresponding stub.
2625    const Register from     = c_rarg0;  // source array address
2626    const Register to       = c_rarg1;  // destination array address
2627    const Register count    = c_rarg2;  // elements count
2628
2629    // 'from', 'to' and 'count' must be set up in this order,
2630    // since they alias the incoming 'src', 'src_pos' and 'dst' registers.
2631
2632  __ BIND(L_copy_bytes);
2633    __ cmpl(rax_elsize, 0);
2634    __ jccb(Assembler::notEqual, L_copy_shorts);
2635    __ lea(from, Address(src, src_pos, Address::times_1, 0)); // src_addr
2636    __ lea(to,   Address(dst, dst_pos, Address::times_1, 0)); // dst_addr
2637    __ movl2ptr(count, r11_length); // length
2638    __ jump(RuntimeAddress(byte_copy_entry));
2639
2640  __ BIND(L_copy_shorts);
2641    __ cmpl(rax_elsize, LogBytesPerShort);
2642    __ jccb(Assembler::notEqual, L_copy_ints);
2643    __ lea(from, Address(src, src_pos, Address::times_2, 0)); // src_addr
2644    __ lea(to,   Address(dst, dst_pos, Address::times_2, 0)); // dst_addr
2645    __ movl2ptr(count, r11_length); // length
2646    __ jump(RuntimeAddress(short_copy_entry));
2647
2648  __ BIND(L_copy_ints);
2649    __ cmpl(rax_elsize, LogBytesPerInt);
2650    __ jccb(Assembler::notEqual, L_copy_longs);
2651    __ lea(from, Address(src, src_pos, Address::times_4, 0)); // src_addr
2652    __ lea(to,   Address(dst, dst_pos, Address::times_4, 0)); // dst_addr
2653    __ movl2ptr(count, r11_length); // length
2654    __ jump(RuntimeAddress(int_copy_entry));
2655
2656  __ BIND(L_copy_longs);
2657#ifdef ASSERT
2658    {
2659      BLOCK_COMMENT("assert long copy {");
2660      Label L;
2661      __ cmpl(rax_elsize, LogBytesPerLong);
2662      __ jcc(Assembler::equal, L);
2663      __ stop("must be long copy, but elsize is wrong");
2664      __ bind(L);
2665      BLOCK_COMMENT("} assert long copy done");
2666    }
2667#endif
2668    __ lea(from, Address(src, src_pos, Address::times_8, 0)); // src_addr
2669    __ lea(to,   Address(dst, dst_pos, Address::times_8, 0)); // dst_addr
2670    __ movl2ptr(count, r11_length); // length
2671    __ jump(RuntimeAddress(long_copy_entry));
2672
2673    // ObjArrayKlass
2674  __ BIND(L_objArray);
2675    // live at this point:  r10_src_klass, r11_length, src[_pos], dst[_pos]
2676
2677    Label L_plain_copy, L_checkcast_copy;
2678    //  test array classes for subtyping
2679    __ load_klass(rax, dst);
2680    __ cmpq(r10_src_klass, rax); // usual case is exact equality
2681    __ jcc(Assembler::notEqual, L_checkcast_copy);
2682
2683    // Identically typed arrays can be copied without element-wise checks.
2684    arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
2685                           r10, L_failed);
2686
2687    __ lea(from, Address(src, src_pos, TIMES_OOP,
2688                 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
2689    __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
2690                 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
2691    __ movl2ptr(count, r11_length); // length
2692  __ BIND(L_plain_copy);
2693    __ jump(RuntimeAddress(oop_copy_entry));
2694
2695  __ BIND(L_checkcast_copy);
2696    // live at this point:  r10_src_klass, r11_length, rax (dst_klass)
2697    {
2698      // Before looking at dst.length, make sure dst is also an objArray.
2699      __ cmpl(Address(rax, lh_offset), objArray_lh);
2700      __ jcc(Assembler::notEqual, L_failed);
2701
2702      // It is safe to examine both src.length and dst.length.
2703      arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
2704                             rax, L_failed);
2705
2706      const Register r11_dst_klass = r11;
2707      __ load_klass(r11_dst_klass, dst); // reload
2708
2709      // Marshal the base address arguments now, freeing registers.
2710      __ lea(from, Address(src, src_pos, TIMES_OOP,
2711                   arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
2712      __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
2713                   arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
2714      __ movl(count, length);           // length (reloaded)
2715      Register sco_temp = c_rarg3;      // this register is free now
2716      assert_different_registers(from, to, count, sco_temp,
2717                                 r11_dst_klass, r10_src_klass);
2718      assert_clean_int(count, sco_temp);
2719
2720      // Generate the type check.
2721      const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2722      __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
2723      assert_clean_int(sco_temp, rax);
2724      generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);
2725
2726      // Fetch destination element klass from the ObjArrayKlass header.
2727      int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2728      __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
2729      __ movl(  sco_temp,      Address(r11_dst_klass, sco_offset));
2730      assert_clean_int(sco_temp, rax);
2731
2732      // the checkcast_copy loop needs two extra arguments:
2733      assert(c_rarg3 == sco_temp, "#3 already in place");
2734      // Set up arguments for checkcast_copy_entry.
2735      setup_arg_regs(4);
2736      __ movptr(r8, r11_dst_klass);  // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris
2737      __ jump(RuntimeAddress(checkcast_copy_entry));
2738    }
2739
2740  __ BIND(L_failed);
2741    __ xorptr(rax, rax);
2742    __ notptr(rax); // return -1
2743    __ leave();   // required for proper stackwalking of RuntimeStub frame
2744    __ ret(0);
2745
2746    return start;
2747  }
2748
2749  void generate_arraycopy_stubs() {
2750    address entry;
2751    address entry_jbyte_arraycopy;
2752    address entry_jshort_arraycopy;
2753    address entry_jint_arraycopy;
2754    address entry_oop_arraycopy;
2755    address entry_jlong_arraycopy;
2756    address entry_checkcast_arraycopy;
2757
2758    StubRoutines::_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(false, &entry,
2759                                                                           "jbyte_disjoint_arraycopy");
2760    StubRoutines::_jbyte_arraycopy           = generate_conjoint_byte_copy(false, entry, &entry_jbyte_arraycopy,
2761                                                                           "jbyte_arraycopy");
2762
2763    StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry,
2764                                                                            "jshort_disjoint_arraycopy");
2765    StubRoutines::_jshort_arraycopy          = generate_conjoint_short_copy(false, entry, &entry_jshort_arraycopy,
2766                                                                            "jshort_arraycopy");
2767
2768    StubRoutines::_jint_disjoint_arraycopy   = generate_disjoint_int_oop_copy(false, false, &entry,
2769                                                                              "jint_disjoint_arraycopy");
2770    StubRoutines::_jint_arraycopy            = generate_conjoint_int_oop_copy(false, false, entry,
2771                                                                              &entry_jint_arraycopy, "jint_arraycopy");
2772
2773    StubRoutines::_jlong_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, false, &entry,
2774                                                                               "jlong_disjoint_arraycopy");
2775    StubRoutines::_jlong_arraycopy           = generate_conjoint_long_oop_copy(false, false, entry,
2776                                                                               &entry_jlong_arraycopy, "jlong_arraycopy");
2777
2778
2779    if (UseCompressedOops) {
2780      StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_int_oop_copy(false, true, &entry,
2781                                                                              "oop_disjoint_arraycopy");
2782      StubRoutines::_oop_arraycopy           = generate_conjoint_int_oop_copy(false, true, entry,
2783                                                                              &entry_oop_arraycopy, "oop_arraycopy");
2784      StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_int_oop_copy(false, true, &entry,
2785                                                                                     "oop_disjoint_arraycopy_uninit",
2786                                                                                     /*dest_uninitialized*/true);
2787      StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_int_oop_copy(false, true, entry,
2788                                                                                     NULL, "oop_arraycopy_uninit",
2789                                                                                     /*dest_uninitialized*/true);
2790    } else {
2791      StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, true, &entry,
2792                                                                               "oop_disjoint_arraycopy");
2793      StubRoutines::_oop_arraycopy           = generate_conjoint_long_oop_copy(false, true, entry,
2794                                                                               &entry_oop_arraycopy, "oop_arraycopy");
2795      StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_long_oop_copy(false, true, &entry,
2796                                                                                      "oop_disjoint_arraycopy_uninit",
2797                                                                                      /*dest_uninitialized*/true);
2798      StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_long_oop_copy(false, true, entry,
2799                                                                                      NULL, "oop_arraycopy_uninit",
2800                                                                                      /*dest_uninitialized*/true);
2801    }
2802
2803    StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2804    StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2805                                                                        /*dest_uninitialized*/true);
2806
2807    StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2808                                                              entry_jbyte_arraycopy,
2809                                                              entry_jshort_arraycopy,
2810                                                              entry_jint_arraycopy,
2811                                                              entry_jlong_arraycopy);
2812    StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2813                                                               entry_jbyte_arraycopy,
2814                                                               entry_jshort_arraycopy,
2815                                                               entry_jint_arraycopy,
2816                                                               entry_oop_arraycopy,
2817                                                               entry_jlong_arraycopy,
2818                                                               entry_checkcast_arraycopy);
2819
2820    StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2821    StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2822    StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2823    StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2824    StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2825    StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2826
2827    // We don't generate specialized code for HeapWord-aligned source
2828    // arrays, so just use the code we've already generated
2829    StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = StubRoutines::_jbyte_disjoint_arraycopy;
2830    StubRoutines::_arrayof_jbyte_arraycopy           = StubRoutines::_jbyte_arraycopy;
2831
2832    StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
2833    StubRoutines::_arrayof_jshort_arraycopy          = StubRoutines::_jshort_arraycopy;
2834
2835    StubRoutines::_arrayof_jint_disjoint_arraycopy   = StubRoutines::_jint_disjoint_arraycopy;
2836    StubRoutines::_arrayof_jint_arraycopy            = StubRoutines::_jint_arraycopy;
2837
2838    StubRoutines::_arrayof_jlong_disjoint_arraycopy  = StubRoutines::_jlong_disjoint_arraycopy;
2839    StubRoutines::_arrayof_jlong_arraycopy           = StubRoutines::_jlong_arraycopy;
2840
2841    StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
2842    StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;
2843
2844    StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit    = StubRoutines::_oop_disjoint_arraycopy_uninit;
2845    StubRoutines::_arrayof_oop_arraycopy_uninit             = StubRoutines::_oop_arraycopy_uninit;
2846  }
2847
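  // A note on the pattern shared by all of the math stubs below: the Java
  // calling convention passes doubles in XMM registers, but the hardware
  // transcendental support used here (flog/flog10, trigfunc, and the
  // *_with_fallback helpers) lives in the x87 unit. Each stub therefore
  // round-trips its argument through a stack slot:
  //
  //   __ subq(rsp, 8);                   // carve out a scratch slot
  //   __ movdbl(Address(rsp, 0), xmm0);  // spill the XMM argument
  //   __ fld_d(Address(rsp, 0));         // reload it into ST(0)
  //   ... compute in x87 ...
  //   __ fstp_d(Address(rsp, 0));        // spill the x87 result
  //   __ movdbl(xmm0, Address(rsp, 0));  // reload it into the XMM return register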
2848  void generate_math_stubs() {
2849    {
2850      StubCodeMark mark(this, "StubRoutines", "log");
2851      StubRoutines::_intrinsic_log = (double (*)(double)) __ pc();
2852
2853      __ subq(rsp, 8);
2854      __ movdbl(Address(rsp, 0), xmm0);
2855      __ fld_d(Address(rsp, 0));
2856      __ flog();
2857      __ fstp_d(Address(rsp, 0));
2858      __ movdbl(xmm0, Address(rsp, 0));
2859      __ addq(rsp, 8);
2860      __ ret(0);
2861    }
2862    {
2863      StubCodeMark mark(this, "StubRoutines", "log10");
2864      StubRoutines::_intrinsic_log10 = (double (*)(double)) __ pc();
2865
2866      __ subq(rsp, 8);
2867      __ movdbl(Address(rsp, 0), xmm0);
2868      __ fld_d(Address(rsp, 0));
2869      __ flog10();
2870      __ fstp_d(Address(rsp, 0));
2871      __ movdbl(xmm0, Address(rsp, 0));
2872      __ addq(rsp, 8);
2873      __ ret(0);
2874    }
2875    {
2876      StubCodeMark mark(this, "StubRoutines", "sin");
2877      StubRoutines::_intrinsic_sin = (double (*)(double)) __ pc();
2878
2879      __ subq(rsp, 8);
2880      __ movdbl(Address(rsp, 0), xmm0);
2881      __ fld_d(Address(rsp, 0));
2882      __ trigfunc('s');
2883      __ fstp_d(Address(rsp, 0));
2884      __ movdbl(xmm0, Address(rsp, 0));
2885      __ addq(rsp, 8);
2886      __ ret(0);
2887    }
2888    {
2889      StubCodeMark mark(this, "StubRoutines", "cos");
2890      StubRoutines::_intrinsic_cos = (double (*)(double)) __ pc();
2891
2892      __ subq(rsp, 8);
2893      __ movdbl(Address(rsp, 0), xmm0);
2894      __ fld_d(Address(rsp, 0));
2895      __ trigfunc('c');
2896      __ fstp_d(Address(rsp, 0));
2897      __ movdbl(xmm0, Address(rsp, 0));
2898      __ addq(rsp, 8);
2899      __ ret(0);
2900    }
2901    {
2902      StubCodeMark mark(this, "StubRoutines", "tan");
2903      StubRoutines::_intrinsic_tan = (double (*)(double)) __ pc();
2904
2905      __ subq(rsp, 8);
2906      __ movdbl(Address(rsp, 0), xmm0);
2907      __ fld_d(Address(rsp, 0));
2908      __ trigfunc('t');
2909      __ fstp_d(Address(rsp, 0));
2910      __ movdbl(xmm0, Address(rsp, 0));
2911      __ addq(rsp, 8);
2912      __ ret(0);
2913    }
2914    {
2915      StubCodeMark mark(this, "StubRoutines", "exp");
2916      StubRoutines::_intrinsic_exp = (double (*)(double)) __ pc();
2917
2918      __ subq(rsp, 8);
2919      __ movdbl(Address(rsp, 0), xmm0);
2920      __ fld_d(Address(rsp, 0));
2921      __ exp_with_fallback(0);
2922      __ fstp_d(Address(rsp, 0));
2923      __ movdbl(xmm0, Address(rsp, 0));
2924      __ addq(rsp, 8);
2925      __ ret(0);
2926    }
2927    {
2928      StubCodeMark mark(this, "StubRoutines", "pow");
2929      StubRoutines::_intrinsic_pow = (double (*)(double,double)) __ pc();
2930
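      // The two loads below push Math.pow's operands onto the x87 stack:
      // the exponent (xmm1) first, then the base (xmm0), so that the base
      // ends up in ST(0) and the exponent in ST(1) for pow_with_fallback.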
2931      __ subq(rsp, 8);
2932      __ movdbl(Address(rsp, 0), xmm1);
2933      __ fld_d(Address(rsp, 0));
2934      __ movdbl(Address(rsp, 0), xmm0);
2935      __ fld_d(Address(rsp, 0));
2936      __ pow_with_fallback(0);
2937      __ fstp_d(Address(rsp, 0));
2938      __ movdbl(xmm0, Address(rsp, 0));
2939      __ addq(rsp, 8);
2940      __ ret(0);
2941    }
2942  }
2943
2944  // AES intrinsic stubs
2945  enum {AESBlockSize = 16};
2946
2947  address generate_key_shuffle_mask() {
2948    __ align(16);
2949    StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
2950    address start = __ pc();
2951    __ emit_data64( 0x0405060700010203, relocInfo::none );
2952    __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none );
2953    return start;
2954  }
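  // Read byte-by-byte in little-endian order, the mask above is
  //   03 02 01 00 07 06 05 04 0b 0a 09 08 0f 0e 0d 0c
  // so pshufb with it reverses the byte order within each 32-bit word,
  // converting the big-endian int words of the Java-expanded key into the
  // little-endian layout the AES instructions consume.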
2955
2956  // Utility routine for loading a 128-bit key word in little-endian format;
2957  // the shuffle mask may optionally be supplied already loaded in an XMM register
2958  void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
2959    __ movdqu(xmmdst, Address(key, offset));
2960    if (xmm_shuf_mask != NULL) {
2961      __ pshufb(xmmdst, xmm_shuf_mask);
2962    } else {
2963      __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2964    }
2965  }
2966
2967  // aesenc using the specified key+offset;
2968  // the shuffle mask may optionally be supplied already loaded in an XMM register
2969  void aes_enc_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
2970    load_key(xmmtmp, key, offset, xmm_shuf_mask);
2971    __ aesenc(xmmdst, xmmtmp);
2972  }
2973
2974  // aesdec using the specified key+offset;
2975  // the shuffle mask may optionally be supplied already loaded in an XMM register
2976  void aes_dec_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
2977    load_key(xmmtmp, key, offset, xmm_shuf_mask);
2978    __ aesdec(xmmdst, xmmtmp);
2979  }
2980
2981
2982  // Arguments:
2983  //
2984  // Inputs:
2985  //   c_rarg0   - source byte array address
2986  //   c_rarg1   - destination byte array address
2987  //   c_rarg2   - K (key) in little endian int array
2988  //
2989  address generate_aescrypt_encryptBlock() {
2990    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
2991    __ align(CodeEntryAlignment);
2992    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2993    Label L_doLast;
2994    address start = __ pc();
2995
2996    const Register from        = c_rarg0;  // source array address
2997    const Register to          = c_rarg1;  // destination array address
2998    const Register key         = c_rarg2;  // key array address
2999    const Register keylen      = rax;
3000
3001    const XMMRegister xmm_result = xmm0;
3002    const XMMRegister xmm_temp   = xmm1;
3003    const XMMRegister xmm_key_shuf_mask = xmm2;
3004
3005    __ enter(); // required for proper stackwalking of RuntimeStub frame
3006
3007    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3008    // keylen = # of 32-bit words, convert to 128-bit words
3009    __ shrl(keylen, 2);
3010    __ subl(keylen, 11);   // every key has at least 11 128-bit words, some have more
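    // Worked example: the expanded key holds 4*(rounds+1) ints, so
    //   AES-128: 44 ints -> 11 128-bit words -> keylen = 0
    //   AES-192: 52 ints -> 13 128-bit words -> keylen = 2
    //   AES-256: 60 ints -> 15 128-bit words -> keylen = 4
    // and the compare/subtract tests below use 0 and 2 to decide how many
    // extra rounds to execute.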
3011
3012    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3013    __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input
3014
3015    // For encryption, the Java expanded key ordering is just what we need;
3016    // we don't know if the key is aligned, hence we do not use the load-execute form
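    // (The load-execute form would be aesenc with a memory operand; its
    // legacy SSE encoding would fault on a memory operand that is not
    // 16-byte aligned, so load_key goes through the alignment-tolerant
    // movdqu instead.)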
3017
3018    load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
3019    __ pxor(xmm_result, xmm_temp);
3020    for (int offset = 0x10; offset <= 0x90; offset += 0x10) {
3021      aes_enc_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
3022    }
3023    load_key  (xmm_temp, key, 0xa0, xmm_key_shuf_mask);
3024    __ cmpl(keylen, 0);
3025    __ jcc(Assembler::equal, L_doLast);
3026    __ aesenc(xmm_result, xmm_temp);                   // only in 192 and 256 bit keys
3027    aes_enc_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
3028    load_key(xmm_temp, key, 0xc0, xmm_key_shuf_mask);
3029    __ subl(keylen, 2);
3030    __ jcc(Assembler::equal, L_doLast);
3031    __ aesenc(xmm_result, xmm_temp);                   // only in 256 bit keys
3032    aes_enc_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
3033    load_key(xmm_temp, key, 0xe0, xmm_key_shuf_mask);
3034
3035    __ BIND(L_doLast);
3036    __ aesenclast(xmm_result, xmm_temp);
3037    __ movdqu(Address(to, 0), xmm_result);        // store the result
3038    __ xorptr(rax, rax); // return 0
3039    __ leave(); // required for proper stackwalking of RuntimeStub frame
3040    __ ret(0);
3041
3042    return start;
3043  }
3044
3045
3046  // Arguments:
3047  //
3048  // Inputs:
3049  //   c_rarg0   - source byte array address
3050  //   c_rarg1   - destination byte array address
3051  //   c_rarg2   - K (key) in little endian int array
3052  //
3053  address generate_aescrypt_decryptBlock() {
3054    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
3055    __ align(CodeEntryAlignment);
3056    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
3057    Label L_doLast;
3058    address start = __ pc();
3059
3060    const Register from        = c_rarg0;  // source array address
3061    const Register to          = c_rarg1;  // destination array address
3062    const Register key         = c_rarg2;  // key array address
3063    const Register keylen      = rax;
3064
3065    const XMMRegister xmm_result = xmm0;
3066    const XMMRegister xmm_temp   = xmm1;
3067    const XMMRegister xmm_key_shuf_mask = xmm2;
3068
3069    __ enter(); // required for proper stackwalking of RuntimeStub frame
3070
3071    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3072    // keylen = # of 32-bit words, convert to 128-bit words
3073    __ shrl(keylen, 2);
3074    __ subl(keylen, 11);   // every key has at least 11 128-bit words, some have more
3075
3076    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3077    __ movdqu(xmm_result, Address(from, 0));
3078
3079    // for decryption, the Java expanded key ordering is rotated one position from what we want,
3080    // so we start from 0x10 here and hit 0x00 last
3081    // we don't know if the key is aligned, hence not using load-execute form
3082    load_key(xmm_temp, key, 0x10, xmm_key_shuf_mask);
3083    __ pxor  (xmm_result, xmm_temp);
3084    for (int offset = 0x20; offset <= 0xa0; offset += 0x10) {
3085      aes_dec_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
3086    }
3087    __ cmpl(keylen, 0);
3088    __ jcc(Assembler::equal, L_doLast);
3089    // only in 192 and 256 bit keys
3090    aes_dec_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
3091    aes_dec_key(xmm_result, xmm_temp, key, 0xc0, xmm_key_shuf_mask);
3092    __ subl(keylen, 2);
3093    __ jcc(Assembler::equal, L_doLast);
3094    // only in 256 bit keys
3095    aes_dec_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
3096    aes_dec_key(xmm_result, xmm_temp, key, 0xe0, xmm_key_shuf_mask);
3097
3098    __ BIND(L_doLast);
3099    // for decryption the aesdeclast operation is always on key+0x00
3100    load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
3101    __ aesdeclast(xmm_result, xmm_temp);
3102
3103    __ movdqu(Address(to, 0), xmm_result);  // store the result
3104
3105    __ xorptr(rax, rax); // return 0
3106    __ leave(); // required for proper stackwalking of RuntimeStub frame
3107    __ ret(0);
3108
3109    return start;
3110  }
3111
3112
3113  // Arguments:
3114  //
3115  // Inputs:
3116  //   c_rarg0   - source byte array address
3117  //   c_rarg1   - destination byte array address
3118  //   c_rarg2   - K (key) in little endian int array
3119  //   c_rarg3   - r vector byte array address
3120  //   c_rarg4   - input length
3121  //
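  // CBC encryption computes the recurrence
  //   C_0 = IV,  C_i = Encrypt_K(P_i xor C_{i-1})
  // Each output block feeds the next input block, so the chain is inherently
  // serial; that is why this stub processes one block per loop iteration,
  // while the decrypt stub below can work on four at a time.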
3122  address generate_cipherBlockChaining_encryptAESCrypt() {
3123    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
3124    __ align(CodeEntryAlignment);
3125    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
3126    address start = __ pc();
3127
3128    Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
3129    const Register from        = c_rarg0;  // source array address
3130    const Register to          = c_rarg1;  // destination array address
3131    const Register key         = c_rarg2;  // key array address
3132    const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector (IV) array address,
3133                                           // and left holding the last encrypted block on exit
3134#ifndef _WIN64
3135    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
3136#else
3137    const Address  len_mem(rsp, 6 * wordSize);  // length is on stack on Win64
3138    const Register len_reg     = r10;      // pick the first volatile Windows register
3139#endif
3140    const Register pos         = rax;
3141
3142    // xmm register assignments for the loops below
3143    const XMMRegister xmm_result = xmm0;
3144    const XMMRegister xmm_temp   = xmm1;
3145    // keys 0-10 preloaded into xmm2-xmm12
3146    const int XMM_REG_NUM_KEY_FIRST = 2;
3147    const int XMM_REG_NUM_KEY_LAST  = 12;
3148    const XMMRegister xmm_key0   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
3149    const XMMRegister xmm_key10  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);
3150
3151    __ enter(); // required for proper stackwalking of RuntimeStub frame
3152
3153#ifdef _WIN64
3154    // on win64, fill len_reg from stack position
3155    __ movl(len_reg, len_mem);
3156    // save the XMM registers that must be preserved (xmm6-xmm12)
3157    __ subptr(rsp, -rsp_after_call_off * wordSize);
3158    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
3159      __ movdqu(xmm_save(i), as_XMMRegister(i));
3160    }
3161#endif
3162
3163    const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
3164    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3165    // load up xmm regs 2 thru 12 with keys 0x00 - 0xa0
3166    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
3167      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
3168      offset += 0x10;
3169    }
3170
3171    __ movdqu(xmm_result, Address(rvec, 0x00));   // initialize xmm_result with r vec
3172
3173    // now split to different paths depending on the keylen (len in ints of the AESCrypt.KLE array: 44=128-bit, 52=192-bit, 60=256-bit)
3174    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3175    __ cmpl(rax, 44);
3176    __ jcc(Assembler::notEqual, L_key_192_256);
3177
3178    // 128 bit code follows here
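    // AES-128 is 10 rounds: a whitening xor with round key 0 (xmm_key0),
    // nine aesenc rounds with keys 1-9, and a closing aesenclast with
    // key 10 (xmm_key10).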
3179    __ movptr(pos, 0);
3180    __ align(OptoLoopAlignment);
3181    __ BIND(L_loopTop_128);
3182    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3183    __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3184
3185    __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3186    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
3187      __ aesenc(xmm_result, as_XMMRegister(rnum));
3188    }
3189    __ aesenclast(xmm_result, xmm_key10);
3190
3191    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3192    // no need to store r to memory until we exit
3193    __ addptr(pos, AESBlockSize);
3194    __ subptr(len_reg, AESBlockSize);
3195    __ jcc(Assembler::notEqual, L_loopTop_128);
3196
3197    __ BIND(L_exit);
3198    __ movdqu(Address(rvec, 0), xmm_result);     // final value of r stored in rvec of CipherBlockChaining object
3199
3200#ifdef _WIN64
3201    // restore xmm regs belonging to calling function
3202    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
3203      __ movdqu(as_XMMRegister(i), xmm_save(i));
3204    }
3205#endif
3206    __ movl(rax, 0); // return 0 (why?)
3207    __ leave(); // required for proper stackwalking of RuntimeStub frame
3208    __ ret(0);
3209
3210    __ BIND(L_key_192_256);
3211    // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
3212    __ cmpl(rax, 52);
3213    __ jcc(Assembler::notEqual, L_key_256);
3214
3215    // 192-bit code follows here (could be changed to use more xmm registers)
3216    __ movptr(pos, 0);
3217    __ align(OptoLoopAlignment);
3218    __ BIND(L_loopTop_192);
3219    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3220    __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3221
3222    __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3223    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
3224      __ aesenc(xmm_result, as_XMMRegister(rnum));
3225    }
3226    aes_enc_key(xmm_result, xmm_temp, key, 0xb0);
3227    load_key(xmm_temp, key, 0xc0);
3228    __ aesenclast(xmm_result, xmm_temp);
3229
3230    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3231    // no need to store r to memory until we exit
3232    __ addptr(pos, AESBlockSize);
3233    __ subptr(len_reg, AESBlockSize);
3234    __ jcc(Assembler::notEqual, L_loopTop_192);
3235    __ jmp(L_exit);
3236
3237    __ BIND(L_key_256);
3238    // 256-bit code follows here (could be changed to use more xmm registers)
3239    __ movptr(pos, 0);
3240    __ align(OptoLoopAlignment);
3241    __ BIND(L_loopTop_256);
3242    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3243    __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3244
3245    __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3246    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
3247      __ aesenc(xmm_result, as_XMMRegister(rnum));
3248    }
3249    aes_enc_key(xmm_result, xmm_temp, key, 0xb0);
3250    aes_enc_key(xmm_result, xmm_temp, key, 0xc0);
3251    aes_enc_key(xmm_result, xmm_temp, key, 0xd0);
3252    load_key(xmm_temp, key, 0xe0);
3253    __ aesenclast(xmm_result, xmm_temp);
3254
3255    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3256    // no need to store r to memory until we exit
3257    __ addptr(pos, AESBlockSize);
3258    __ subptr(len_reg, AESBlockSize);
3259    __ jcc(Assembler::notEqual, L_loopTop_256);
3260    __ jmp(L_exit);
3261
3262    return start;
3263  }
3264
3265
3266
3267  // This is a version of CBC/AES Decrypt which processes 4 blocks per loop iteration
3268  // to hide instruction latency
3269  //
3270  // Arguments:
3271  //
3272  // Inputs:
3273  //   c_rarg0   - source byte array address
3274  //   c_rarg1   - destination byte array address
3275  //   c_rarg2   - K (key) in little endian int array
3276  //   c_rarg3   - r vector byte array address
3277  //   c_rarg4   - input length
3278  //
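  // CBC decryption computes
  //   P_i = Decrypt_K(C_i) xor C_{i-1}    (with C_0 = IV)
  // Every block depends only on ciphertext that is already in memory, so four
  // independent Decrypt_K chains can be kept in flight to hide aesdec latency.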
3279
3280  address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
3281    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
3282    __ align(CodeEntryAlignment);
3283    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
3284    address start = __ pc();
3285
3286    Label L_exit, L_key_192_256, L_key_256;
3287    Label L_singleBlock_loopTop_128, L_multiBlock_loopTop_128;
3288    Label L_singleBlock_loopTop_192, L_singleBlock_loopTop_256;
3289    const Register from        = c_rarg0;  // source array address
3290    const Register to          = c_rarg1;  // destination array address
3291    const Register key         = c_rarg2;  // key array address
3292    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
3293                                           // and left with the results of the last encryption block
3294#ifndef _WIN64
3295    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
3296#else
3297    const Address  len_mem(rsp, 6 * wordSize);  // length is on stack on Win64
3298    const Register len_reg     = r10;      // pick the first volatile Windows register
3299#endif
3300    const Register pos         = rax;
3301
3302    // xmm register assignments for the loops below
3303    const XMMRegister xmm_result = xmm0;
3304    // keys 0-10 preloaded into xmm5-xmm15
3305    const int XMM_REG_NUM_KEY_FIRST = 5;
3306    const int XMM_REG_NUM_KEY_LAST  = 15;
3307    const XMMRegister xmm_key_first   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
3308    const XMMRegister xmm_key_last  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);
3309
3310    __ enter(); // required for proper stackwalking of RuntimeStub frame
3311
3312#ifdef _WIN64
3313    // on win64, fill len_reg from stack position
3314    __ movl(len_reg, len_mem);
3315    // save the XMM registers that must be preserved (xmm6-xmm15)
3316    __ subptr(rsp, -rsp_after_call_off * wordSize);
3317    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
3318      __ movdqu(xmm_save(i), as_XMMRegister(i));
3319    }
3320#endif
3321    // the Java expanded key ordering is rotated one position from what we want,
3322    // so we start from 0x10 here and hit 0x00 last
3323    const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
3324    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3325    // load up xmm regs 5 thru 15 with keys 0x10 - 0xa0, then 0x00 last
3326    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
3327      if (rnum == XMM_REG_NUM_KEY_LAST) offset = 0x00;
3328      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
3329      offset += 0x10;
3330    }
3331
3332    const XMMRegister xmm_prev_block_cipher = xmm1;  // holds cipher of previous block
3333    // registers holding the four results in the parallelized loop
3334    const XMMRegister xmm_result0 = xmm0;
3335    const XMMRegister xmm_result1 = xmm2;
3336    const XMMRegister xmm_result2 = xmm3;
3337    const XMMRegister xmm_result3 = xmm4;
3338
3339    __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));   // initialize with initial rvec
3340
3341    // now split to different paths depending on the keylen (len in ints of the AESCrypt.KLE array: 44=128-bit, 52=192-bit, 60=256-bit)
3342    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3343    __ cmpl(rax, 44);
3344    __ jcc(Assembler::notEqual, L_key_192_256);
3345
3346
3347    // 128-bit code follows here, parallelized
3348    __ movptr(pos, 0);
3349    __ align(OptoLoopAlignment);
3350    __ BIND(L_multiBlock_loopTop_128);
3351    __ cmpptr(len_reg, 4*AESBlockSize);           // see if at least 4 blocks left
3352    __ jcc(Assembler::less, L_singleBlock_loopTop_128);
3353
3354    __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0*AESBlockSize));   // get next 4 blocks into xmmresult registers
3355    __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1*AESBlockSize));
3356    __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2*AESBlockSize));
3357    __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3*AESBlockSize));
3358
3359#define DoFour(opc, src_reg)                    \
3360    __ opc(xmm_result0, src_reg);               \
3361    __ opc(xmm_result1, src_reg);               \
3362    __ opc(xmm_result2, src_reg);               \
3363    __ opc(xmm_result3, src_reg);
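    // DoFour issues one round instruction against four independent blocks
    // back to back: aesdec has multi-cycle latency but good throughput, so
    // interleaving four dependency chains keeps the AES unit busy rather
    // than stalling on each block's previous round.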
3364
3365    DoFour(pxor, xmm_key_first);
3366    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
3367      DoFour(aesdec, as_XMMRegister(rnum));
3368    }
3369    DoFour(aesdeclast, xmm_key_last);
3370    // for each result, xor with the r vector of previous cipher block
3371    __ pxor(xmm_result0, xmm_prev_block_cipher);
3372    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0*AESBlockSize));
3373    __ pxor(xmm_result1, xmm_prev_block_cipher);
3374    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1*AESBlockSize));
3375    __ pxor(xmm_result2, xmm_prev_block_cipher);
3376    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2*AESBlockSize));
3377    __ pxor(xmm_result3, xmm_prev_block_cipher);
3378    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3*AESBlockSize));   // this will carry over to next set of blocks
3379
3380    __ movdqu(Address(to, pos, Address::times_1, 0*AESBlockSize), xmm_result0);     // store 4 results into the next 64 bytes of output
3381    __ movdqu(Address(to, pos, Address::times_1, 1*AESBlockSize), xmm_result1);
3382    __ movdqu(Address(to, pos, Address::times_1, 2*AESBlockSize), xmm_result2);
3383    __ movdqu(Address(to, pos, Address::times_1, 3*AESBlockSize), xmm_result3);
3384
3385    __ addptr(pos, 4*AESBlockSize);
3386    __ subptr(len_reg, 4*AESBlockSize);
3387    __ jmp(L_multiBlock_loopTop_128);
3388
3389    // registers used in the non-parallelized loops
3390    const XMMRegister xmm_prev_block_cipher_save = xmm2;
3391    const XMMRegister xmm_temp   = xmm3;
3392
3393    __ align(OptoLoopAlignment);
3394    __ BIND(L_singleBlock_loopTop_128);
3395    __ cmpptr(len_reg, 0);           // any blocks left?
3396    __ jcc(Assembler::equal, L_exit);
3397    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
3398    __ movdqa(xmm_prev_block_cipher_save, xmm_result);              // save for next r vector
3399    __ pxor  (xmm_result, xmm_key_first);               // do the aes dec rounds
3400    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
3401      __ aesdec(xmm_result, as_XMMRegister(rnum));
3402    }
3403    __ aesdeclast(xmm_result, xmm_key_last);
3404    __ pxor  (xmm_result, xmm_prev_block_cipher);               // xor with the current r vector
3405    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3406    // no need to store r to memory until we exit
3407    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);              // set up next r vector with cipher input from this block
3408
3409    __ addptr(pos, AESBlockSize);
3410    __ subptr(len_reg, AESBlockSize);
3411    __ jmp(L_singleBlock_loopTop_128);
3412
3413
3414    __ BIND(L_exit);
3415    __ movdqu(Address(rvec, 0), xmm_prev_block_cipher);     // final value of r stored in rvec of CipherBlockChaining object
3416#ifdef _WIN64
3417    // restore regs belonging to calling function
3418    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
3419      __ movdqu(as_XMMRegister(i), xmm_save(i));
3420    }
3421#endif
3422    __ movl(rax, 0); // return 0 (why?)
3423    __ leave(); // required for proper stackwalking of RuntimeStub frame
3424    __ ret(0);
3425
3426
3427    __ BIND(L_key_192_256);
3428    // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
3429    __ cmpl(rax, 52);
3430    __ jcc(Assembler::notEqual, L_key_256);
3431
3432    // 192-bit code follows here (could be optimized to use parallelism)
3433    __ movptr(pos, 0);
3434    __ align(OptoLoopAlignment);
3435    __ BIND(L_singleBlock_loopTop_192);
3436    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
3437    __ movdqa(xmm_prev_block_cipher_save, xmm_result);              // save for next r vector
3438    __ pxor  (xmm_result, xmm_key_first);               // do the aes dec rounds
3439    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
3440      __ aesdec(xmm_result, as_XMMRegister(rnum));
3441    }
3442    aes_dec_key(xmm_result, xmm_temp, key, 0xb0);     // 192-bit key goes up to c0
3443    aes_dec_key(xmm_result, xmm_temp, key, 0xc0);
3444    __ aesdeclast(xmm_result, xmm_key_last);                    // xmm15 always came from key+0
3445    __ pxor  (xmm_result, xmm_prev_block_cipher);               // xor with the current r vector
3446    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3447    // no need to store r to memory until we exit
3448    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);              // set up next r vector with cipher input from this block
3449
3450    __ addptr(pos, AESBlockSize);
3451    __ subptr(len_reg, AESBlockSize);
3452    __ jcc(Assembler::notEqual,L_singleBlock_loopTop_192);
3453    __ jmp(L_exit);
3454
3455    __ BIND(L_key_256);
3456    // 256-bit code follows here (could be optimized to use parallelism)
3457    __ movptr(pos, 0);
3458    __ align(OptoLoopAlignment);
3459    __ BIND(L_singleBlock_loopTop_256);
3460    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
3461    __ movdqa(xmm_prev_block_cipher_save, xmm_result);              // save for next r vector
3462    __ pxor  (xmm_result, xmm_key_first);               // do the aes dec rounds
3463    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
3464      __ aesdec(xmm_result, as_XMMRegister(rnum));
3465    }
3466    aes_dec_key(xmm_result, xmm_temp, key, 0xb0);     // 256-bit key goes up to e0
3467    aes_dec_key(xmm_result, xmm_temp, key, 0xc0);
3468    aes_dec_key(xmm_result, xmm_temp, key, 0xd0);
3469    aes_dec_key(xmm_result, xmm_temp, key, 0xe0);
3470    __ aesdeclast(xmm_result, xmm_key_last);             // xmm15 came from key+0
3471    __ pxor  (xmm_result, xmm_prev_block_cipher);               // xor with the current r vector
3472    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3473    // no need to store r to memory until we exit
3474    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);              // set up next r vector with cipher input from this block
3475
3476    __ addptr(pos, AESBlockSize);
3477    __ subptr(len_reg, AESBlockSize);
3478    __ jcc(Assembler::notEqual,L_singleBlock_loopTop_256);
3479    __ jmp(L_exit);
3480
3481    return start;
3482  }
3483
3484
3485
3486#undef __
3487#define __ masm->
3488
3489  // Continuation point for throwing of implicit exceptions that are
3490  // not handled in the current activation. Fabricates an exception
3491  // oop and initiates normal exception dispatching in this
3492  // frame. Since we need to preserve callee-saved values (currently
3493  // only for C2, but done for C1 as well) we need a callee-saved oop
3494  // map and therefore have to make these stubs into RuntimeStubs
3495  // rather than BufferBlobs.  If the compiler needs all registers to
3496  // be preserved between the fault point and the exception handler
3497  // then it must assume responsibility for that in
3498  // AbstractCompiler::continuation_for_implicit_null_exception or
3499  // continuation_for_implicit_division_by_zero_exception. All other
3500  // implicit exceptions (e.g., NullPointerException or
3501  // AbstractMethodError on entry) are either at call sites or
3502  // otherwise assume that stack unwinding will be initiated, so
3503  // caller-saved registers are assumed volatile in the compiler.
3504  address generate_throw_exception(const char* name,
3505                                   address runtime_entry,
3506                                   Register arg1 = noreg,
3507                                   Register arg2 = noreg) {
3508    // Information about frame layout at time of blocking runtime call.
3509    // Note that we only have to preserve callee-saved registers since
3510    // the compilers are responsible for supplying a continuation point
3511    // if they expect all registers to be preserved.
3512    enum layout {
3513      rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
3514      rbp_off2,
3515      return_off,
3516      return_off2,
3517      framesize // inclusive of return address
3518    };
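    // A sketch of the resulting frame, counted in 32-bit slots upward from
    // rsp (arg_reg_save_area_bytes is the Win64 shadow space; it is zero on
    // the other platforms):
    //   [rsp + 0]                      argument register save area, if any
    //   [rsp + rbp_off*BytesPerInt]    saved rbp       (two slots)
    //   [rsp + return_off*BytesPerInt] return address  (two slots)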
3519
3520    int insts_size = 512;
3521    int locs_size  = 64;
3522
3523    CodeBuffer code(name, insts_size, locs_size);
3524    OopMapSet* oop_maps  = new OopMapSet();
3525    MacroAssembler* masm = new MacroAssembler(&code);
3526
3527    address start = __ pc();
3528
3529    // This is an inlined and slightly modified version of call_VM
3530    // which has the ability to fetch the return PC out of
3531    // thread-local storage and also sets up last_Java_sp slightly
3532    // differently than the real call_VM
3533
3534    __ enter(); // required for proper stackwalking of RuntimeStub frame
3535
3536    assert(is_even(framesize/2), "sp not 16-byte aligned");
3537
3538    // return address and rbp are already in place
3539    __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
3540
3541    int frame_complete = __ pc() - start;
3542
3543    // Set up last_Java_sp and last_Java_fp
3544    address the_pc = __ pc();
3545    __ set_last_Java_frame(rsp, rbp, the_pc);
3546    __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3547
3548    // Call runtime
3549    if (arg1 != noreg) {
3550      assert(arg2 != c_rarg1, "clobbered");
3551      __ movptr(c_rarg1, arg1);
3552    }
3553    if (arg2 != noreg) {
3554      __ movptr(c_rarg2, arg2);
3555    }
3556    __ movptr(c_rarg0, r15_thread);
3557    BLOCK_COMMENT("call runtime_entry");
3558    __ call(RuntimeAddress(runtime_entry));
3559
3560    // Generate oop map
3561    OopMap* map = new OopMap(framesize, 0);
3562
3563    oop_maps->add_gc_map(the_pc - start, map);
3564
3565    __ reset_last_Java_frame(true, true);
3566
3567    __ leave(); // required for proper stackwalking of RuntimeStub frame
3568
3569    // check for pending exceptions
3570#ifdef ASSERT
3571    Label L;
3572    __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()),
3573            (int32_t) NULL_WORD);
3574    __ jcc(Assembler::notEqual, L);
3575    __ should_not_reach_here();
3576    __ bind(L);
3577#endif // ASSERT
3578    __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3579
3580
3581    // codeBlob framesize is in words (not VMRegImpl::slot_size)
3582    RuntimeStub* stub =
3583      RuntimeStub::new_runtime_stub(name,
3584                                    &code,
3585                                    frame_complete,
3586                                    (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3587                                    oop_maps, false);
3588    return stub->entry_point();
3589  }
3590
3591  // Initialization
3592  void generate_initial() {
3593    // Generates all stubs and initializes the entry points
3594
3595    // This platform-specific stub is needed by generate_call_stub()
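    // 0x1F80 is the processor's power-on default MXCSR value: all six SSE
    // exception classes masked, round-to-nearest, FTZ/DAZ clear. Stubs use
    // this constant to verify (and, if needed, restore) the SSE control
    // state the VM expects.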
3596    StubRoutines::x86::_mxcsr_std        = generate_fp_mask("mxcsr_std",        0x0000000000001F80);
3597
3598    // Entry points that exist on all platforms. Note: this is code
3599    // that could be shared among different platforms - however the
3600    // benefit seems to be smaller than the disadvantage of having a
3601    // much more complicated generator structure. See also comment in
3602    // stubRoutines.hpp.
3603
3604    StubRoutines::_forward_exception_entry = generate_forward_exception();
3605
3606    StubRoutines::_call_stub_entry =
3607      generate_call_stub(StubRoutines::_call_stub_return_address);
3608
3609    // referenced by megamorphic call sites
3610    StubRoutines::_catch_exception_entry = generate_catch_exception();
3611
3612    // atomic calls
3613    StubRoutines::_atomic_xchg_entry         = generate_atomic_xchg();
3614    StubRoutines::_atomic_xchg_ptr_entry     = generate_atomic_xchg_ptr();
3615    StubRoutines::_atomic_cmpxchg_entry      = generate_atomic_cmpxchg();
3616    StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long();
3617    StubRoutines::_atomic_add_entry          = generate_atomic_add();
3618    StubRoutines::_atomic_add_ptr_entry      = generate_atomic_add_ptr();
3619    StubRoutines::_fence_entry               = generate_orderaccess_fence();
3620
3621    StubRoutines::_handler_for_unsafe_access_entry =
3622      generate_handler_for_unsafe_access();
3623
3624    // platform dependent
3625    StubRoutines::x86::_get_previous_fp_entry = generate_get_previous_fp();
3626    StubRoutines::x86::_get_previous_sp_entry = generate_get_previous_sp();
3627
3628    StubRoutines::x86::_verify_mxcsr_entry    = generate_verify_mxcsr();
3629
3630    // Build this early so it's available for the interpreter.
3631    StubRoutines::_throw_StackOverflowError_entry =
3632      generate_throw_exception("StackOverflowError throw_exception",
3633                               CAST_FROM_FN_PTR(address,
3634                                                SharedRuntime::
3635                                                throw_StackOverflowError));
3636  }
3637
3638  void generate_all() {
3639    // Generates all stubs and initializes the entry points
3640
3641    // These entry points require SharedInfo::stack0 to be set up in
3642    // non-core builds and need to be relocatable, so they each
3643    // fabricate a RuntimeStub internally.
3644    StubRoutines::_throw_AbstractMethodError_entry =
3645      generate_throw_exception("AbstractMethodError throw_exception",
3646                               CAST_FROM_FN_PTR(address,
3647                                                SharedRuntime::
3648                                                throw_AbstractMethodError));
3649
3650    StubRoutines::_throw_IncompatibleClassChangeError_entry =
3651      generate_throw_exception("IncompatibleClassChangeError throw_exception",
3652                               CAST_FROM_FN_PTR(address,
3653                                                SharedRuntime::
3654                                                throw_IncompatibleClassChangeError));
3655
3656    StubRoutines::_throw_NullPointerException_at_call_entry =
3657      generate_throw_exception("NullPointerException at call throw_exception",
3658                               CAST_FROM_FN_PTR(address,
3659                                                SharedRuntime::
3660                                                throw_NullPointerException_at_call));
3661
3662    // entry points that are platform specific
3663    StubRoutines::x86::_f2i_fixup = generate_f2i_fixup();
3664    StubRoutines::x86::_f2l_fixup = generate_f2l_fixup();
3665    StubRoutines::x86::_d2i_fixup = generate_d2i_fixup();
3666    StubRoutines::x86::_d2l_fixup = generate_d2l_fixup();
3667
3668    StubRoutines::x86::_float_sign_mask  = generate_fp_mask("float_sign_mask",  0x7FFFFFFF7FFFFFFF);
3669    StubRoutines::x86::_float_sign_flip  = generate_fp_mask("float_sign_flip",  0x8000000080000000);
3670    StubRoutines::x86::_double_sign_mask = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF);
3671    StubRoutines::x86::_double_sign_flip = generate_fp_mask("double_sign_flip", 0x8000000000000000);
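    // These masks implement branch-free abs and negate for float/double:
    // and-ing with a sign_mask clears the sign bit(s), while xor-ing with a
    // sign_flip toggles them. Note how each 64-bit float mask repeats the
    // 32-bit pattern, so the same constant covers both lanes of a pair.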
3672
3673    // support for verify_oop (must happen after universe_init)
3674    StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
3675
3676    // arraycopy stubs used by compilers
3677    generate_arraycopy_stubs();
3678
3679    generate_math_stubs();
3680
3681    // don't bother generating these AES intrinsic stubs unless the global flag is set
3682    if (UseAESIntrinsics) {
3683      StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // needed by the others
3684
3685      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
3686      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
3687      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
3688      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
3689    }
3690  }
3691
3692 public:
3693  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
3694    if (all) {
3695      generate_all();
3696    } else {
3697      generate_initial();
3698    }
3699  }
3700}; // end class declaration
3701
3702void StubGenerator_generate(CodeBuffer* code, bool all) {
3703  StubGenerator g(code, all);
3704}
3705