// stubGenerator_x86_32.cpp revision 4965:980532a806a5
/*
 * Copyright (c) 1999, 2013, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_x86.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/top.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#define __ _masm->
#define a__ ((Assembler*)_masm)->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

const int MXCSR_MASK  = 0xFFC0;  // Mask out any pending exceptions
const int FPU_CNTRL_WRD_MASK = 0xFFFF;

// -------------------------------------------------------------------------------------------------------------------------
// Stub Code definitions

static address handle_unsafe_access() {
  JavaThread* thread = JavaThread::current();
  address pc  = thread->saved_exception_pc();
  // pc is the instruction which we must emulate
  // doing a no-op is fine:  return garbage from the load
  // therefore, compute npc
  address npc = Assembler::locate_next_instruction(pc);

  // request an async exception
  thread->set_pending_unsafe_access_error();

  // return address of next instruction to execute
  return npc;
}
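
// Net effect as seen from Java (a sketch, not HotSpot code): a faulting
// unsafe access such as
//
//   unsafe.getInt(badAddress);   // SIGBUS/SIGSEGV raised here
//
// does not take down the VM: the faulting load is skipped (its result is
// garbage) and an asynchronous error is posted on the thread, to be thrown
// at a later checkpoint. The stub that wires this up is
// generate_handler_for_unsafe_access() below.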

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ incrementl(ExternalAddress((address)&counter));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif //PRODUCT

  void inc_copy_counter_np(BasicType t) {
#ifndef PRODUCT
    switch (t) {
    case T_BYTE:    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); return;
    case T_SHORT:   inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); return;
    case T_INT:     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); return;
    case T_LONG:    inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); return;
    case T_OBJECT:  inc_counter_np(SharedRuntime::_oop_array_copy_ctr); return;
    }
    ShouldNotReachHere();
#endif //PRODUCT
  }

  //------------------------------------------------------------------------------------------------------------------------
  // Call stubs are used to call Java from C
  //
  //    [ return_from_Java     ] <--- rsp
  //    [ argument word n      ]
  //      ...
  // -N [ argument word 1      ]
  // -7 [ Possible padding for stack alignment ]
  // -6 [ Possible padding for stack alignment ]
  // -5 [ Possible padding for stack alignment ]
  // -4 [ mxcsr save           ] <--- rsp_after_call
  // -3 [ saved rbx            ]
  // -2 [ saved rsi            ]
  // -1 [ saved rdi            ]
  //  0 [ saved rbp            ] <--- rbp
  //  1 [ return address       ]
  //  2 [ ptr. to call wrapper ]
  //  3 [ result               ]
  //  4 [ result_type          ]
  //  5 [ method               ]
  //  6 [ entry_point          ]
  //  7 [ parameters           ]
  //  8 [ parameter_size       ]
  //  9 [ thread               ]

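  // For reference, the VM side reaches this stub through the CallStub
  // function-pointer type from stubRoutines.hpp, passing exactly the
  // arguments laid out above; roughly (a sketch, see javaCalls.cpp for
  // the real call):
  //
  //   StubRoutines::call_stub()(
  //       (address)&link,                  // ptr. to call wrapper
  //       result, result_type,
  //       method, entry_point,
  //       parameters, parameter_size, thread);
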

  address generate_call_stub(address& return_address) {
    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    // stub code parameters / addresses
    assert(frame::entry_frame_call_wrapper_offset == 2, "adjust this code");
    bool  sse_save = false;
    const Address rsp_after_call(rbp, -4 * wordSize); // same as in generate_catch_exception()!
    const int     locals_count_in_bytes  (4*wordSize);
    const Address mxcsr_save    (rbp, -4 * wordSize);
    const Address saved_rbx     (rbp, -3 * wordSize);
    const Address saved_rsi     (rbp, -2 * wordSize);
    const Address saved_rdi     (rbp, -1 * wordSize);
    const Address result        (rbp,  3 * wordSize);
    const Address result_type   (rbp,  4 * wordSize);
    const Address method        (rbp,  5 * wordSize);
    const Address entry_point   (rbp,  6 * wordSize);
    const Address parameters    (rbp,  7 * wordSize);
    const Address parameter_size(rbp,  8 * wordSize);
    const Address thread        (rbp,  9 * wordSize); // same as in generate_catch_exception()!
    sse_save = UseSSE > 0;

    // stub code
    __ enter();
    __ movptr(rcx, parameter_size);              // parameter counter
    __ shlptr(rcx, Interpreter::logStackElementSize); // convert parameter count to bytes
    __ addptr(rcx, locals_count_in_bytes);       // reserve space for register saves
    __ subptr(rsp, rcx);
    __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack

    // save rdi, rsi, and rbx according to C calling conventions
    __ movptr(saved_rdi, rdi);
    __ movptr(saved_rsi, rsi);
    __ movptr(saved_rbx, rbx);
    // save and initialize %mxcsr
    if (sse_save) {
      Label skip_ldmx;
      __ stmxcsr(mxcsr_save);
      __ movl(rax, mxcsr_save);
      __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
      ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
      __ cmp32(rax, mxcsr_std);
      __ jcc(Assembler::equal, skip_ldmx);
      __ ldmxcsr(mxcsr_std);
      __ bind(skip_ldmx);
    }

    // make sure the control word is correct.
    __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));

#ifdef ASSERT
    // make sure we have no pending exceptions
    { Label L;
      __ movptr(rcx, thread);
      __ cmpptr(Address(rcx, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
      __ jcc(Assembler::equal, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ bind(L);
    }
#endif

    // pass parameters if any
    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    __ movl(rcx, parameter_size);  // parameter counter
    __ testl(rcx, rcx);
    __ jcc(Assembler::zero, parameters_done);

    // parameter passing loop

    Label loop;
    // Copy Java parameters in reverse order (receiver last)
    // Note that the argument order is inverted in the process
    // source is rdx[rcx: N-1..0]
    // dest   is rsp[rbx: 0..N-1]

    __ movptr(rdx, parameters);          // parameter pointer
    __ xorptr(rbx, rbx);

    __ BIND(loop);

    // get parameter
    __ movptr(rax, Address(rdx, rcx, Interpreter::stackElementScale(), -wordSize));
    __ movptr(Address(rsp, rbx, Interpreter::stackElementScale(),
                    Interpreter::expr_offset_in_bytes(0)), rax);          // store parameter
    __ increment(rbx);
    __ decrement(rcx);
    __ jcc(Assembler::notZero, loop);

    // call Java function
    __ BIND(parameters_done);
    __ movptr(rbx, method);           // get Method*
    __ movptr(rax, entry_point);      // get entry_point
    __ mov(rsi, rsp);                 // set sender sp
    BLOCK_COMMENT("call Java function");
    __ call(rax);

    BLOCK_COMMENT("call_stub_return_address:");
    return_address = __ pc();

#ifdef COMPILER2
    {
      Label L_skip;
      if (UseSSE >= 2) {
        __ verify_FPU(0, "call_stub_return");
      } else {
        for (int i = 1; i < 8; i++) {
          __ ffree(i);
        }

        // UseSSE <= 1 so double result should be left on TOS
        __ movl(rsi, result_type);
        __ cmpl(rsi, T_DOUBLE);
        __ jcc(Assembler::equal, L_skip);
        if (UseSSE == 0) {
          // UseSSE == 0 so float result should be left on TOS
          __ cmpl(rsi, T_FLOAT);
          __ jcc(Assembler::equal, L_skip);
        }
        __ ffree(0);
      }
      __ BIND(L_skip);
    }
#endif // COMPILER2

    // store result depending on type
    // (everything that is not T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    __ movptr(rdi, result);
    Label is_long, is_float, is_double, exit;
    __ movl(rsi, result_type);
    __ cmpl(rsi, T_LONG);
    __ jcc(Assembler::equal, is_long);
    __ cmpl(rsi, T_FLOAT);
    __ jcc(Assembler::equal, is_float);
    __ cmpl(rsi, T_DOUBLE);
    __ jcc(Assembler::equal, is_double);

    // handle T_INT case
    __ movl(Address(rdi, 0), rax);
    __ BIND(exit);

    // check that FPU stack is empty
    __ verify_FPU(0, "generate_call_stub");

    // pop parameters
    __ lea(rsp, rsp_after_call);

    // restore %mxcsr
    if (sse_save) {
      __ ldmxcsr(mxcsr_save);
    }

    // restore rdi, rsi and rbx
    __ movptr(rbx, saved_rbx);
    __ movptr(rsi, saved_rsi);
    __ movptr(rdi, saved_rdi);
    __ addptr(rsp, 4*wordSize);

    // return
    __ pop(rbp);
    __ ret(0);

    // handle return types different from T_INT
    __ BIND(is_long);
    __ movl(Address(rdi, 0 * wordSize), rax);
    __ movl(Address(rdi, 1 * wordSize), rdx);
    __ jmp(exit);

    __ BIND(is_float);
    // interpreter uses xmm0 for return values
    if (UseSSE >= 1) {
      __ movflt(Address(rdi, 0), xmm0);
    } else {
      __ fstp_s(Address(rdi, 0));
    }
    __ jmp(exit);

    __ BIND(is_double);
    // interpreter uses xmm0 for return values
    if (UseSSE >= 2) {
      __ movdbl(Address(rdi, 0), xmm0);
    } else {
      __ fstp_d(Address(rdi, 0));
    }
    __ jmp(exit);

    return start;
  }


  //------------------------------------------------------------------------------------------------------------------------
  // Return point for a Java call if there's an exception thrown in Java code.
  // The exception is caught and transformed into a pending exception stored in
  // JavaThread that can be tested from within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case of an exception
  //       crossing an activation frame boundary, that is not the case if the callee
  //       is compiled code => we need to set up rsp.
  //
  // rax: exception oop

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    const Address rsp_after_call(rbp, -4 * wordSize); // same as in generate_call_stub()!
    const Address thread        (rbp,  9 * wordSize); // same as in generate_call_stub()!
    address start = __ pc();

    // get thread directly
    __ movptr(rcx, thread);
#ifdef ASSERT
    // verify that threads correspond
    { Label L;
      __ get_thread(rbx);
      __ cmpptr(rbx, rcx);
      __ jcc(Assembler::equal, L);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif
    // set pending exception
    __ verify_oop(rax);
    __ movptr(Address(rcx, Thread::pending_exception_offset()), rax);
    __ lea(Address(rcx, Thread::exception_file_offset()),
           ExternalAddress((address)__FILE__));
    __ movl(Address(rcx, Thread::exception_line_offset()), __LINE__);
    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL, "_call_stub_return_address must have been generated before");
    __ jump(RuntimeAddress(StubRoutines::_call_stub_return_address));

    return start;
  }


  //------------------------------------------------------------------------------------------------------------------------
  // Continuation point for runtime calls returning with a pending exception.
  // The pending exception check happened in the runtime or native call stub.
  // The pending exception in Thread is converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // rax: exception
  // rdx: throwing pc
  //
  // NOTE: At entry of this stub, the exception pc must be on the stack!

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();
    const Register thread = rcx;

    // other registers used in this stub
    const Register exception_oop = rax;
    const Register handler_addr  = rbx;
    const Register exception_pc  = rdx;

    // Upon entry, the sp points to the return address returning into Java
    // (interpreted or compiled) code; i.e., the return address becomes the
    // throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack but
    // the exception handler will reset the stack pointer -> ignore them.
    // A potential result in registers can be ignored as well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    { Label L;
      __ get_thread(thread);
      __ cmpptr(Address(thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
      __ jcc(Assembler::notEqual, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into rbx
    __ get_thread(thread);
    __ movptr(exception_pc, Address(rsp, 0));
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), thread, exception_pc);
    __ mov(handler_addr, rax);

    // set up rax and rdx, remove the return address, and clear the pending exception
    __ get_thread(thread);
    __ pop(exception_pc);
    __ movptr(exception_oop, Address(thread, Thread::pending_exception_offset()));
    __ movptr(Address(thread, Thread::pending_exception_offset()), NULL_WORD);

#ifdef ASSERT
    // make sure exception is set
    { Label L;
      __ testptr(exception_oop, exception_oop);
      __ jcc(Assembler::notEqual, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // Verify that there is really a valid exception in rax.
    __ verify_oop(exception_oop);

    // continue at exception handler (return address removed)
    // rax: exception
    // rbx: exception handler
    // rdx: throwing pc
    __ jmp(handler_addr);

    return start;
  }


  //----------------------------------------------------------------------------------------------------
  // Support for jint Atomic::xchg(jint exchange_value, volatile jint* dest)
  //
  // xchg exists as far back as 8086, lock needed for MP only
  // Stack layout immediately after call:
  //
  // 0 [ret addr ] <--- rsp
  // 1 [  ex     ]
  // 2 [  dest   ]
  //
  // Result:   *dest <- ex, return (old *dest)
  //
  // Note: win32 does not currently use this code

  address generate_atomic_xchg() {
    StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
    address start = __ pc();

    __ push(rdx);
    Address exchange(rsp, 2 * wordSize);
    Address dest_addr(rsp, 3 * wordSize);
    __ movl(rax, exchange);
    __ movptr(rdx, dest_addr);
    __ xchgl(rax, Address(rdx, 0));
    __ pop(rdx);
    __ ret(0);

    return start;
  }
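
  // In C terms the stub implements (a sketch):
  //
  //   jint atomic_xchg(jint exchange_value, volatile jint* dest) {
  //     jint old = *dest;          // both steps happen as one atomic
  //     *dest = exchange_value;    // operation: xchg with a memory operand
  //     return old;                // is implicitly locked on x86
  //   }
  //
  // Note how the argument slots shift: after push(rdx), 'ex' sits at rsp+8
  // (2 * wordSize) and 'dest' at rsp+12 (3 * wordSize).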

  //----------------------------------------------------------------------------------------------------
  // Support for void verify_mxcsr()
  //
  // This routine is used with -Xcheck:jni to verify that native
  // JNI code does not return to Java code without restoring the
  // MXCSR register to our expected state.


  address generate_verify_mxcsr() {
    StubCodeMark mark(this, "StubRoutines", "verify_mxcsr");
    address start = __ pc();

    const Address mxcsr_save(rsp, 0);

    if (CheckJNICalls && UseSSE > 0) {
      Label ok_ret;
      ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
      __ push(rax);
      __ subptr(rsp, wordSize);      // allocate a temp location
      __ stmxcsr(mxcsr_save);
      __ movl(rax, mxcsr_save);
      __ andl(rax, MXCSR_MASK);
      __ cmp32(rax, mxcsr_std);
      __ jcc(Assembler::equal, ok_ret);

      __ warn("MXCSR changed by native JNI code.");

      __ ldmxcsr(mxcsr_std);

      __ bind(ok_ret);
      __ addptr(rsp, wordSize);
      __ pop(rax);
    }

    __ ret(0);

    return start;
  }


  //---------------------------------------------------------------------------
  // Support for void verify_fpu_cntrl_wrd()
  //
  // This routine is used with -Xcheck:jni to verify that native
  // JNI code does not return to Java code without restoring the
  // FP control word to our expected state.

  address generate_verify_fpu_cntrl_wrd() {
    StubCodeMark mark(this, "StubRoutines", "verify_fpcw");
    address start = __ pc();

    const Address fpu_cntrl_wrd_save(rsp, 0);

    if (CheckJNICalls) {
      Label ok_ret;
      __ push(rax);
      __ subptr(rsp, wordSize);      // allocate a temp location
      __ fnstcw(fpu_cntrl_wrd_save);
      __ movl(rax, fpu_cntrl_wrd_save);
      __ andl(rax, FPU_CNTRL_WRD_MASK);
      ExternalAddress fpu_std(StubRoutines::addr_fpu_cntrl_wrd_std());
      __ cmp32(rax, fpu_std);
      __ jcc(Assembler::equal, ok_ret);

      __ warn("Floating point control word changed by native JNI code.");

      __ fldcw(fpu_std);

      __ bind(ok_ret);
      __ addptr(rsp, wordSize);
      __ pop(rax);
    }

    __ ret(0);

    return start;
  }

  //---------------------------------------------------------------------------
  // Wrapper for slow-case handling of double-to-integer conversion.
  // The d2i or f2i fast path failed either because the value is NaN or
  // because of under/overflow.
  // Input:  FPU TOS: double value
  // Output: rax (rdx): integer (long) result

  address generate_d2i_wrapper(BasicType t, address fcn) {
    StubCodeMark mark(this, "StubRoutines", "d2i_wrapper");
    address start = __ pc();

  // Capture info about frame layout
  enum layout { FPUState_off         = 0,
                rbp_off              = FPUStateSizeInWords,
                rdi_off,
                rsi_off,
                rcx_off,
                rbx_off,
                saved_argument_off,
                saved_argument_off2, // 2nd half of double
                framesize
  };

  assert(FPUStateSizeInWords == 27, "update stack layout");

    // Save outgoing argument to stack across push_FPU_state()
    __ subptr(rsp, wordSize * 2);
    __ fstp_d(Address(rsp, 0));

    // Save CPU & FPU state
    __ push(rbx);
    __ push(rcx);
    __ push(rsi);
    __ push(rdi);
    __ push(rbp);
    __ push_FPU_state();

    // push_FPU_state() resets the FP top of stack
    // Load original double into FP top of stack
    __ fld_d(Address(rsp, saved_argument_off * wordSize));
    // Store double into stack as outgoing argument
    __ subptr(rsp, wordSize*2);
    __ fst_d(Address(rsp, 0));

    // Prepare FPU for doing math in C-land
    __ empty_FPU_stack();
    // Call the C code to massage the double.  Result in EAX
    if (t == T_INT)
      { BLOCK_COMMENT("SharedRuntime::d2i"); }
    else if (t == T_LONG)
      { BLOCK_COMMENT("SharedRuntime::d2l"); }
    __ call_VM_leaf( fcn, 2 );

    // Restore CPU & FPU state
    __ pop_FPU_state();
    __ pop(rbp);
    __ pop(rdi);
    __ pop(rsi);
    __ pop(rcx);
    __ pop(rbx);
    __ addptr(rsp, wordSize * 2);

    __ ret(0);

    return start;
  }
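
  // The C helpers plugged in as 'fcn' (SharedRuntime::d2i / d2l) supply the
  // Java-mandated semantics the hardware fast path cannot; roughly (a
  // sketch, not the exact VM code):
  //
  //   jint d2i(jdouble x) {
  //     if (x != x)        return 0;         // NaN converts to 0
  //     if (x >= max_jint) return max_jint;  // saturate on overflow
  //     if (x <= min_jint) return min_jint;  // saturate on underflow
  //     return (jint)x;
  //   }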


  //---------------------------------------------------------------------------
  // The following routine generates a subroutine to throw an asynchronous
  // UnknownError when an unsafe access gets a fault that could not be
  // reasonably prevented by the programmer.  (Example: SIGBUS/OBJERR.)
  address generate_handler_for_unsafe_access() {
    StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
    address start = __ pc();

    __ push(0);                       // hole for return address-to-be
    __ pusha();                       // push registers
    Address next_pc(rsp, RegisterImpl::number_of_registers * BytesPerWord);
    BLOCK_COMMENT("call handle_unsafe_access");
    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, handle_unsafe_access)));
    __ movptr(next_pc, rax);          // stuff next address
    __ popa();
    __ ret(0);                        // jump to next address

    return start;
  }


  //----------------------------------------------------------------------------------------------------
  // Non-destructive plausibility checks for oops

  address generate_verify_oop() {
    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    // Incoming arguments on stack after saving rax:
    //
    // [tos    ]: saved rdx
    // [tos + 1]: saved EFLAGS
    // [tos + 2]: return address
    // [tos + 3]: char* error message
    // [tos + 4]: oop   object to verify
    // [tos + 5]: saved rax - saved by caller and bashed

    Label exit, error;
    __ pushf();
    __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ push(rdx);                                // save rdx
    // make sure object is 'reasonable'
    __ movptr(rax, Address(rsp, 4 * wordSize));    // get object
    __ testptr(rax, rax);
    __ jcc(Assembler::zero, exit);               // if obj is NULL it is ok

    // Check if the oop is in the right area of memory
    const int oop_mask = Universe::verify_oop_mask();
    const int oop_bits = Universe::verify_oop_bits();
    __ mov(rdx, rax);
    __ andptr(rdx, oop_mask);
    __ cmpptr(rdx, oop_bits);
    __ jcc(Assembler::notZero, error);

    // make sure klass is 'reasonable' (i.e., not zero)
    __ movptr(rax, Address(rax, oopDesc::klass_offset_in_bytes())); // get klass
    __ testptr(rax, rax);
    __ jcc(Assembler::zero, error);              // if klass is NULL it is broken
    // TODO: Future assert that klass is lower 4g memory for UseCompressedKlassPointers

    // return if everything seems ok
    __ bind(exit);
    __ movptr(rax, Address(rsp, 5 * wordSize));  // get saved rax back
    __ pop(rdx);                                 // restore rdx
    __ popf();                                   // restore EFLAGS
    __ ret(3 * wordSize);                        // pop arguments

    // handle errors
    __ bind(error);
    __ movptr(rax, Address(rsp, 5 * wordSize));  // get saved rax back
    __ pop(rdx);                                 // get saved rdx back
    __ popf();                                   // get saved EFLAGS off stack -- will be ignored
    __ pusha();                                  // push registers (eip = return address & msg are already pushed)
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
    __ popa();
    __ ret(3 * wordSize);                        // pop arguments
    return start;
  }
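
  // A caller (MacroAssembler::verify_oop) reaches this stub roughly as
  // follows (a sketch; see macroAssembler_x86.cpp for the real sequence):
  //
  //   __ push(rax);                 // slot the stub may bash
  //   __ push(oop_to_verify);
  //   __ push((intptr_t)msg);       // char* error message
  //   __ call(verify_oop_subroutine_entry);
  //
  // The stub's ret(3 * wordSize) pops msg, oop, and the saved-rax slot.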

  //
  //  Generate pre-barrier for array stores
  //
  //  Input:
  //     start   -  starting address
  //     count   -  element count
  void gen_write_ref_array_pre_barrier(Register start, Register count, bool uninitialized_target) {
    assert_different_registers(start, count);
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
      case BarrierSet::G1SATBCT:
      case BarrierSet::G1SATBCTLogging:
        // With G1, don't generate the call if we statically know that the target is uninitialized
        if (!uninitialized_target) {
           __ pusha();                      // push registers
           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre),
                           start, count);
           __ popa();
         }
        break;
      case BarrierSet::CardTableModRef:
      case BarrierSet::CardTableExtension:
      case BarrierSet::ModRef:
        break;
      default:
        ShouldNotReachHere();
    }
  }


  //
  // Generate a post-barrier for an array store
  //
  //     start    -  starting address
  //     count    -  element count
  //
  //  The two input registers are overwritten.
  //
  void gen_write_ref_array_post_barrier(Register start, Register count) {
    BarrierSet* bs = Universe::heap()->barrier_set();
    assert_different_registers(start, count);
    switch (bs->kind()) {
      case BarrierSet::G1SATBCT:
      case BarrierSet::G1SATBCTLogging:
        {
          __ pusha();                      // push registers
          __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post),
                          start, count);
          __ popa();
        }
        break;

      case BarrierSet::CardTableModRef:
      case BarrierSet::CardTableExtension:
        {
          CardTableModRefBS* ct = (CardTableModRefBS*)bs;
          assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");

          Label L_loop;
          const Register end = count;  // elements count; end == start+count-1
          assert_different_registers(start, end);

          __ lea(end,  Address(start, count, Address::times_ptr, -wordSize));
          __ shrptr(start, CardTableModRefBS::card_shift);
          __ shrptr(end,   CardTableModRefBS::card_shift);
          __ subptr(end, start); // end --> count
        __ BIND(L_loop);
          intptr_t disp = (intptr_t) ct->byte_map_base;
          Address cardtable(start, count, Address::times_1, disp);
          __ movb(cardtable, 0);
          __ decrement(count);
          __ jcc(Assembler::greaterEqual, L_loop);
        }
        break;
      case BarrierSet::ModRef:
        break;
      default:
        ShouldNotReachHere();
    }
  }
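
  // Card marking, in C terms (a sketch): each card covers 2^card_shift bytes
  // of heap, and dirtying the cards for the oop range [start, last] is
  //
  //   jbyte* bm = ct->byte_map_base;
  //   for (uintptr_t c = (uintptr_t)start >> card_shift;
  //        c <= (uintptr_t)last >> card_shift; c++)
  //     bm[c] = 0;   // 0 == dirty card, as in the movb above
  //
  // The assembly loop runs the same scan backwards, from the last card down.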


  // Copy 64-byte chunks
  //
  // Inputs:
  //   from        - source array address
  //   to_from     - destination array address - from
  //   qword_count - count of 8-byte elements
  //
  void xmm_copy_forward(Register from, Register to_from, Register qword_count) {
    assert( UseSSE >= 2, "supported cpu only" );
    Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
    // Copy 64-byte chunks
    __ jmpb(L_copy_64_bytes);
    __ align(OptoLoopAlignment);
  __ BIND(L_copy_64_bytes_loop);

    if (UseUnalignedLoadStores) {
      if (UseAVX >= 2) {
        __ vmovdqu(xmm0, Address(from,  0));
        __ vmovdqu(Address(from, to_from, Address::times_1,  0), xmm0);
        __ vmovdqu(xmm1, Address(from, 32));
        __ vmovdqu(Address(from, to_from, Address::times_1, 32), xmm1);
      } else {
        __ movdqu(xmm0, Address(from, 0));
        __ movdqu(Address(from, to_from, Address::times_1, 0), xmm0);
        __ movdqu(xmm1, Address(from, 16));
        __ movdqu(Address(from, to_from, Address::times_1, 16), xmm1);
        __ movdqu(xmm2, Address(from, 32));
        __ movdqu(Address(from, to_from, Address::times_1, 32), xmm2);
        __ movdqu(xmm3, Address(from, 48));
        __ movdqu(Address(from, to_from, Address::times_1, 48), xmm3);
      }
    } else {
      __ movq(xmm0, Address(from, 0));
      __ movq(Address(from, to_from, Address::times_1, 0), xmm0);
      __ movq(xmm1, Address(from, 8));
      __ movq(Address(from, to_from, Address::times_1, 8), xmm1);
      __ movq(xmm2, Address(from, 16));
      __ movq(Address(from, to_from, Address::times_1, 16), xmm2);
      __ movq(xmm3, Address(from, 24));
      __ movq(Address(from, to_from, Address::times_1, 24), xmm3);
      __ movq(xmm4, Address(from, 32));
      __ movq(Address(from, to_from, Address::times_1, 32), xmm4);
      __ movq(xmm5, Address(from, 40));
      __ movq(Address(from, to_from, Address::times_1, 40), xmm5);
      __ movq(xmm6, Address(from, 48));
      __ movq(Address(from, to_from, Address::times_1, 48), xmm6);
      __ movq(xmm7, Address(from, 56));
      __ movq(Address(from, to_from, Address::times_1, 56), xmm7);
    }

    __ addl(from, 64);
  __ BIND(L_copy_64_bytes);
    __ subl(qword_count, 8);
    __ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop);

    if (UseUnalignedLoadStores && (UseAVX >= 2)) {
      // clean upper bits of YMM registers
      __ vzeroupper();
    }
    __ addl(qword_count, 8);
    __ jccb(Assembler::zero, L_exit);
    //
    // length is too short, just copy qwords
    //
  __ BIND(L_copy_8_bytes);
    __ movq(xmm0, Address(from, 0));
    __ movq(Address(from, to_from, Address::times_1), xmm0);
    __ addl(from, 8);
    __ decrement(qword_count);
    __ jcc(Assembler::greater, L_copy_8_bytes);
  __ BIND(L_exit);
  }
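
  // The addressing trick used throughout these copy loops: with
  // to_from == to - from, the destination slot for the source data at
  // (from + disp) is
  //
  //   Address(from, to_from, Address::times_1, disp)
  //     == from + (to - from) + disp == to + disp
  //
  // so a single induction register ('from') drives both streams.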

  // Copy 64-byte chunks
  //
  // Inputs:
  //   from        - source array address
  //   to_from     - destination array address - from
  //   qword_count - count of 8-byte elements
  //
  void mmx_copy_forward(Register from, Register to_from, Register qword_count) {
    assert( VM_Version::supports_mmx(), "supported cpu only" );
    Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
    // Copy 64-byte chunks
    __ jmpb(L_copy_64_bytes);
    __ align(OptoLoopAlignment);
  __ BIND(L_copy_64_bytes_loop);
    __ movq(mmx0, Address(from, 0));
    __ movq(mmx1, Address(from, 8));
    __ movq(mmx2, Address(from, 16));
    __ movq(Address(from, to_from, Address::times_1, 0), mmx0);
    __ movq(mmx3, Address(from, 24));
    __ movq(Address(from, to_from, Address::times_1, 8), mmx1);
    __ movq(mmx4, Address(from, 32));
    __ movq(Address(from, to_from, Address::times_1, 16), mmx2);
    __ movq(mmx5, Address(from, 40));
    __ movq(Address(from, to_from, Address::times_1, 24), mmx3);
    __ movq(mmx6, Address(from, 48));
    __ movq(Address(from, to_from, Address::times_1, 32), mmx4);
    __ movq(mmx7, Address(from, 56));
    __ movq(Address(from, to_from, Address::times_1, 40), mmx5);
    __ movq(Address(from, to_from, Address::times_1, 48), mmx6);
    __ movq(Address(from, to_from, Address::times_1, 56), mmx7);
    __ addptr(from, 64);
  __ BIND(L_copy_64_bytes);
    __ subl(qword_count, 8);
    __ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop);
    __ addl(qword_count, 8);
    __ jccb(Assembler::zero, L_exit);
    //
    // length is too short, just copy qwords
    //
  __ BIND(L_copy_8_bytes);
    __ movq(mmx0, Address(from, 0));
    __ movq(Address(from, to_from, Address::times_1), mmx0);
    __ addptr(from, 8);
    __ decrement(qword_count);
    __ jcc(Assembler::greater, L_copy_8_bytes);
  __ BIND(L_exit);
    __ emms();
  }

  address generate_disjoint_copy(BasicType t, bool aligned,
                                 Address::ScaleFactor sf,
                                 address* entry, const char *name,
                                 bool dest_uninitialized = false) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_0_count, L_exit, L_skip_align1, L_skip_align2, L_copy_byte;
    Label L_copy_2_bytes, L_copy_4_bytes, L_copy_64_bytes;

    int shift = Address::times_ptr - sf;

    const Register from     = rsi;  // source array address
    const Register to       = rdi;  // destination array address
    const Register count    = rcx;  // elements count
    const Register to_from  = to;   // (to - from)
    const Register saved_to = rdx;  // saved destination array address

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ push(rsi);
    __ push(rdi);
    __ movptr(from , Address(rsp, 12+ 4));
    __ movptr(to   , Address(rsp, 12+ 8));
    __ movl(count, Address(rsp, 12+ 12));

    if (entry != NULL) {
      *entry = __ pc(); // Entry point from conjoint arraycopy stub.
      BLOCK_COMMENT("Entry:");
    }

    if (t == T_OBJECT) {
      __ testl(count, count);
      __ jcc(Assembler::zero, L_0_count);
      gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
      __ mov(saved_to, to);          // save 'to'
    }

    __ subptr(to, from); // to --> to_from
    __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
    __ jcc(Assembler::below, L_copy_4_bytes); // use unsigned cmp
    if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
      // align source address at 4-byte address boundary
      if (t == T_BYTE) {
        // One-byte misalignment happens only for byte arrays
        __ testl(from, 1);
        __ jccb(Assembler::zero, L_skip_align1);
        __ movb(rax, Address(from, 0));
        __ movb(Address(from, to_from, Address::times_1, 0), rax);
        __ increment(from);
        __ decrement(count);
      __ BIND(L_skip_align1);
      }
      // Two-byte misalignment happens only for byte and short (char) arrays
      __ testl(from, 2);
      __ jccb(Assembler::zero, L_skip_align2);
      __ movw(rax, Address(from, 0));
      __ movw(Address(from, to_from, Address::times_1, 0), rax);
      __ addptr(from, 2);
      __ subl(count, 1<<(shift-1));
    __ BIND(L_skip_align2);
    }
    if (!VM_Version::supports_mmx()) {
      __ mov(rax, count);      // save 'count'
      __ shrl(count, shift);   // dword count
      __ addptr(to_from, from);// restore 'to'
      __ rep_mov();
      __ subptr(to_from, from);// restore 'to_from'
      __ mov(count, rax);      // restore 'count'
      __ jmpb(L_copy_2_bytes); // all dwords were copied
    } else {
      if (!UseUnalignedLoadStores) {
        // align to 8 bytes, we know we are 4 byte aligned to start
        __ testptr(from, 4);
        __ jccb(Assembler::zero, L_copy_64_bytes);
        __ movl(rax, Address(from, 0));
        __ movl(Address(from, to_from, Address::times_1, 0), rax);
        __ addptr(from, 4);
        __ subl(count, 1<<shift);
      }
    __ BIND(L_copy_64_bytes);
      __ mov(rax, count);
      __ shrl(rax, shift+1);  // count of 8-byte chunks
      //
      // Copy 8-byte chunks through XMM or MMX registers, 8 per iteration of the loop
      //
      if (UseXMMForArrayCopy) {
        xmm_copy_forward(from, to_from, rax);
      } else {
        mmx_copy_forward(from, to_from, rax);
      }
    }
    // copy trailing dword
  __ BIND(L_copy_4_bytes);
    __ testl(count, 1<<shift);
    __ jccb(Assembler::zero, L_copy_2_bytes);
    __ movl(rax, Address(from, 0));
    __ movl(Address(from, to_from, Address::times_1, 0), rax);
    if (t == T_BYTE || t == T_SHORT) {
      __ addptr(from, 4);
    __ BIND(L_copy_2_bytes);
      // copy trailing word
      __ testl(count, 1<<(shift-1));
      __ jccb(Assembler::zero, L_copy_byte);
      __ movw(rax, Address(from, 0));
      __ movw(Address(from, to_from, Address::times_1, 0), rax);
      if (t == T_BYTE) {
        __ addptr(from, 2);
      __ BIND(L_copy_byte);
        // copy trailing byte
        __ testl(count, 1);
        __ jccb(Assembler::zero, L_exit);
        __ movb(rax, Address(from, 0));
        __ movb(Address(from, to_from, Address::times_1, 0), rax);
      __ BIND(L_exit);
      } else {
      __ BIND(L_copy_byte);
      }
    } else {
    __ BIND(L_copy_2_bytes);
    }

    if (t == T_OBJECT) {
      __ movl(count, Address(rsp, 12+12)); // reread 'count'
      __ mov(to, saved_to); // restore 'to'
      gen_write_ref_array_post_barrier(to, count);
    __ BIND(L_0_count);
    }
    inc_copy_counter_np(t);
    __ pop(rdi);
    __ pop(rsi);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ xorptr(rax, rax); // return 0
    __ ret(0);
    return start;
  }


  address generate_fill(BasicType t, bool aligned, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    BLOCK_COMMENT("Entry:");

    const Register to       = rdi;  // destination array address
    const Register value    = rdx;  // value
    const Register count    = rsi;  // elements count

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ push(rsi);
    __ push(rdi);
    __ movptr(to   , Address(rsp, 12+ 4));
    __ movl(value, Address(rsp, 12+ 8));
    __ movl(count, Address(rsp, 12+ 12));

    __ generate_fill(t, aligned, to, value, count, rax, xmm0);

    __ pop(rdi);
    __ pop(rsi);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    return start;
  }

  address generate_conjoint_copy(BasicType t, bool aligned,
                                 Address::ScaleFactor sf,
                                 address nooverlap_target,
                                 address* entry, const char *name,
                                 bool dest_uninitialized = false) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_0_count, L_exit, L_skip_align1, L_skip_align2, L_copy_byte;
    Label L_copy_2_bytes, L_copy_4_bytes, L_copy_8_bytes, L_copy_8_bytes_loop;

    int shift = Address::times_ptr - sf;

    const Register src   = rax;  // source array address
    const Register dst   = rdx;  // destination array address
    const Register from  = rsi;  // source array address
    const Register to    = rdi;  // destination array address
    const Register count = rcx;  // elements count
    const Register end   = rax;  // array end address

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ push(rsi);
    __ push(rdi);
    __ movptr(src  , Address(rsp, 12+ 4));   // from
    __ movptr(dst  , Address(rsp, 12+ 8));   // to
    __ movl2ptr(count, Address(rsp, 12+12)); // count

    if (entry != NULL) {
      *entry = __ pc(); // Entry point from generic arraycopy stub.
      BLOCK_COMMENT("Entry:");
    }

    // nooverlap_target expects arguments in rsi and rdi.
    __ mov(from, src);
    __ mov(to  , dst);

    // arrays overlap test: dispatch to disjoint stub if necessary.
    RuntimeAddress nooverlap(nooverlap_target);
    __ cmpptr(dst, src);
    __ lea(end, Address(src, count, sf, 0)); // src + count * elem_size
    __ jump_cc(Assembler::belowEqual, nooverlap);
    __ cmpptr(dst, end);
    __ jump_cc(Assembler::aboveEqual, nooverlap);

    if (t == T_OBJECT) {
      __ testl(count, count);
      __ jcc(Assembler::zero, L_0_count);
      gen_write_ref_array_pre_barrier(dst, count, dest_uninitialized);
    }

    // copy from high to low
    __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
    __ jcc(Assembler::below, L_copy_4_bytes); // use unsigned cmp
    if (t == T_BYTE || t == T_SHORT) {
      // Align the end of destination array at 4-byte address boundary
      __ lea(end, Address(dst, count, sf, 0));
      if (t == T_BYTE) {
        // One-byte misalignment happens only for byte arrays
        __ testl(end, 1);
        __ jccb(Assembler::zero, L_skip_align1);
        __ decrement(count);
        __ movb(rdx, Address(from, count, sf, 0));
        __ movb(Address(to, count, sf, 0), rdx);
      __ BIND(L_skip_align1);
      }
      // Two-byte misalignment happens only for byte and short (char) arrays
      __ testl(end, 2);
      __ jccb(Assembler::zero, L_skip_align2);
      __ subptr(count, 1<<(shift-1));
      __ movw(rdx, Address(from, count, sf, 0));
      __ movw(Address(to, count, sf, 0), rdx);
    __ BIND(L_skip_align2);
      __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
      __ jcc(Assembler::below, L_copy_4_bytes);
    }

    if (!VM_Version::supports_mmx()) {
      __ std();
      __ mov(rax, count); // Save 'count'
      __ mov(rdx, to);    // Save 'to'
      __ lea(rsi, Address(from, count, sf, -4));
      __ lea(rdi, Address(to  , count, sf, -4));
      __ shrptr(count, shift); // dword count
      __ rep_mov();
      __ cld();
      __ mov(count, rax); // restore 'count'
      __ andl(count, (1<<shift)-1);      // mask the number of rest elements
      __ movptr(from, Address(rsp, 12+4)); // reread 'from'
      __ mov(to, rdx);   // restore 'to'
      __ jmpb(L_copy_2_bytes); // all dwords were copied
    } else {
      // Align to 8 bytes the end of array. It is aligned to 4 bytes already.
      __ testptr(end, 4);
      __ jccb(Assembler::zero, L_copy_8_bytes);
      __ subl(count, 1<<shift);
      __ movl(rdx, Address(from, count, sf, 0));
      __ movl(Address(to, count, sf, 0), rdx);
      __ jmpb(L_copy_8_bytes);

      __ align(OptoLoopAlignment);
      // Move 8 bytes
    __ BIND(L_copy_8_bytes_loop);
      if (UseXMMForArrayCopy) {
        __ movq(xmm0, Address(from, count, sf, 0));
        __ movq(Address(to, count, sf, 0), xmm0);
      } else {
        __ movq(mmx0, Address(from, count, sf, 0));
        __ movq(Address(to, count, sf, 0), mmx0);
      }
    __ BIND(L_copy_8_bytes);
      __ subl(count, 2<<shift);
      __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
      __ addl(count, 2<<shift);
      if (!UseXMMForArrayCopy) {
        __ emms();
      }
    }
  __ BIND(L_copy_4_bytes);
    // copy prefix dword
    __ testl(count, 1<<shift);
    __ jccb(Assembler::zero, L_copy_2_bytes);
    __ movl(rdx, Address(from, count, sf, -4));
    __ movl(Address(to, count, sf, -4), rdx);

    if (t == T_BYTE || t == T_SHORT) {
        __ subl(count, (1<<shift));
      __ BIND(L_copy_2_bytes);
        // copy prefix word
        __ testl(count, 1<<(shift-1));
        __ jccb(Assembler::zero, L_copy_byte);
        __ movw(rdx, Address(from, count, sf, -2));
        __ movw(Address(to, count, sf, -2), rdx);
        if (t == T_BYTE) {
          __ subl(count, 1<<(shift-1));
        __ BIND(L_copy_byte);
          // copy prefix byte
          __ testl(count, 1);
          __ jccb(Assembler::zero, L_exit);
          __ movb(rdx, Address(from, 0));
          __ movb(Address(to, 0), rdx);
        __ BIND(L_exit);
        } else {
        __ BIND(L_copy_byte);
        }
    } else {
    __ BIND(L_copy_2_bytes);
    }
    if (t == T_OBJECT) {
      __ movl2ptr(count, Address(rsp, 12+12)); // reread count
      gen_write_ref_array_post_barrier(to, count);
    __ BIND(L_0_count);
    }
    inc_copy_counter_np(t);
    __ pop(rdi);
    __ pop(rsi);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ xorptr(rax, rax); // return 0
    __ ret(0);
    return start;
  }
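
  // The overlap dispatch near the top of the stub, in C terms (a sketch):
  //
  //   if (dst <= src || dst >= src + count * elem_size)
  //     goto disjoint_stub;   // a forward copy is safe
  //   // otherwise dst lies inside [src, src + size), so copy backwards,
  //   // from the highest address down, to avoid clobbering the source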


  address generate_disjoint_long_copy(address* entry, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_copy_8_bytes, L_copy_8_bytes_loop;
    const Register from       = rax;  // source array address
    const Register to         = rdx;  // destination array address
    const Register count      = rcx;  // elements count
    const Register to_from    = rdx;  // (to - from)

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ movptr(from , Address(rsp, 8+0));       // from
    __ movptr(to   , Address(rsp, 8+4));       // to
    __ movl2ptr(count, Address(rsp, 8+8));     // count

    *entry = __ pc(); // Entry point from conjoint arraycopy stub.
    BLOCK_COMMENT("Entry:");

    __ subptr(to, from); // to --> to_from
    if (VM_Version::supports_mmx()) {
      if (UseXMMForArrayCopy) {
        xmm_copy_forward(from, to_from, count);
      } else {
        mmx_copy_forward(from, to_from, count);
      }
    } else {
      __ jmpb(L_copy_8_bytes);
      __ align(OptoLoopAlignment);
    __ BIND(L_copy_8_bytes_loop);
      __ fild_d(Address(from, 0));
      __ fistp_d(Address(from, to_from, Address::times_1));
      __ addptr(from, 8);
    __ BIND(L_copy_8_bytes);
      __ decrement(count);
      __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
    }
    inc_copy_counter_np(T_LONG);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ xorptr(rax, rax); // return 0
    __ ret(0);
    return start;
  }

  address generate_conjoint_long_copy(address nooverlap_target,
                                      address* entry, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_copy_8_bytes, L_copy_8_bytes_loop;
    const Register from       = rax;  // source array address
    const Register to         = rdx;  // destination array address
    const Register count      = rcx;  // elements count
    const Register end_from   = rax;  // source array end address

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ movptr(from , Address(rsp, 8+0));       // from
    __ movptr(to   , Address(rsp, 8+4));       // to
    __ movl2ptr(count, Address(rsp, 8+8));     // count

    *entry = __ pc(); // Entry point from generic arraycopy stub.
    BLOCK_COMMENT("Entry:");

    // arrays overlap test
    __ cmpptr(to, from);
    RuntimeAddress nooverlap(nooverlap_target);
    __ jump_cc(Assembler::belowEqual, nooverlap);
    __ lea(end_from, Address(from, count, Address::times_8, 0));
    __ cmpptr(to, end_from);
    __ movptr(from, Address(rsp, 8));  // from
    __ jump_cc(Assembler::aboveEqual, nooverlap);

    __ jmpb(L_copy_8_bytes);

    __ align(OptoLoopAlignment);
  __ BIND(L_copy_8_bytes_loop);
    if (VM_Version::supports_mmx()) {
      if (UseXMMForArrayCopy) {
        __ movq(xmm0, Address(from, count, Address::times_8));
        __ movq(Address(to, count, Address::times_8), xmm0);
      } else {
        __ movq(mmx0, Address(from, count, Address::times_8));
        __ movq(Address(to, count, Address::times_8), mmx0);
      }
    } else {
      __ fild_d(Address(from, count, Address::times_8));
      __ fistp_d(Address(to, count, Address::times_8));
    }
  __ BIND(L_copy_8_bytes);
    __ decrement(count);
    __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);

    if (VM_Version::supports_mmx() && !UseXMMForArrayCopy) {
      __ emms();
    }
    inc_copy_counter_np(T_LONG);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ xorptr(rax, rax); // return 0
    __ ret(0);
    return start;
  }


  // Helper for generating a dynamic type check.
  // The sub_klass must be one of {rbx, rdx, rsi}.
  // The temp is killed.
  void generate_type_check(Register sub_klass,
                           Address& super_check_offset_addr,
                           Address& super_klass_addr,
                           Register temp,
                           Label* L_success, Label* L_failure) {
    BLOCK_COMMENT("type_check:");

    Label L_fallthrough;
#define LOCAL_JCC(assembler_con, label_ptr)                             \
    if (label_ptr != NULL)  __ jcc(assembler_con, *(label_ptr));        \
    else                    __ jcc(assembler_con, L_fallthrough) /*omit semi*/

    // The following is a strange variation of the fast path which requires
    // one less register, because needed values are on the argument stack.
    // __ check_klass_subtype_fast_path(sub_klass, *super_klass*, temp,
    //                                  L_success, L_failure, NULL);
    assert_different_registers(sub_klass, temp);

    int sc_offset = in_bytes(Klass::secondary_super_cache_offset());

    // if the pointers are equal, we are done (e.g., String[] elements)
    __ cmpptr(sub_klass, super_klass_addr);
    LOCAL_JCC(Assembler::equal, L_success);

    // check the supertype display:
    __ movl2ptr(temp, super_check_offset_addr);
    Address super_check_addr(sub_klass, temp, Address::times_1, 0);
    __ movptr(temp, super_check_addr); // load displayed supertype
    __ cmpptr(temp, super_klass_addr); // test the super type
    LOCAL_JCC(Assembler::equal, L_success);

    // if it was a primary super, we can just fail immediately
    __ cmpl(super_check_offset_addr, sc_offset);
    LOCAL_JCC(Assembler::notEqual, L_failure);

    // The repne_scan instruction uses fixed registers, which will get spilled.
    // We happen to know this works best when super_klass is in rax.
    Register super_klass = temp;
    __ movptr(super_klass, super_klass_addr);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg,
                                     L_success, L_failure);

    __ bind(L_fallthrough);

    if (L_success == NULL) { BLOCK_COMMENT("L_success:"); }
    if (L_failure == NULL) { BLOCK_COMMENT("L_failure:"); }

#undef LOCAL_JCC
  }
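
  // The check above mirrors Klass::is_subtype_of, roughly (a sketch in C
  // terms, not the exact VM code):
  //
  //   if (sub == super) return true;                     // trivial hit
  //   if (*(Klass**)((address)sub + super->super_check_offset()) == super)
  //     return true;                                     // display hit
  //   if (super->super_check_offset() != sc_offset)
  //     return false;                                    // primary super miss
  //   return scan_secondary_supers(sub, super);          // slow path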
1390
1391  //
1392  //  Generate checkcasting array copy stub
1393  //
1394  //  Input:
1395  //    4(rsp)   - source array address
1396  //    8(rsp)   - destination array address
1397  //   12(rsp)   - element count, can be zero
1398  //   16(rsp)   - size_t ckoff (super_check_offset)
1399  //   20(rsp)   - oop ckval (super_klass)
1400  //
1401  //  Output:
1402  //    rax, ==  0  -  success
1403  //    rax, == -1^K - failure, where K is partial transfer count
1404  //
1405  address generate_checkcast_copy(const char *name, address* entry, bool dest_uninitialized = false) {
1406    __ align(CodeEntryAlignment);
1407    StubCodeMark mark(this, "StubRoutines", name);
1408    address start = __ pc();
1409
1410    Label L_load_element, L_store_element, L_do_card_marks, L_done;
1411
1412    // register use:
1413    //  rax, rdx, rcx -- loop control (end_from, end_to, count)
1414    //  rdi, rsi      -- element access (oop, klass)
1415    //  rbx,           -- temp
1416    const Register from       = rax;    // source array address
1417    const Register to         = rdx;    // destination array address
1418    const Register length     = rcx;    // elements count
1419    const Register elem       = rdi;    // each oop copied
1420    const Register elem_klass = rsi;    // each elem._klass (sub_klass)
1421    const Register temp       = rbx;    // lone remaining temp
1422
1423    __ enter(); // required for proper stackwalking of RuntimeStub frame
1424
1425    __ push(rsi);
1426    __ push(rdi);
1427    __ push(rbx);
1428
1429    Address   from_arg(rsp, 16+ 4);     // from
1430    Address     to_arg(rsp, 16+ 8);     // to
1431    Address length_arg(rsp, 16+12);     // elements count
1432    Address  ckoff_arg(rsp, 16+16);     // super_check_offset
1433    Address  ckval_arg(rsp, 16+20);     // super_klass
1434
1435    // Load up:
1436    __ movptr(from,     from_arg);
1437    __ movptr(to,         to_arg);
1438    __ movl2ptr(length, length_arg);
1439
1440    if (entry != NULL) {
1441      *entry = __ pc(); // Entry point from generic arraycopy stub.
1442      BLOCK_COMMENT("Entry:");
1443    }
1444
1445    //---------------------------------------------------------------
1446    // Assembler stub will be used for this call to arraycopy
1447    // if the two arrays are subtypes of Object[] but the
1448    // destination array type is not equal to or a supertype
1449    // of the source type.  Each element must be separately
1450    // checked.
1451
1452    // Loop-invariant addresses.  They are exclusive end pointers.
1453    Address end_from_addr(from, length, Address::times_ptr, 0);
1454    Address   end_to_addr(to,   length, Address::times_ptr, 0);
1455
1456    Register end_from = from;           // re-use
1457    Register end_to   = to;             // re-use
1458    Register count    = length;         // re-use
1459
1460    // Loop-variant addresses.  They assume post-incremented count < 0.
1461    Address from_element_addr(end_from, count, Address::times_ptr, 0);
1462    Address   to_element_addr(end_to,   count, Address::times_ptr, 0);
1463    Address elem_klass_addr(elem, oopDesc::klass_offset_in_bytes());
1464
1465    // Copy from low to high addresses, indexed from the end of each array.
1466    gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
1467    __ lea(end_from, end_from_addr);
1468    __ lea(end_to,   end_to_addr);
1469    assert(length == count, "");        // else fix next line:
1470    __ negptr(count);                   // negate and test the length
1471    __ jccb(Assembler::notZero, L_load_element);
1472
1473    // Empty array:  Nothing to do.
1474    __ xorptr(rax, rax);                  // return 0 on (trivial) success
1475    __ jmp(L_done);
1476
1477    // ======== begin loop ========
1478    // (Loop is rotated; its entry is L_load_element.)
1479    // Loop control:
1480    //   for (count = -count; count != 0; count++)
1481    // Base pointers src, dst are biased by wordSize*count, to the last element.
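    // Equivalently, as a sketch ('subtype_of' here stands in for the
    // generate_type_check call below; it is not an actual VM function):
    //   for (count = -count; count != 0; count++) {
    //     oop elem = end_from[count];
    //     if (elem != NULL && !subtype_of(elem->klass(), ckval)) break;
    //     end_to[count] = elem;    // NULLs are stored without a check
    //   }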
1482    __ align(OptoLoopAlignment);
1483
1484    __ BIND(L_store_element);
1485    __ movptr(to_element_addr, elem);     // store the oop
1486    __ increment(count);                // increment the count toward zero
1487    __ jccb(Assembler::zero, L_do_card_marks);
1488
1489    // ======== loop entry is here ========
1490    __ BIND(L_load_element);
1491    __ movptr(elem, from_element_addr);   // load the oop
1492    __ testptr(elem, elem);
1493    __ jccb(Assembler::zero, L_store_element);
1494
1495    // (Could do a trick here:  Remember last successful non-null
1496    // element stored and make a quick oop equality check on it.)
1497
1498    __ movptr(elem_klass, elem_klass_addr); // query the object klass
1499    generate_type_check(elem_klass, ckoff_arg, ckval_arg, temp,
1500                        &L_store_element, NULL);
1501    // (On fall-through, we have failed the element type check.)
1502    // ======== end loop ========
1503
1504    // It was a real error; we must depend on the caller to finish the job.
1505    // Register "count" = -1 * number of *remaining* oops, length_arg = *total* oops.
1506    // Emit GC store barriers for the oops we have copied (length_arg + count),
1507    // and report their number to the caller.
1508    assert_different_registers(to, count, rax);
1509    Label L_post_barrier;
1510    __ addl(count, length_arg);         // transfers = (length - remaining)
1511    __ movl2ptr(rax, count);            // save the value
1512    __ notptr(rax);                     // report (-1^K) to caller (does not affect flags)
1513    __ jccb(Assembler::notZero, L_post_barrier);
1514    __ jmp(L_done); // K == 0, nothing was copied, skip post barrier
1515
1516    // Come here on success only.
1517    __ BIND(L_do_card_marks);
1518    __ xorptr(rax, rax);                // return 0 on success
1519    __ movl2ptr(count, length_arg);
1520
1521    __ BIND(L_post_barrier);
1522    __ movptr(to, to_arg);              // reload
1523    gen_write_ref_array_post_barrier(to, count);
1524
1525    // Common exit point (success or failure).
1526    __ BIND(L_done);
1527    __ pop(rbx);
1528    __ pop(rdi);
1529    __ pop(rsi);
1530    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1531    __ leave(); // required for proper stackwalking of RuntimeStub frame
1532    __ ret(0);
1533
1534    return start;
1535  }
1536
1537  //
1538  //  Generate 'unsafe' array copy stub
1539  //  Though just as safe as the other stubs, it takes an unscaled
1540  //  size_t argument instead of an element count.
1541  //
1542  //  Input:
1543  //    4(rsp)   - source array address
1544  //    8(rsp)   - destination array address
1545  //   12(rsp)   - byte count, can be zero
1546  //
1547  //  Output:
1548  //    rax ==  0  -  success
1549  //    rax == -1  -  need to call System.arraycopy
1550  //
1551  // Examines the alignment of the operands and dispatches
1552  // to a long, int, short, or byte copy loop.
1553  //
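  // The dispatch below is, as a sketch:
  //   bits = from | to | count;
  //   if ((bits & (BytesPerLong  - 1)) == 0) goto long_copy;
  //   if ((bits & (BytesPerInt   - 1)) == 0) goto int_copy;
  //   if ((bits & (BytesPerShort - 1)) == 0) goto short_copy;
  //   goto byte_copy;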
1554  address generate_unsafe_copy(const char *name,
1555                               address byte_copy_entry,
1556                               address short_copy_entry,
1557                               address int_copy_entry,
1558                               address long_copy_entry) {
1559
1560    Label L_long_aligned, L_int_aligned, L_short_aligned;
1561
1562    __ align(CodeEntryAlignment);
1563    StubCodeMark mark(this, "StubRoutines", name);
1564    address start = __ pc();
1565
1566    const Register from       = rax;  // source array address
1567    const Register to         = rdx;  // destination array address
1568    const Register count      = rcx;  // elements count
1569
1570    __ enter(); // required for proper stackwalking of RuntimeStub frame
1571    __ push(rsi);
1572    __ push(rdi);
1573    Address  from_arg(rsp, 12+ 4);      // from
1574    Address    to_arg(rsp, 12+ 8);      // to
1575    Address count_arg(rsp, 12+12);      // byte count
1576
1577    // Load up:
1578    __ movptr(from ,  from_arg);
1579    __ movptr(to   ,    to_arg);
1580    __ movl2ptr(count, count_arg);
1581
1582    // bump this on entry, not on exit:
1583    inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1584
1585    const Register bits = rsi;
1586    __ mov(bits, from);
1587    __ orptr(bits, to);
1588    __ orptr(bits, count);
1589
1590    __ testl(bits, BytesPerLong-1);
1591    __ jccb(Assembler::zero, L_long_aligned);
1592
1593    __ testl(bits, BytesPerInt-1);
1594    __ jccb(Assembler::zero, L_int_aligned);
1595
1596    __ testl(bits, BytesPerShort-1);
1597    __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
1598
1599    __ BIND(L_short_aligned);
1600    __ shrptr(count, LogBytesPerShort); // size => short_count
1601    __ movl(count_arg, count);          // update 'count'
1602    __ jump(RuntimeAddress(short_copy_entry));
1603
1604    __ BIND(L_int_aligned);
1605    __ shrptr(count, LogBytesPerInt); // size => int_count
1606    __ movl(count_arg, count);          // update 'count'
1607    __ jump(RuntimeAddress(int_copy_entry));
1608
1609    __ BIND(L_long_aligned);
1610    __ shrptr(count, LogBytesPerLong); // size => qword_count
1611    __ movl(count_arg, count);          // update 'count'
1612    __ pop(rdi); // Do pops here since jlong_arraycopy stub does not do it.
1613    __ pop(rsi);
1614    __ jump(RuntimeAddress(long_copy_entry));
1615
1616    return start;
1617  }
1618
1619
1620  // Perform range checks on the proposed arraycopy.
1621  // Smashes src_pos and dst_pos.  (Uses them up for temps.)
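  // Equivalently, as a sketch (the 'above' branches below are unsigned
  // compares, so they also catch wrap-around of the additions):
  //   if ((juint)(src_pos + length) > (juint)src->length()) goto L_failed;
  //   if ((juint)(dst_pos + length) > (juint)dst->length()) goto L_failed;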
1622  void arraycopy_range_checks(Register src,
1623                              Register src_pos,
1624                              Register dst,
1625                              Register dst_pos,
1626                              Address& length,
1627                              Label& L_failed) {
1628    BLOCK_COMMENT("arraycopy_range_checks:");
1629    const Register src_end = src_pos;   // source array end position
1630    const Register dst_end = dst_pos;   // destination array end position
1631    __ addl(src_end, length); // src_pos + length
1632    __ addl(dst_end, length); // dst_pos + length
1633
1634    //  if (src_pos + length > arrayOop(src)->length() ) FAIL;
1635    __ cmpl(src_end, Address(src, arrayOopDesc::length_offset_in_bytes()));
1636    __ jcc(Assembler::above, L_failed);
1637
1638    //  if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
1639    __ cmpl(dst_end, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1640    __ jcc(Assembler::above, L_failed);
1641
1642    BLOCK_COMMENT("arraycopy_range_checks done");
1643  }
1644
1645
1646  //
1647  //  Generate generic array copy stubs
1648  //
1649  //  Input:
1650  //     4(rsp)    -  src oop
1651  //     8(rsp)    -  src_pos
1652  //    12(rsp)    -  dst oop
1653  //    16(rsp)    -  dst_pos
1654  //    20(rsp)    -  element count
1655  //
1656  //  Output:
1657  //    rax ==  0  -  success
1658  //    rax == -1^K - failure, where K is partial transfer count
1659  //
1660  address generate_generic_copy(const char *name,
1661                                address entry_jbyte_arraycopy,
1662                                address entry_jshort_arraycopy,
1663                                address entry_jint_arraycopy,
1664                                address entry_oop_arraycopy,
1665                                address entry_jlong_arraycopy,
1666                                address entry_checkcast_arraycopy) {
1667    Label L_failed, L_failed_0, L_objArray;
1668
1669    { int modulus = CodeEntryAlignment;
1670      int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
1671      int advance = target - (__ offset() % modulus);
1672      if (advance < 0)  advance += modulus;
1673      if (advance > 0)  __ nop(advance);
1674    }
1675    StubCodeMark mark(this, "StubRoutines", name);
1676
1677    // Short-hop target to L_failed.  Makes for denser prologue code.
1678    __ BIND(L_failed_0);
1679    __ jmp(L_failed);
1680    assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");
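    // The nop padding above stops 5 bytes short of the alignment
    // boundary, so the 5-byte 'jmp L_failed' ends exactly on it and the
    // entry point below is aligned without further padding (as asserted).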
1681
1682    __ align(CodeEntryAlignment);
1683    address start = __ pc();
1684
1685    __ enter(); // required for proper stackwalking of RuntimeStub frame
1686    __ push(rsi);
1687    __ push(rdi);
1688
1689    // bump this on entry, not on exit:
1690    inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1691
1692    // Input values
1693    Address SRC     (rsp, 12+ 4);
1694    Address SRC_POS (rsp, 12+ 8);
1695    Address DST     (rsp, 12+12);
1696    Address DST_POS (rsp, 12+16);
1697    Address LENGTH  (rsp, 12+20);
1698
1699    //-----------------------------------------------------------------------
1700    // Assembler stub will be used for this call to arraycopy
1701    // if the following conditions are met:
1702    //
1703    // (1) src and dst must not be null.
1704    // (2) src_pos must not be negative.
1705    // (3) dst_pos must not be negative.
1706    // (4) length  must not be negative.
1707    // (5) src klass and dst klass should be the same and not NULL.
1708    // (6) src and dst should be arrays.
1709    // (7) src_pos + length must not exceed length of src.
1710    // (8) dst_pos + length must not exceed length of dst.
1711    //
1712
1713    const Register src     = rax;       // source array oop
1714    const Register src_pos = rsi;
1715    const Register dst     = rdx;       // destination array oop
1716    const Register dst_pos = rdi;
1717    const Register length  = rcx;       // transfer count
1718
1719    //  if (src == NULL) return -1;
1720    __ movptr(src, SRC);      // src oop
1721    __ testptr(src, src);
1722    __ jccb(Assembler::zero, L_failed_0);
1723
1724    //  if (src_pos < 0) return -1;
1725    __ movl2ptr(src_pos, SRC_POS);  // src_pos
1726    __ testl(src_pos, src_pos);
1727    __ jccb(Assembler::negative, L_failed_0);
1728
1729    //  if (dst == NULL) return -1;
1730    __ movptr(dst, DST);      // dst oop
1731    __ testptr(dst, dst);
1732    __ jccb(Assembler::zero, L_failed_0);
1733
1734    //  if (dst_pos < 0) return -1;
1735    __ movl2ptr(dst_pos, DST_POS);  // dst_pos
1736    __ testl(dst_pos, dst_pos);
1737    __ jccb(Assembler::negative, L_failed_0);
1738
1739    //  if (length < 0) return -1;
1740    __ movl2ptr(length, LENGTH);   // length
1741    __ testl(length, length);
1742    __ jccb(Assembler::negative, L_failed_0);
1743
1744    //  if (src->klass() == NULL) return -1;
1745    Address src_klass_addr(src, oopDesc::klass_offset_in_bytes());
1746    Address dst_klass_addr(dst, oopDesc::klass_offset_in_bytes());
1747    const Register rcx_src_klass = rcx;    // array klass
1748    __ movptr(rcx_src_klass, Address(src, oopDesc::klass_offset_in_bytes()));
1749
1750#ifdef ASSERT
1751    //  assert(src->klass() != NULL);
1752    BLOCK_COMMENT("assert klasses not null");
1753    { Label L1, L2;
1754      __ testptr(rcx_src_klass, rcx_src_klass);
1755      __ jccb(Assembler::notZero, L2);   // it is broken if klass is NULL
1756      __ bind(L1);
1757      __ stop("broken null klass");
1758      __ bind(L2);
1759      __ cmpptr(dst_klass_addr, (int32_t)NULL_WORD);
1760      __ jccb(Assembler::equal, L1);      // this would be broken also
1761      BLOCK_COMMENT("assert done");
1762    }
1763#endif //ASSERT
1764
1765    // Load layout helper (32-bits)
1766    //
1767    //  |array_tag|     | header_size | element_type |     |log2_element_size|
1768    // 32        30    24            16              8     2                 0
1769    //
1770    //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
1771    //
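    // Decoding, as a sketch in terms of the Klass constants used below:
    //   header_size = (lh >> _lh_header_size_shift) & _lh_header_size_mask;
    //   log2_esize  =  lh & _lh_log2_element_size_mask;
    //   is_array    =  lh < _lh_neutral_value;    // signed compare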
1772
1773    int lh_offset = in_bytes(Klass::layout_helper_offset());
1774    Address src_klass_lh_addr(rcx_src_klass, lh_offset);
1775
1776    // Handle objArrays completely differently...
1777    jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
1778    __ cmpl(src_klass_lh_addr, objArray_lh);
1779    __ jcc(Assembler::equal, L_objArray);
1780
1781    //  if (src->klass() != dst->klass()) return -1;
1782    __ cmpptr(rcx_src_klass, dst_klass_addr);
1783    __ jccb(Assembler::notEqual, L_failed_0);
1784
1785    const Register rcx_lh = rcx;  // layout helper
1786    assert(rcx_lh == rcx_src_klass, "known alias");
1787    __ movl(rcx_lh, src_klass_lh_addr);
1788
1789    //  if (!src->is_Array()) return -1;
1790    __ cmpl(rcx_lh, Klass::_lh_neutral_value);
1791    __ jcc(Assembler::greaterEqual, L_failed_0); // signed cmp
1792
1793    // At this point, it is known to be a typeArray (array_tag 0x3).
1794#ifdef ASSERT
1795    { Label L;
1796      __ cmpl(rcx_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
1797      __ jcc(Assembler::greaterEqual, L); // signed cmp
1798      __ stop("must be a primitive array");
1799      __ bind(L);
1800    }
1801#endif
1802
1803    assert_different_registers(src, src_pos, dst, dst_pos, rcx_lh);
1804    arraycopy_range_checks(src, src_pos, dst, dst_pos, LENGTH, L_failed);
1805
1806    // TypeArrayKlass
1807    //
1808    // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
1809    // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
1810    //
1811    const Register rsi_offset = rsi; // array offset
1812    const Register src_array  = src; // src array offset
1813    const Register dst_array  = dst; // dst array offset
1814    const Register rdi_elsize = rdi; // log2 element size
1815
1816    __ mov(rsi_offset, rcx_lh);
1817    __ shrptr(rsi_offset, Klass::_lh_header_size_shift);
1818    __ andptr(rsi_offset, Klass::_lh_header_size_mask);   // array_offset
1819    __ addptr(src_array, rsi_offset);  // src array offset
1820    __ addptr(dst_array, rsi_offset);  // dst array offset
1821    __ andptr(rcx_lh, Klass::_lh_log2_element_size_mask); // log2 elsize
1822
1823    // next registers should be set before the jump to corresponding stub
1824    const Register from       = src; // source array address
1825    const Register to         = dst; // destination array address
1826    const Register count      = rcx; // elements count
1827    // some of them should be duplicated on stack
1828#define FROM   Address(rsp, 12+ 4)
1829#define TO     Address(rsp, 12+ 8)   // Not used now
1830#define COUNT  Address(rsp, 12+12)   // Only for oop arraycopy
1831
1832    BLOCK_COMMENT("scale indexes to element size");
1833    __ movl2ptr(rsi, SRC_POS);  // src_pos
1834    __ shlptr(rsi);             // src_pos << rcx (log2 elsize)
1835    assert(src_array == from, "");
1836    __ addptr(from, rsi);       // from = src_array + SRC_POS << log2 elsize
1837    __ movl2ptr(rdi, DST_POS);  // dst_pos
1838    __ shlptr(rdi);             // dst_pos << rcx (log2 elsize)
1839    assert(dst_array == to, "");
1840    __ addptr(to,  rdi);        // to   = dst_array + DST_POS << log2 elsize
1841    __ movptr(FROM, from);      // src_addr
1842    __ mov(rdi_elsize, rcx_lh); // log2 elsize
1843    __ movl2ptr(count, LENGTH); // elements count
1844
1845    BLOCK_COMMENT("choose copy loop based on element size");
1846    __ cmpl(rdi_elsize, 0);
1847
1848    __ jump_cc(Assembler::equal, RuntimeAddress(entry_jbyte_arraycopy));
1849    __ cmpl(rdi_elsize, LogBytesPerShort);
1850    __ jump_cc(Assembler::equal, RuntimeAddress(entry_jshort_arraycopy));
1851    __ cmpl(rdi_elsize, LogBytesPerInt);
1852    __ jump_cc(Assembler::equal, RuntimeAddress(entry_jint_arraycopy));
1853#ifdef ASSERT
1854    __ cmpl(rdi_elsize, LogBytesPerLong);
1855    __ jccb(Assembler::notEqual, L_failed);
1856#endif
1857    __ pop(rdi); // Do pops here since jlong_arraycopy stub does not do it.
1858    __ pop(rsi);
1859    __ jump(RuntimeAddress(entry_jlong_arraycopy));
1860
1861  __ BIND(L_failed);
1862    __ xorptr(rax, rax);
1863    __ notptr(rax); // return -1
1864    __ pop(rdi);
1865    __ pop(rsi);
1866    __ leave(); // required for proper stackwalking of RuntimeStub frame
1867    __ ret(0);
1868
1869    // ObjArrayKlass
1870  __ BIND(L_objArray);
1871    // live at this point:  rcx_src_klass, src[_pos], dst[_pos]
1872
1873    Label L_plain_copy, L_checkcast_copy;
1874    //  test array classes for subtyping
1875    __ cmpptr(rcx_src_klass, dst_klass_addr); // usual case is exact equality
1876    __ jccb(Assembler::notEqual, L_checkcast_copy);
1877
1878    // Identically typed arrays can be copied without element-wise checks.
1879    assert_different_registers(src, src_pos, dst, dst_pos, rcx_src_klass);
1880    arraycopy_range_checks(src, src_pos, dst, dst_pos, LENGTH, L_failed);
1881
1882  __ BIND(L_plain_copy);
1883    __ movl2ptr(count, LENGTH); // elements count
1884    __ movl2ptr(src_pos, SRC_POS);  // reload src_pos
1885    __ lea(from, Address(src, src_pos, Address::times_ptr,
1886                 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
1887    __ movl2ptr(dst_pos, DST_POS);  // reload dst_pos
1888    __ lea(to,   Address(dst, dst_pos, Address::times_ptr,
1889                 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
1890    __ movptr(FROM,  from);   // src_addr
1891    __ movptr(TO,    to);     // dst_addr
1892    __ movl(COUNT, count);  // count
1893    __ jump(RuntimeAddress(entry_oop_arraycopy));
1894
1895  __ BIND(L_checkcast_copy);
1896    // live at this point:  rcx_src_klass, dst[_pos], src[_pos]
1897    {
1898      // Handy offsets:
1899      int  ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
1900      int sco_offset = in_bytes(Klass::super_check_offset_offset());
1901
1902      Register rsi_dst_klass = rsi;
1903      Register rdi_temp      = rdi;
1904      assert(rsi_dst_klass == src_pos, "expected alias w/ src_pos");
1905      assert(rdi_temp      == dst_pos, "expected alias w/ dst_pos");
1906      Address dst_klass_lh_addr(rsi_dst_klass, lh_offset);
1907
1908      // Before looking at dst.length, make sure dst is also an objArray.
1909      __ movptr(rsi_dst_klass, dst_klass_addr);
1910      __ cmpl(dst_klass_lh_addr, objArray_lh);
1911      __ jccb(Assembler::notEqual, L_failed);
1912
1913      // It is safe to examine both src.length and dst.length.
1914      __ movl2ptr(src_pos, SRC_POS);        // reload rsi
1915      arraycopy_range_checks(src, src_pos, dst, dst_pos, LENGTH, L_failed);
1916      // (Now src_pos and dst_pos are killed, but not src and dst.)
1917
1918      // We'll need this temp (don't forget to pop it after the type check).
1919      __ push(rbx);
1920      Register rbx_src_klass = rbx;
1921
1922      __ mov(rbx_src_klass, rcx_src_klass); // spill away from rcx
1923      __ movptr(rsi_dst_klass, dst_klass_addr);
1924      Address super_check_offset_addr(rsi_dst_klass, sco_offset);
1925      Label L_fail_array_check;
1926      generate_type_check(rbx_src_klass,
1927                          super_check_offset_addr, dst_klass_addr,
1928                          rdi_temp, NULL, &L_fail_array_check);
1929      // (On fall-through, we have passed the array type check.)
1930      __ pop(rbx);
1931      __ jmp(L_plain_copy);
1932
1933      __ BIND(L_fail_array_check);
1934      // Reshuffle arguments so we can call checkcast_arraycopy:
1935
1936      // match initial saves for checkcast_arraycopy
1937      // push(rsi);    // already done; see above
1938      // push(rdi);    // already done; see above
1939      // push(rbx);    // already done; see above
1940
1941      // Marshal outgoing arguments now, freeing registers.
1942      Address   from_arg(rsp, 16+ 4);   // from
1943      Address     to_arg(rsp, 16+ 8);   // to
1944      Address length_arg(rsp, 16+12);   // elements count
1945      Address  ckoff_arg(rsp, 16+16);   // super_check_offset
1946      Address  ckval_arg(rsp, 16+20);   // super_klass
1947
1948      Address SRC_POS_arg(rsp, 16+ 8);
1949      Address DST_POS_arg(rsp, 16+16);
1950      Address  LENGTH_arg(rsp, 16+20);
1951      // pushing rbx changed the incoming offsets (why not just use rbp?)
1952      // assert(SRC_POS_arg.disp() == SRC_POS.disp() + 4, "");
1953
1954      __ movptr(rbx, Address(rsi_dst_klass, ek_offset));
1955      __ movl2ptr(length, LENGTH_arg);    // reload elements count
1956      __ movl2ptr(src_pos, SRC_POS_arg);  // reload src_pos
1957      __ movl2ptr(dst_pos, DST_POS_arg);  // reload dst_pos
1958
1959      __ movptr(ckval_arg, rbx);          // destination element type
1960      __ movl(rbx, Address(rbx, sco_offset));
1961      __ movl(ckoff_arg, rbx);          // corresponding class check offset
1962
1963      __ movl(length_arg, length);      // outgoing length argument
1964
1965      __ lea(from, Address(src, src_pos, Address::times_ptr,
1966                            arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
1967      __ movptr(from_arg, from);
1968
1969      __ lea(to, Address(dst, dst_pos, Address::times_ptr,
1970                          arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
1971      __ movptr(to_arg, to);
1972      __ jump(RuntimeAddress(entry_checkcast_arraycopy));
1973    }
1974
1975    return start;
1976  }
1977
1978  void generate_arraycopy_stubs() {
1979    address entry;
1980    address entry_jbyte_arraycopy;
1981    address entry_jshort_arraycopy;
1982    address entry_jint_arraycopy;
1983    address entry_oop_arraycopy;
1984    address entry_jlong_arraycopy;
1985    address entry_checkcast_arraycopy;
1986
1987    StubRoutines::_arrayof_jbyte_disjoint_arraycopy =
1988        generate_disjoint_copy(T_BYTE,  true, Address::times_1, &entry,
1989                               "arrayof_jbyte_disjoint_arraycopy");
1990    StubRoutines::_arrayof_jbyte_arraycopy =
1991        generate_conjoint_copy(T_BYTE,  true, Address::times_1,  entry,
1992                               NULL, "arrayof_jbyte_arraycopy");
1993    StubRoutines::_jbyte_disjoint_arraycopy =
1994        generate_disjoint_copy(T_BYTE, false, Address::times_1, &entry,
1995                               "jbyte_disjoint_arraycopy");
1996    StubRoutines::_jbyte_arraycopy =
1997        generate_conjoint_copy(T_BYTE, false, Address::times_1,  entry,
1998                               &entry_jbyte_arraycopy, "jbyte_arraycopy");
1999
2000    StubRoutines::_arrayof_jshort_disjoint_arraycopy =
2001        generate_disjoint_copy(T_SHORT,  true, Address::times_2, &entry,
2002                               "arrayof_jshort_disjoint_arraycopy");
2003    StubRoutines::_arrayof_jshort_arraycopy =
2004        generate_conjoint_copy(T_SHORT,  true, Address::times_2,  entry,
2005                               NULL, "arrayof_jshort_arraycopy");
2006    StubRoutines::_jshort_disjoint_arraycopy =
2007        generate_disjoint_copy(T_SHORT, false, Address::times_2, &entry,
2008                               "jshort_disjoint_arraycopy");
2009    StubRoutines::_jshort_arraycopy =
2010        generate_conjoint_copy(T_SHORT, false, Address::times_2,  entry,
2011                               &entry_jshort_arraycopy, "jshort_arraycopy");
2012
2013    // The following arrays are always aligned on at least a 4-byte boundary.
2014    StubRoutines::_jint_disjoint_arraycopy =
2015        generate_disjoint_copy(T_INT, true, Address::times_4, &entry,
2016                               "jint_disjoint_arraycopy");
2017    StubRoutines::_jint_arraycopy =
2018        generate_conjoint_copy(T_INT, true, Address::times_4,  entry,
2019                               &entry_jint_arraycopy, "jint_arraycopy");
2020
2021    StubRoutines::_oop_disjoint_arraycopy =
2022        generate_disjoint_copy(T_OBJECT, true, Address::times_ptr, &entry,
2023                               "oop_disjoint_arraycopy");
2024    StubRoutines::_oop_arraycopy =
2025        generate_conjoint_copy(T_OBJECT, true, Address::times_ptr,  entry,
2026                               &entry_oop_arraycopy, "oop_arraycopy");
2027
2028    StubRoutines::_oop_disjoint_arraycopy_uninit =
2029        generate_disjoint_copy(T_OBJECT, true, Address::times_ptr, &entry,
2030                               "oop_disjoint_arraycopy_uninit",
2031                               /*dest_uninitialized*/true);
2032    StubRoutines::_oop_arraycopy_uninit =
2033        generate_conjoint_copy(T_OBJECT, true, Address::times_ptr,  entry,
2034                               NULL, "oop_arraycopy_uninit",
2035                               /*dest_uninitialized*/true);
2036
2037    StubRoutines::_jlong_disjoint_arraycopy =
2038        generate_disjoint_long_copy(&entry, "jlong_disjoint_arraycopy");
2039    StubRoutines::_jlong_arraycopy =
2040        generate_conjoint_long_copy(entry, &entry_jlong_arraycopy,
2041                                    "jlong_arraycopy");
2042
2043    StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2044    StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2045    StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2046    StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2047    StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2048    StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2049
2050    StubRoutines::_arrayof_jint_disjoint_arraycopy       = StubRoutines::_jint_disjoint_arraycopy;
2051    StubRoutines::_arrayof_oop_disjoint_arraycopy        = StubRoutines::_oop_disjoint_arraycopy;
2052    StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = StubRoutines::_oop_disjoint_arraycopy_uninit;
2053    StubRoutines::_arrayof_jlong_disjoint_arraycopy      = StubRoutines::_jlong_disjoint_arraycopy;
2054
2055    StubRoutines::_arrayof_jint_arraycopy       = StubRoutines::_jint_arraycopy;
2056    StubRoutines::_arrayof_oop_arraycopy        = StubRoutines::_oop_arraycopy;
2057    StubRoutines::_arrayof_oop_arraycopy_uninit = StubRoutines::_oop_arraycopy_uninit;
2058    StubRoutines::_arrayof_jlong_arraycopy      = StubRoutines::_jlong_arraycopy;
2059
2060    StubRoutines::_checkcast_arraycopy =
2061        generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2062    StubRoutines::_checkcast_arraycopy_uninit =
2063        generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, /*dest_uninitialized*/true);
2064
2065    StubRoutines::_unsafe_arraycopy =
2066        generate_unsafe_copy("unsafe_arraycopy",
2067                               entry_jbyte_arraycopy,
2068                               entry_jshort_arraycopy,
2069                               entry_jint_arraycopy,
2070                               entry_jlong_arraycopy);
2071
2072    StubRoutines::_generic_arraycopy =
2073        generate_generic_copy("generic_arraycopy",
2074                               entry_jbyte_arraycopy,
2075                               entry_jshort_arraycopy,
2076                               entry_jint_arraycopy,
2077                               entry_oop_arraycopy,
2078                               entry_jlong_arraycopy,
2079                               entry_checkcast_arraycopy);
2080  }
2081
2082  void generate_math_stubs() {
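    // Each stub below follows the 32-bit C calling convention for
    // functions returning double: the double argument(s) are on the
    // stack starting at 4(rsp), and the result is returned in x87 ST(0).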
2083    {
2084      StubCodeMark mark(this, "StubRoutines", "log");
2085      StubRoutines::_intrinsic_log = (double (*)(double)) __ pc();
2086
2087      __ fld_d(Address(rsp, 4));
2088      __ flog();
2089      __ ret(0);
2090    }
2091    {
2092      StubCodeMark mark(this, "StubRoutines", "log10");
2093      StubRoutines::_intrinsic_log10 = (double (*)(double)) __ pc();
2094
2095      __ fld_d(Address(rsp, 4));
2096      __ flog10();
2097      __ ret(0);
2098    }
2099    {
2100      StubCodeMark mark(this, "StubRoutines", "sin");
2101      StubRoutines::_intrinsic_sin = (double (*)(double))  __ pc();
2102
2103      __ fld_d(Address(rsp, 4));
2104      __ trigfunc('s');
2105      __ ret(0);
2106    }
2107    {
2108      StubCodeMark mark(this, "StubRoutines", "cos");
2109      StubRoutines::_intrinsic_cos = (double (*)(double)) __ pc();
2110
2111      __ fld_d(Address(rsp, 4));
2112      __ trigfunc('c');
2113      __ ret(0);
2114    }
2115    {
2116      StubCodeMark mark(this, "StubRoutines", "tan");
2117      StubRoutines::_intrinsic_tan = (double (*)(double)) __ pc();
2118
2119      __ fld_d(Address(rsp, 4));
2120      __ trigfunc('t');
2121      __ ret(0);
2122    }
2123    {
2124      StubCodeMark mark(this, "StubRoutines", "exp");
2125      StubRoutines::_intrinsic_exp = (double (*)(double)) __ pc();
2126
2127      __ fld_d(Address(rsp, 4));
2128      __ exp_with_fallback(0);
2129      __ ret(0);
2130    }
2131    {
2132      StubCodeMark mark(this, "StubRoutines", "pow");
2133      StubRoutines::_intrinsic_pow = (double (*)(double,double)) __ pc();
2134
2135      __ fld_d(Address(rsp, 12));
2136      __ fld_d(Address(rsp, 4));
2137      __ pow_with_fallback(0);
2138      __ ret(0);
2139    }
2140  }
2141
2142  // AES intrinsic stubs
2143  enum {AESBlockSize = 16};
2144
2145  address generate_key_shuffle_mask() {
2146    __ align(16);
2147    StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
2148    address start = __ pc();
2149    __ emit_data(0x00010203, relocInfo::none, 0 );
2150    __ emit_data(0x04050607, relocInfo::none, 0 );
2151    __ emit_data(0x08090a0b, relocInfo::none, 0 );
2152    __ emit_data(0x0c0d0e0f, relocInfo::none, 0 );
2153    return start;
2154  }
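  // The mask bytes above are [3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12]
  // in memory, so pshufb with this mask byte-swaps each 32-bit word,
  // converting the little-endian int[] key to the byte order the AES
  // instructions expect.  As a sketch of the effect:
  //   for (int i = 0; i < 16; i++) dst[i] = src[(i & ~3) | (3 - (i & 3))];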
2155
2156  // Utility routine for loading a 128-bit key word in little-endian format.
2157  // Can optionally specify that the shuffle mask is already in an XMM register.
2158  void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
2159    __ movdqu(xmmdst, Address(key, offset));
2160    if (xmm_shuf_mask != NULL) {
2161      __ pshufb(xmmdst, xmm_shuf_mask);
2162    } else {
2163      __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2164    }
2165  }
2166
2167  // aesenc using specified key+offset
2168  // can optionally specify that the shuffle mask is already in an XMM register
2169  void aes_enc_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
2170    load_key(xmmtmp, key, offset, xmm_shuf_mask);
2171    __ aesenc(xmmdst, xmmtmp);
2172  }
2173
2174  // aesdec using specified key+offset
2175  // can optionally specify that the shuffle mask is already in an XMM register
2176  void aes_dec_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
2177    load_key(xmmtmp, key, offset, xmm_shuf_mask);
2178    __ aesdec(xmmdst, xmmtmp);
2179  }
2180
2181
2182  // Arguments:
2183  //
2184  // Inputs:
2185  //   c_rarg0   - source byte array address
2186  //   c_rarg1   - destination byte array address
2187  //   c_rarg2   - K (key) in little endian int array
2188  //
2189  address generate_aescrypt_encryptBlock() {
2190    assert(UseAES, "need AES instructions and misaligned SSE support");
2191    __ align(CodeEntryAlignment);
2192    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2193    Label L_doLast;
2194    address start = __ pc();
2195
2196    const Register from        = rdx;      // source array address
2197    const Register to          = rdx;      // destination array address
2198    const Register key         = rcx;      // key array address
2199    const Register keylen      = rax;
2200    const Address  from_param(rbp, 8+0);
2201    const Address  to_param  (rbp, 8+4);
2202    const Address  key_param (rbp, 8+8);
2203
2204    const XMMRegister xmm_result = xmm0;
2205    const XMMRegister xmm_key_shuf_mask = xmm1;
2206    const XMMRegister xmm_temp1  = xmm2;
2207    const XMMRegister xmm_temp2  = xmm3;
2208    const XMMRegister xmm_temp3  = xmm4;
2209    const XMMRegister xmm_temp4  = xmm5;
2210
2211    __ enter();   // required for proper stackwalking of RuntimeStub frame
2212    __ movptr(from, from_param);
2213    __ movptr(key, key_param);
2214
2215    // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
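    // (The expanded key holds 4*(rounds+1) ints, so 44 -> 10 rounds for
    // AES-128, 52 -> 12 for AES-192, 60 -> 14 for AES-256.)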
2216    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2217
2218    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2219    __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input
2220    __ movptr(to, to_param);
2221
2222    // For encryption, the Java expanded key ordering is just what we need.
2223
2224    load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
2225    __ pxor(xmm_result, xmm_temp1);
2226
2227    load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
2228    load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
2229    load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
2230    load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
2231
2232    __ aesenc(xmm_result, xmm_temp1);
2233    __ aesenc(xmm_result, xmm_temp2);
2234    __ aesenc(xmm_result, xmm_temp3);
2235    __ aesenc(xmm_result, xmm_temp4);
2236
2237    load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
2238    load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
2239    load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
2240    load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
2241
2242    __ aesenc(xmm_result, xmm_temp1);
2243    __ aesenc(xmm_result, xmm_temp2);
2244    __ aesenc(xmm_result, xmm_temp3);
2245    __ aesenc(xmm_result, xmm_temp4);
2246
2247    load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
2248    load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
2249
2250    __ cmpl(keylen, 44);
2251    __ jccb(Assembler::equal, L_doLast);
2252
2253    __ aesenc(xmm_result, xmm_temp1);
2254    __ aesenc(xmm_result, xmm_temp2);
2255
2256    load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
2257    load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
2258
2259    __ cmpl(keylen, 52);
2260    __ jccb(Assembler::equal, L_doLast);
2261
2262    __ aesenc(xmm_result, xmm_temp1);
2263    __ aesenc(xmm_result, xmm_temp2);
2264
2265    load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
2266    load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
2267
2268    __ BIND(L_doLast);
2269    __ aesenc(xmm_result, xmm_temp1);
2270    __ aesenclast(xmm_result, xmm_temp2);
2271    __ movdqu(Address(to, 0), xmm_result);        // store the result
2272    __ xorptr(rax, rax); // return 0
2273    __ leave(); // required for proper stackwalking of RuntimeStub frame
2274    __ ret(0);
2275
2276    return start;
2277  }
2278
2279
2280  // Arguments:
2281  //
2282  // Inputs:
2283  //   c_rarg0   - source byte array address
2284  //   c_rarg1   - destination byte array address
2285  //   c_rarg2   - K (key) in little endian int array
2286  //
2287  address generate_aescrypt_decryptBlock() {
2288    assert(UseAES, "need AES instructions and misaligned SSE support");
2289    __ align(CodeEntryAlignment);
2290    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2291    Label L_doLast;
2292    address start = __ pc();
2293
2294    const Register from        = rdx;      // source array address
2295    const Register to          = rdx;      // destination array address
2296    const Register key         = rcx;      // key array address
2297    const Register keylen      = rax;
2298    const Address  from_param(rbp, 8+0);
2299    const Address  to_param  (rbp, 8+4);
2300    const Address  key_param (rbp, 8+8);
2301
2302    const XMMRegister xmm_result = xmm0;
2303    const XMMRegister xmm_key_shuf_mask = xmm1;
2304    const XMMRegister xmm_temp1  = xmm2;
2305    const XMMRegister xmm_temp2  = xmm3;
2306    const XMMRegister xmm_temp3  = xmm4;
2307    const XMMRegister xmm_temp4  = xmm5;
2308
2309    __ enter(); // required for proper stackwalking of RuntimeStub frame
2310    __ movptr(from, from_param);
2311    __ movptr(key, key_param);
2312
2313    // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
2314    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2315
2316    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2317    __ movdqu(xmm_result, Address(from, 0));
2318    __ movptr(to, to_param);
2319
2320    // For decryption, the Java expanded key ordering is rotated one position
2321    // from what we want, so we start from 0x10 here and hit 0x00 last.
2322    // We don't know whether the key is aligned, hence we avoid the load-execute form.
2323    load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
2324    load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
2325    load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
2326    load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
2327
2328    __ pxor  (xmm_result, xmm_temp1);
2329    __ aesdec(xmm_result, xmm_temp2);
2330    __ aesdec(xmm_result, xmm_temp3);
2331    __ aesdec(xmm_result, xmm_temp4);
2332
2333    load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
2334    load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
2335    load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
2336    load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
2337
2338    __ aesdec(xmm_result, xmm_temp1);
2339    __ aesdec(xmm_result, xmm_temp2);
2340    __ aesdec(xmm_result, xmm_temp3);
2341    __ aesdec(xmm_result, xmm_temp4);
2342
2343    load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
2344    load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
2345    load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);
2346
2347    __ cmpl(keylen, 44);
2348    __ jccb(Assembler::equal, L_doLast);
2349
2350    __ aesdec(xmm_result, xmm_temp1);
2351    __ aesdec(xmm_result, xmm_temp2);
2352
2353    load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
2354    load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
2355
2356    __ cmpl(keylen, 52);
2357    __ jccb(Assembler::equal, L_doLast);
2358
2359    __ aesdec(xmm_result, xmm_temp1);
2360    __ aesdec(xmm_result, xmm_temp2);
2361
2362    load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
2363    load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
2364
2365    __ BIND(L_doLast);
2366    __ aesdec(xmm_result, xmm_temp1);
2367    __ aesdec(xmm_result, xmm_temp2);
2368
2369    // for decryption the aesdeclast operation is always on key+0x00
2370    __ aesdeclast(xmm_result, xmm_temp3);
2371    __ movdqu(Address(to, 0), xmm_result);  // store the result
2372    __ xorptr(rax, rax); // return 0
2373    __ leave(); // required for proper stackwalking of RuntimeStub frame
2374    __ ret(0);
2375
2376    return start;
2377  }
2378
2379  void handleSOERegisters(bool saving) {
2380    const int saveFrameSizeInBytes = 4 * wordSize;
2381    const Address saved_rbx     (rbp, -3 * wordSize);
2382    const Address saved_rsi     (rbp, -2 * wordSize);
2383    const Address saved_rdi     (rbp, -1 * wordSize);
2384
2385    if (saving) {
2386      __ subptr(rsp, saveFrameSizeInBytes);
2387      __ movptr(saved_rsi, rsi);
2388      __ movptr(saved_rdi, rdi);
2389      __ movptr(saved_rbx, rbx);
2390    } else {
2391      // restoring
2392      __ movptr(rsi, saved_rsi);
2393      __ movptr(rdi, saved_rdi);
2394      __ movptr(rbx, saved_rbx);
2395    }
2396  }
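  // "SOE" = save-on-entry: rsi, rdi and rbx are callee-saved registers
  // here, so stubs that clobber them bracket their bodies with this
  // helper.  Typical shape, as used below:
  //   __ enter();
  //   handleSOERegisters(true /*saving*/);
  //   ... body clobbering rsi/rdi/rbx ...
  //   handleSOERegisters(false /*restoring*/);
  //   __ leave();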
2397
2398  // Arguments:
2399  //
2400  // Inputs:
2401  //   c_rarg0   - source byte array address
2402  //   c_rarg1   - destination byte array address
2403  //   c_rarg2   - K (key) in little endian int array
2404  //   c_rarg3   - r vector byte array address
2405  //   c_rarg4   - input length
2406  //
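  // Per 16-byte block, the loops below compute the CBC recurrence
  // (a sketch, with E_K denoting one AES block encryption under key K):
  //   r = E_K(plaintext[i] ^ r);  ciphertext[i] = r;
  // where r starts as the incoming r vector and the final r is written
  // back through the rvec argument on exit.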
2407  address generate_cipherBlockChaining_encryptAESCrypt() {
2408    assert(UseAES, "need AES instructions and misaligned SSE support");
2409    __ align(CodeEntryAlignment);
2410    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2411    address start = __ pc();
2412
2413    Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
2414    const Register from        = rsi;      // source array address
2415    const Register to          = rdx;      // destination array address
2416    const Register key         = rcx;      // key array address
2417    const Register rvec        = rdi;      // r byte array, initialized from the init-vector address,
2418                                           // and left holding the last encryption block on exit
2419    const Register len_reg     = rbx;      // src len (must be multiple of blocksize 16)
2420    const Register pos         = rax;
2421
2422    // xmm register assignments for the loops below
2423    const XMMRegister xmm_result = xmm0;
2424    const XMMRegister xmm_temp   = xmm1;
2425    // first 6 keys preloaded into xmm2-xmm7
2426    const int XMM_REG_NUM_KEY_FIRST = 2;
2427    const int XMM_REG_NUM_KEY_LAST  = 7;
2428    const XMMRegister xmm_key0   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
2429
2430    __ enter(); // required for proper stackwalking of RuntimeStub frame
2431    handleSOERegisters(true /*saving*/);
2432
2433    // load registers from incoming parameters
2434    const Address  from_param(rbp, 8+0);
2435    const Address  to_param  (rbp, 8+4);
2436    const Address  key_param (rbp, 8+8);
2437    const Address  rvec_param (rbp, 8+12);
2438    const Address  len_param  (rbp, 8+16);
2439    __ movptr(from , from_param);
2440    __ movptr(to   , to_param);
2441    __ movptr(key  , key_param);
2442    __ movptr(rvec , rvec_param);
2443    __ movptr(len_reg , len_param);
2444
2445    const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
2446    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2447    // load up xmm regs 2 thru 7 with keys 0-5
2448    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
2449      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
2450      offset += 0x10;
2451    }
2452
2453    __ movdqu(xmm_result, Address(rvec, 0x00));   // initialize xmm_result with r vec
2454
2455    // now split into different paths depending on the keylen (length in ints of the AESCrypt.KLE array: 44 = 128-bit, 52 = 192-bit, 60 = 256-bit)
2456    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2457    __ cmpl(rax, 44);
2458    __ jcc(Assembler::notEqual, L_key_192_256);
2459
2460    // 128-bit code follows here
2461    __ movl(pos, 0);
2462    __ align(OptoLoopAlignment);
2463    __ BIND(L_loopTop_128);
2464    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
2465    __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector
2466
2467    __ pxor  (xmm_result, xmm_key0);                                // do the aes rounds
2468    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
2469      __ aesenc(xmm_result, as_XMMRegister(rnum));
2470    }
2471    for (int key_offset = 0x60; key_offset <= 0x90; key_offset += 0x10) {
2472      aes_enc_key(xmm_result, xmm_temp, key, key_offset);
2473    }
2474    load_key(xmm_temp, key, 0xa0);
2475    __ aesenclast(xmm_result, xmm_temp);
2476
2477    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
2478    // no need to store r to memory until we exit
2479    __ addptr(pos, AESBlockSize);
2480    __ subptr(len_reg, AESBlockSize);
2481    __ jcc(Assembler::notEqual, L_loopTop_128);
2482
2483    __ BIND(L_exit);
2484    __ movdqu(Address(rvec, 0), xmm_result);     // final value of r stored in rvec of CipherBlockChaining object
2485
2486    handleSOERegisters(false /*restoring*/);
2487    __ movl(rax, 0);                             // return 0 (why?)
2488    __ leave();                                  // required for proper stackwalking of RuntimeStub frame
2489    __ ret(0);
2490
2491    __ BIND(L_key_192_256);
2492    // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
2493    __ cmpl(rax, 52);
2494    __ jcc(Assembler::notEqual, L_key_256);
2495
2496    // 192-bit code follows here (could be changed to use more xmm registers)
2497    __ movl(pos, 0);
2498    __ align(OptoLoopAlignment);
2499    __ BIND(L_loopTop_192);
2500    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
2501    __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector
2502
2503    __ pxor  (xmm_result, xmm_key0);                                // do the aes rounds
2504    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
2505      __ aesenc(xmm_result, as_XMMRegister(rnum));
2506    }
2507    for (int key_offset = 0x60; key_offset <= 0xb0; key_offset += 0x10) {
2508      aes_enc_key(xmm_result, xmm_temp, key, key_offset);
2509    }
2510    load_key(xmm_temp, key, 0xc0);
2511    __ aesenclast(xmm_result, xmm_temp);
2512
2513    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);   // store into the next 16 bytes of output
2514    // no need to store r to memory until we exit
2515    __ addptr(pos, AESBlockSize);
2516    __ subptr(len_reg, AESBlockSize);
2517    __ jcc(Assembler::notEqual, L_loopTop_192);
2518    __ jmp(L_exit);
2519
2520    __ BIND(L_key_256);
2521    // 256-bit code follows here (could be changed to use more xmm registers)
2522    __ movl(pos, 0);
2523    __ align(OptoLoopAlignment);
2524    __ BIND(L_loopTop_256);
2525    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
2526    __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector
2527
2528    __ pxor  (xmm_result, xmm_key0);                                // do the aes rounds
2529    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
2530      __ aesenc(xmm_result, as_XMMRegister(rnum));
2531    }
2532    for (int key_offset = 0x60; key_offset <= 0xd0; key_offset += 0x10) {
2533      aes_enc_key(xmm_result, xmm_temp, key, key_offset);
2534    }
2535    load_key(xmm_temp, key, 0xe0);
2536    __ aesenclast(xmm_result, xmm_temp);
2537
2538    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);   // store into the next 16 bytes of output
2539    // no need to store r to memory until we exit
2540    __ addptr(pos, AESBlockSize);
2541    __ subptr(len_reg, AESBlockSize);
2542    __ jcc(Assembler::notEqual, L_loopTop_256);
2543    __ jmp(L_exit);
2544
2545    return start;
2546  }
2547
2548
2549  // CBC AES Decryption.
2550  // In the 32-bit stub, for lack of registers, we do not try to parallelize 4 blocks at a time.
2551  //
2552  // Arguments:
2553  //
2554  // Inputs:
2555  //   c_rarg0   - source byte array address
2556  //   c_rarg1   - destination byte array address
2557  //   c_rarg2   - K (key) in little endian int array
2558  //   c_rarg3   - r vector byte array address
2559  //   c_rarg4   - input length
2560  //
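  // Per 16-byte block, the loops below compute (a sketch, with D_K
  // denoting one AES block decryption under key K):
  //   plaintext[i] = D_K(ciphertext[i]) ^ prev;  prev = ciphertext[i];
  // where prev starts as the incoming r vector and the last ciphertext
  // block is written back through rvec on exit.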
2561
2562  address generate_cipherBlockChaining_decryptAESCrypt() {
2563    assert(UseAES, "need AES instructions and misaligned SSE support");
2564    __ align(CodeEntryAlignment);
2565    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2566    address start = __ pc();
2567
2568    Label L_exit, L_key_192_256, L_key_256;
2569    Label L_singleBlock_loopTop_128;
2570    Label L_singleBlock_loopTop_192, L_singleBlock_loopTop_256;
2571    const Register from        = rsi;      // source array address
2572    const Register to          = rdx;      // destination array address
2573    const Register key         = rcx;      // key array address
2574    const Register rvec        = rdi;      // r byte array, initialized from the init-vector address,
2575                                           // and left holding the last cipher block on exit
2576    const Register len_reg     = rbx;      // src len (must be multiple of blocksize 16)
2577    const Register pos         = rax;
2578
2579    // xmm register assignments for the loops below
2580    const XMMRegister xmm_result = xmm0;
2581    const XMMRegister xmm_temp   = xmm1;
2582    // first 6 keys preloaded into xmm2-xmm7
2583    const int XMM_REG_NUM_KEY_FIRST = 2;
2584    const int XMM_REG_NUM_KEY_LAST  = 7;
2585    const int FIRST_NON_REG_KEY_offset = 0x70;
2586    const XMMRegister xmm_key_first   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
2587
2588    __ enter(); // required for proper stackwalking of RuntimeStub frame
2589    handleSOERegisters(true /*saving*/);
2590
2591    // load registers from incoming parameters
2592    const Address  from_param(rbp, 8+0);
2593    const Address  to_param  (rbp, 8+4);
2594    const Address  key_param (rbp, 8+8);
2595    const Address  rvec_param (rbp, 8+12);
2596    const Address  len_param  (rbp, 8+16);
2597    __ movptr(from , from_param);
2598    __ movptr(to   , to_param);
2599    __ movptr(key  , key_param);
2600    __ movptr(rvec , rvec_param);
2601    __ movptr(len_reg , len_param);
2602
2603    // The Java expanded key ordering is rotated one position from what we want,
2604    // so we start from 0x10 here and hit 0x00 last.
2605    const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
2606    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2607    // load up xmm regs 2 thru 7 with the first 6 keys
2608    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
2609      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
2610      offset += 0x10;
2611    }
2612
2613    // inside here, use the rvec register to point to previous block cipher
2614    // with which we xor at the end of each newly decrypted block
2615    const Register  prev_block_cipher_ptr = rvec;
2616
2617    // now split into different paths depending on the keylen (length in ints of the AESCrypt.KLE array: 44 = 128-bit, 52 = 192-bit, 60 = 256-bit)
2618    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2619    __ cmpl(rax, 44);
2620    __ jcc(Assembler::notEqual, L_key_192_256);
2621
2622
2623    // 128-bit code follows here
2624    __ movl(pos, 0);
2625    __ align(OptoLoopAlignment);
2626    __ BIND(L_singleBlock_loopTop_128);
2627    __ cmpptr(len_reg, 0);           // any blocks left?
2628    __ jcc(Assembler::equal, L_exit);
2629    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
2630    __ pxor  (xmm_result, xmm_key_first);                             // do the aes dec rounds
2631    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
2632      __ aesdec(xmm_result, as_XMMRegister(rnum));
2633    }
2634    for (int key_offset = FIRST_NON_REG_KEY_offset; key_offset <= 0xa0; key_offset += 0x10) {   // 128-bit runs up to key offset a0
2635      aes_dec_key(xmm_result, xmm_temp, key, key_offset);
2636    }
2637    load_key(xmm_temp, key, 0x00);                                     // final key is stored in java expanded array at offset 0
2638    __ aesdeclast(xmm_result, xmm_temp);
2639    __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00));
2640    __ pxor  (xmm_result, xmm_temp);                                  // xor with the current r vector
2641    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
2642    // no need to store r to memory until we exit
2643    __ lea(prev_block_cipher_ptr, Address(from, pos, Address::times_1, 0));     // set up new ptr
2644    __ addptr(pos, AESBlockSize);
2645    __ subptr(len_reg, AESBlockSize);
2646    __ jmp(L_singleBlock_loopTop_128);
2647
2648
2649    __ BIND(L_exit);
2650    __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00));
2651    __ movptr(rvec , rvec_param);                                     // restore this since used in loop
2652    __ movdqu(Address(rvec, 0), xmm_temp);                            // final value of r stored in rvec of CipherBlockChaining object
2653    handleSOERegisters(false /*restoring*/);
2654    __ movl(rax, 0);                                                  // return 0 (why?)
2655    __ leave();                                                       // required for proper stackwalking of RuntimeStub frame
2656    __ ret(0);
2657
2658
2659    __ BIND(L_key_192_256);
2660    // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
2661    __ cmpl(rax, 52);
2662    __ jcc(Assembler::notEqual, L_key_256);
2663
2664    // 192-bit code follows here (could be optimized to use parallelism)
2665    __ movl(pos, 0);
2666    __ align(OptoLoopAlignment);
2667    __ BIND(L_singleBlock_loopTop_192);
2668    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
2669    __ pxor  (xmm_result, xmm_key_first);                             // do the aes dec rounds
2670    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
2671      __ aesdec(xmm_result, as_XMMRegister(rnum));
2672    }
2673    for (int key_offset = FIRST_NON_REG_KEY_offset; key_offset <= 0xc0; key_offset += 0x10) {   // 192-bit runs up to key offset c0
2674      aes_dec_key(xmm_result, xmm_temp, key, key_offset);
2675    }
2676    load_key(xmm_temp, key, 0x00);                                     // final key is stored in java expanded array at offset 0
2677    __ aesdeclast(xmm_result, xmm_temp);
2678    __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00));
2679    __ pxor  (xmm_result, xmm_temp);                                  // xor with the current r vector
2680    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
2681    // no need to store r to memory until we exit
2682    __ lea(prev_block_cipher_ptr, Address(from, pos, Address::times_1, 0));     // set up new ptr
2683    __ addptr(pos, AESBlockSize);
2684    __ subptr(len_reg, AESBlockSize);
2685    __ jcc(Assembler::notEqual,L_singleBlock_loopTop_192);
2686    __ jmp(L_exit);
2687
2688    __ BIND(L_key_256);
2689    // 256-bit code follows here (could be optimized to use parallelism)
2690    __ movl(pos, 0);
2691    __ align(OptoLoopAlignment);
2692    __ BIND(L_singleBlock_loopTop_256);
2693    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
2694    __ pxor  (xmm_result, xmm_key_first);                             // do the aes dec rounds
2695    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
2696      __ aesdec(xmm_result, as_XMMRegister(rnum));
2697    }
2698    for (int key_offset = FIRST_NON_REG_KEY_offset; key_offset <= 0xe0; key_offset += 0x10) {   // 256-bit runs up to key offset e0
2699      aes_dec_key(xmm_result, xmm_temp, key, key_offset);
2700    }
2701    load_key(xmm_temp, key, 0x00);                                     // final key is stored in java expanded array at offset 0
2702    __ aesdeclast(xmm_result, xmm_temp);
2703    __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00));
2704    __ pxor  (xmm_result, xmm_temp);                                  // xor with the current r vector
2705    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
2706    // no need to store r to memory until we exit
2707    __ lea(prev_block_cipher_ptr, Address(from, pos, Address::times_1, 0));     // set up new ptr
2708    __ addptr(pos, AESBlockSize);
2709    __ subptr(len_reg, AESBlockSize);
2710    __ jcc(Assembler::notEqual,L_singleBlock_loopTop_256);
2711    __ jmp(L_exit);
2712
2713    return start;
2714  }
2715
2716  /**
2717   *  Arguments:
2718   *
2719   * Inputs:
2720   *   rsp(4)   - int crc
2721   *   rsp(8)   - byte* buf
2722   *   rsp(12)  - int length
2723   *
2724   * Output:
2725   *       rax   - int crc result
2726   */
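  // Reference semantics, as a sketch (the kernel below is table-driven
  // and CLMUL-accelerated, but computes the same reflected CRC-32 as
  // java.util.zip, polynomial 0xEDB88320; crc is treated as uint32_t):
  //   crc = ~crc;
  //   for (int i = 0; i < length; i++) {
  //     crc ^= buf[i] & 0xff;
  //     for (int b = 0; b < 8; b++)
  //       crc = (crc >> 1) ^ (0xEDB88320 & (0u - (crc & 1)));
  //   }
  //   return ~crc;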
2727  address generate_updateBytesCRC32() {
2728    assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");
2729
2730    __ align(CodeEntryAlignment);
2731    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
2732
2733    address start = __ pc();
2734
2735    const Register crc   = rdx;  // crc
2736    const Register buf   = rsi;  // source java byte array address
2737    const Register len   = rcx;  // length
2738    const Register table = rdi;  // crc_table address (reuse register)
2739    const Register tmp   = rbx;
2740    assert_different_registers(crc, buf, len, table, tmp, rax);
2741
2742    BLOCK_COMMENT("Entry:");
2743    __ enter(); // required for proper stackwalking of RuntimeStub frame
2744    __ push(rsi);
2745    __ push(rdi);
2746    __ push(rbx);
2747
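    // With enter() done, rbp points at the saved rbp; the return pc is at
    // rbp + 4 and the incoming cdecl arguments start at rbp + 8. The pushes
    // above moved rsp but not rbp, so the offsets below remain valid.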
    Address crc_arg(rbp, 8 + 0);
    Address buf_arg(rbp, 8 + 4);
    Address len_arg(rbp, 8 + 8);

    // Load up:
    __ movl(crc,   crc_arg);
    __ movptr(buf, buf_arg);
    __ movl(len,   len_arg);

    __ kernel_crc32(crc, buf, len, table, tmp);

    __ movl(rax, crc);
    __ pop(rbx);
    __ pop(rdi);
    __ pop(rsi);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }

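  // kernel_crc32() computes the same CRC-32 (reflected polynomial 0xEDB88320,
  // as used by java.util.zip.CRC32) that this plain bitwise reference model
  // produces -- a sketch for documentation only, not used by the stub:
  //
  //   uint32_t crc32_ref(uint32_t crc, const uint8_t* buf, size_t len) {
  //     crc = ~crc;
  //     for (size_t i = 0; i < len; i++) {
  //       crc ^= buf[i];
  //       for (int b = 0; b < 8; b++)
  //         crc = (crc >> 1) ^ (0xEDB88320u & (0u - (crc & 1)));
  //     }
  //     return ~crc;
  //   }
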
  // Safefetch stubs.
  void generate_safefetch(const char* name, int size, address* entry,
                          address* fault_pc, address* continuation_pc) {
    // safefetch signatures:
    //   int      SafeFetch32(int*      adr, int      errValue);
    //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);

    StubCodeMark mark(this, "StubRoutines", name);

    // Entry point, pc or function descriptor.
    *entry = __ pc();

    __ movl(rax, Address(rsp, 0x8));  // errValue: stays in rax if the load below faults
    __ movl(rcx, Address(rsp, 0x4));  // adr: the address to probe
    // Load *adr into eax, may fault.
    *fault_pc = __ pc();
    switch (size) {
      case 4:
        // int32_t
        __ movl(rax, Address(rcx, 0));
        break;
      case 8:
        // int64_t
        Unimplemented();
        break;
      default:
        ShouldNotReachHere();
    }

    // Return errValue or *adr.
    *continuation_pc = __ pc();
    __ ret(0);
  }

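  // A caller probes possibly-unmapped memory by choosing an errValue the load
  // could not legitimately return, e.g. (usage sketch only):
  //
  //   int v = SafeFetch32(addr, 0xdeadbeef);
  //   if (v == 0xdeadbeef) {
  //     // addr was likely unmapped: the fault at *fault_pc was resolved by
  //     // resuming at *continuation_pc with rax still holding errValue.
  //   }
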
 public:
  // Information about frame layout at time of blocking runtime call.
  // Note that we only have to preserve callee-saved registers since
  // the compilers are responsible for supplying a continuation point
  // if they expect all registers to be preserved.
  enum layout {
    thread_off,    // last_java_sp
    arg1_off,
    arg2_off,
    rbp_off,       // callee saved register
    ret_pc,
    framesize
  };
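
  // Frame shape produced by generate_throw_exception() below, as a sketch
  // (one word per slot, low addresses first):
  //
  //   rsp + thread_off * wordSize : JavaThread*  (first C argument, last_java_sp)
  //   rsp + arg1_off   * wordSize : optional arg1
  //   rsp + arg2_off   * wordSize : optional arg2
  //   rsp + rbp_off    * wordSize : saved rbp (pushed by enter())
  //   rsp + ret_pc     * wordSize : return pc  (pushed by the call)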

 private:

#undef  __
#define __ masm->

  //------------------------------------------------------------------------------------------------------------------------
  // Continuation point for throwing of implicit exceptions that are not handled in
  // the current activation. Fabricates an exception oop and initiates normal
  // exception dispatching in this frame.
  //
  // Previously the compiler (c2) allowed for callee-saved registers on Java calls.
  // This is no longer true after adapter frames were removed, but could possibly
  // be brought back in the future if the interpreter code were reworked and it
  // were deemed worthwhile. The comment below was left to describe what must
  // happen here if callee saves were resurrected. As it stands now this stub
  // could actually be a vanilla BufferBlob and have no oopMap at all.
  // Since it doesn't make much difference we've chosen to leave it the
  // way it was in the callee-save days and keep the comment.

  // If we need to preserve callee-saved values we need a callee-saved oop map and
  // therefore have to make these stubs into RuntimeStubs rather than BufferBlobs.
  // If the compiler needs all registers to be preserved between the fault
  // point and the exception handler then it must assume responsibility for that in
  // AbstractCompiler::continuation_for_implicit_null_exception or
  // continuation_for_implicit_division_by_zero_exception. All other implicit
  // exceptions (e.g., NullPointerException or AbstractMethodError on entry) are
  // either at call sites or otherwise assume that stack unwinding will be initiated,
  // so caller-saved registers were assumed volatile in the compiler.
  address generate_throw_exception(const char* name, address runtime_entry,
                                   Register arg1 = noreg, Register arg2 = noreg) {

    int insts_size = 256;
    int locs_size  = 32;

    CodeBuffer code(name, insts_size, locs_size);
    OopMapSet* oop_maps  = new OopMapSet();
    MacroAssembler* masm = new MacroAssembler(&code);

    address start = __ pc();

    // This is an inlined and slightly modified version of call_VM
    // which has the ability to fetch the return PC out of
    // thread-local storage and also sets up last_Java_sp slightly
    // differently than the real call_VM.
    Register java_thread = rbx;
    __ get_thread(java_thread);

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // pc and rbp already pushed
    __ subptr(rsp, (framesize - 2) * wordSize); // prolog

    // Frame is now completed as far as size and linkage.

    int frame_complete = __ pc() - start;

    // push java thread (becomes first argument of C function)
    __ movptr(Address(rsp, thread_off * wordSize), java_thread);
    if (arg1 != noreg) {
      __ movptr(Address(rsp, arg1_off * wordSize), arg1);
    }
    if (arg2 != noreg) {
      assert(arg1 != noreg, "missing reg arg");
      __ movptr(Address(rsp, arg2_off * wordSize), arg2);
    }

    // Set up last_Java_sp and last_Java_fp
    __ set_last_Java_frame(java_thread, rsp, rbp, NULL);

    // Call runtime
    BLOCK_COMMENT("call runtime_entry");
    __ call(RuntimeAddress(runtime_entry));
    // Generate oop map
    OopMap* map = new OopMap(framesize, 0);
    oop_maps->add_gc_map(__ pc() - start, map);

    // Restore the thread (cannot use the pushed argument since arguments
    // may be overwritten by C code generated by an optimizing compiler);
    // however, we can use the register value directly if it is callee-saved.
    __ get_thread(java_thread);

    __ reset_last_Java_frame(java_thread, true, false);

    __ leave(); // required for proper stackwalking of RuntimeStub frame

    // check for pending exceptions
#ifdef ASSERT
    Label L;
    __ cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
    __ jcc(Assembler::notEqual, L);
    __ should_not_reach_here();
    __ bind(L);
#endif /* ASSERT */
    __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

    RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, framesize, oop_maps, false);
    return stub->entry_point();
  }
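
  // Conceptually, each stub produced above behaves like this C-level model
  // (a sketch only; runtime_entry records the exception in
  // thread->pending_exception and forward_exception_entry unwinds to the
  // proper handler):
  //
  //   void throw_stub(/* optional arg1, arg2 */) {
  //     JavaThread* thread = JavaThread::current();   // get_thread(rbx)
  //     thread->set_last_Java_frame(...);             // make the stack walkable
  //     runtime_entry(thread, arg1, arg2);            // fabricate the exception oop
  //     thread->reset_last_Java_frame(...);
  //     goto *StubRoutines::forward_exception_entry() // dispatch, never returns
  //   }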

  void create_control_words() {
    // Round to nearest, 53-bit mode, exceptions masked
    StubRoutines::_fpu_cntrl_wrd_std   = 0x027F;
    // Round to zero, 53-bit mode, exceptions masked
    StubRoutines::_fpu_cntrl_wrd_trunc = 0x0D7F;
    // Round to nearest, 24-bit mode, exceptions masked
    StubRoutines::_fpu_cntrl_wrd_24    = 0x007F;
    // Round to nearest, 64-bit mode, exceptions masked
    StubRoutines::_fpu_cntrl_wrd_64    = 0x037F;
    // MXCSR default: round to nearest, all SSE exceptions masked
    StubRoutines::_mxcsr_std           = 0x1F80;
    // Note: the following two constants are 80-bit values;
    //       their layout is critical for correct loading by the FPU.
    // Bias for strict fp multiply/divide
    StubRoutines::_fpu_subnormal_bias1[0]= 0x00000000; // 2^(-15360) == 0x03ff 8000 0000 0000 0000
    StubRoutines::_fpu_subnormal_bias1[1]= 0x80000000;
    StubRoutines::_fpu_subnormal_bias1[2]= 0x03ff;
    // Un-bias for strict fp multiply/divide
    StubRoutines::_fpu_subnormal_bias2[0]= 0x00000000; // 2^(+15360) == 0x7bff 8000 0000 0000 0000
    StubRoutines::_fpu_subnormal_bias2[1]= 0x80000000;
    StubRoutines::_fpu_subnormal_bias2[2]= 0x7bff;
  }
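
  // Decoding the x87 control words above (FPCW bit fields), using
  // _fpu_cntrl_wrd_std == 0x027F as the worked example:
  //
  //   bits 0-5   = 0x3F : all six FPU exceptions masked (IM DM ZM OM UM PM)
  //   bits 8-9   = 10b  : precision control = 53 bits (double)
  //   bits 10-11 = 00b  : rounding control  = round to nearest even
  //
  // _fpu_cntrl_wrd_trunc sets rounding control to 11b (round toward zero,
  // needed by the d2i/d2l wrappers), while _fpu_cntrl_wrd_24 and
  // _fpu_cntrl_wrd_64 select 24-bit and 64-bit precision control respectively.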

  //---------------------------------------------------------------------------
  // Initialization

  void generate_initial() {
    // Generates all stubs and initializes the entry points

    //------------------------------------------------------------------------------------------------------------------------
    // entry points that exist in all platforms
    // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller than
    //       the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp.
    StubRoutines::_forward_exception_entry      = generate_forward_exception();

    StubRoutines::_call_stub_entry              =
      generate_call_stub(StubRoutines::_call_stub_return_address);
    // is referenced by megamorphic call
    StubRoutines::_catch_exception_entry        = generate_catch_exception();

    // These are currently used by Solaris/Intel
    StubRoutines::_atomic_xchg_entry            = generate_atomic_xchg();

    StubRoutines::_handler_for_unsafe_access_entry =
      generate_handler_for_unsafe_access();

    // platform dependent
    create_control_words();

    StubRoutines::x86::_verify_mxcsr_entry         = generate_verify_mxcsr();
    StubRoutines::x86::_verify_fpu_cntrl_wrd_entry = generate_verify_fpu_cntrl_wrd();
    StubRoutines::_d2i_wrapper                     = generate_d2i_wrapper(T_INT,
                                                                          CAST_FROM_FN_PTR(address, SharedRuntime::d2i));
    StubRoutines::_d2l_wrapper                     = generate_d2i_wrapper(T_LONG,
                                                                          CAST_FROM_FN_PTR(address, SharedRuntime::d2l));

    // Build this early so it's available for the interpreter
    StubRoutines::_throw_StackOverflowError_entry  = generate_throw_exception("StackOverflowError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));

    if (UseCRC32Intrinsics) {
      // set the table address before generating the stub that uses it
      StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }
  }


  void generate_all() {
    // Generates all stubs and initializes the entry points

    // These entry points require SharedInfo::stack0 to be set up in non-core builds
    // and need to be relocatable, so they each fabricate a RuntimeStub internally.
    StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError));
    StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError));
    StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call));

    //------------------------------------------------------------------------------------------------------------------------
    // entry points that are platform specific

    // support for verify_oop (must happen after universe_init)
    StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    generate_math_stubs();

    // don't bother generating these AES intrinsic stubs unless the global flag is set
    if (UseAESIntrinsics) {
      StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // might be needed by the others

      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
    }

    // Safefetch stubs.
    generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
                                                   &StubRoutines::_safefetch32_fault_pc,
                                                   &StubRoutines::_safefetch32_continuation_pc);
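    // On this 32-bit port intptr_t and int are the same size, so the N-sized
    // SafeFetch entry points can simply alias the 32-bit ones: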
    StubRoutines::_safefetchN_entry           = StubRoutines::_safefetch32_entry;
    StubRoutines::_safefetchN_fault_pc        = StubRoutines::_safefetch32_fault_pc;
    StubRoutines::_safefetchN_continuation_pc = StubRoutines::_safefetch32_continuation_pc;
  }


 public:
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }
  }
}; // end class declaration


void StubGenerator_generate(CodeBuffer* code, bool all) {
  StubGenerator g(code, all);
}