stubGenerator_x86_32.cpp revision 8030:1ee6085d2a83
1/*
2 * Copyright (c) 1999, 2015, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25#include "precompiled.hpp"
26#include "asm/macroAssembler.hpp"
27#include "asm/macroAssembler.inline.hpp"
28#include "interpreter/interpreter.hpp"
29#include "nativeInst_x86.hpp"
30#include "oops/instanceOop.hpp"
31#include "oops/method.hpp"
32#include "oops/objArrayKlass.hpp"
33#include "oops/oop.inline.hpp"
34#include "prims/methodHandles.hpp"
35#include "runtime/frame.inline.hpp"
36#include "runtime/handles.inline.hpp"
37#include "runtime/sharedRuntime.hpp"
38#include "runtime/stubCodeGenerator.hpp"
39#include "runtime/stubRoutines.hpp"
40#include "runtime/thread.inline.hpp"
41#include "utilities/top.hpp"
42#ifdef COMPILER2
43#include "opto/runtime.hpp"
44#endif
45
46// Declaration and definition of StubGenerator (no .hpp file).
47// For a more detailed description of the stub routine structure
48// see the comment in stubRoutines.hpp
49
// Shorthand used by every stub generator below: "__ insn(...)" expands to
// "_masm->insn(...)", i.e. it emits through the assembler object named _masm
// (presumably the MacroAssembler owned by the StubCodeGenerator base class --
// TODO confirm, its declaration is not in this file).
#define __ _masm->
// Same receiver, but viewed through a C-style cast to Assembler*.
#define a__ ((Assembler*)_masm)->

// BLOCK_COMMENT(str) attaches a comment to the generated code via
// block_comment(); in PRODUCT builds it expands to nothing.
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

// Intended use is "__ BIND(label);": binds the label and, in non-PRODUCT
// builds, adds a block comment consisting of the label's *identifier* followed
// by ':'.  Because the identifier is stringified, renaming a label changes the
// comment text.  Note that this expands to two statements.
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Applied to a saved MXCSR value before it is compared with the standard
// value: bits 0-5 are dropped, bits 6-15 are kept.
const int MXCSR_MASK  = 0xFFC0;  // Mask out any pending exceptions
// Applied to the 32-bit load of the stored FPU control word before it is
// compared with the standard value: only the low 16 bits are kept.
const int FPU_CNTRL_WRD_MASK = 0xFFFF;
63
64// -------------------------------------------------------------------------------------------------------------------------
65// Stub Code definitions
66
67static address handle_unsafe_access() {
68  JavaThread* thread = JavaThread::current();
69  address pc  = thread->saved_exception_pc();
70  // pc is the instruction which we must emulate
71  // doing a no-op is fine:  return garbage from the load
72  // therefore, compute npc
73  address npc = Assembler::locate_next_instruction(pc);
74
75  // request an async exception
76  thread->set_pending_unsafe_access_error();
77
78  // return address of next instruction to execute
79  return npc;
80}
81
82class StubGenerator: public StubCodeGenerator {
83 private:
84
85#ifdef PRODUCT
86#define inc_counter_np(counter) ((void)0)
87#else
88  void inc_counter_np_(int& counter) {
89    __ incrementl(ExternalAddress((address)&counter));
90  }
91#define inc_counter_np(counter) \
92  BLOCK_COMMENT("inc_counter " #counter); \
93  inc_counter_np_(counter);
94#endif //PRODUCT
95
96  void inc_copy_counter_np(BasicType t) {
97#ifndef PRODUCT
98    switch (t) {
99    case T_BYTE:    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); return;
100    case T_SHORT:   inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); return;
101    case T_INT:     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); return;
102    case T_LONG:    inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); return;
103    case T_OBJECT:  inc_counter_np(SharedRuntime::_oop_array_copy_ctr); return;
104    }
105    ShouldNotReachHere();
106#endif //PRODUCT
107  }
108
109  //------------------------------------------------------------------------------------------------------------------------
110  // Call stubs are used to call Java from C
111  //
112  //    [ return_from_Java     ] <--- rsp
113  //    [ argument word n      ]
114  //      ...
115  // -N [ argument word 1      ]
116  // -7 [ Possible padding for stack alignment ]
117  // -6 [ Possible padding for stack alignment ]
118  // -5 [ Possible padding for stack alignment ]
119  // -4 [ mxcsr save           ] <--- rsp_after_call
  // -3 [ saved rbx            ]
  // -2 [ saved rsi            ]
  // -1 [ saved rdi            ]
  //  0 [ saved rbp            ] <--- rbp
124  //  1 [ return address       ]
125  //  2 [ ptr. to call wrapper ]
126  //  3 [ result               ]
127  //  4 [ result_type          ]
128  //  5 [ method               ]
129  //  6 [ entry_point          ]
130  //  7 [ parameters           ]
131  //  8 [ parameter_size       ]
132  //  9 [ thread               ]
133
134
  // Generates the call stub, the routine used to call Java from C.
  //
  // The stub builds its own frame, copies the Java parameters onto the stack
  // in reverse order, calls the entry point, stores the result according to
  // result_type and returns to its caller.
  //
  // Parameters:
  //   return_address - out: set to the pc immediately following the emitted
  //                    call into Java.
  // Returns the address of the first instruction of the stub.
  address generate_call_stub(address& return_address) {
    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    // stub code parameters / addresses
    // All slots are addressed relative to rbp; negative offsets are the
    // stub's own save area, positive offsets are the incoming arguments.
    assert(frame::entry_frame_call_wrapper_offset == 2, "adjust this code");
    bool  sse_save = false;
    const Address rsp_after_call(rbp, -4 * wordSize); // same as in generate_catch_exception()!
    const int     locals_count_in_bytes  (4*wordSize);
    // mxcsr_save deliberately shares its slot with rsp_after_call: it is the
    // lowest word of the four-word save area.
    const Address mxcsr_save    (rbp, -4 * wordSize);
    const Address saved_rbx     (rbp, -3 * wordSize);
    const Address saved_rsi     (rbp, -2 * wordSize);
    const Address saved_rdi     (rbp, -1 * wordSize);
    const Address result        (rbp,  3 * wordSize);
    const Address result_type   (rbp,  4 * wordSize);
    const Address method        (rbp,  5 * wordSize);
    const Address entry_point   (rbp,  6 * wordSize);
    const Address parameters    (rbp,  7 * wordSize);
    const Address parameter_size(rbp,  8 * wordSize);
    const Address thread        (rbp,  9 * wordSize); // same as in generate_catch_exception()!
    // MXCSR is only saved/restored when SSE is in use.
    sse_save =  UseSSE > 0;

    // stub code
    // Reserve (parameter_size << logStackElementSize) bytes for the outgoing
    // parameters plus the four-word save area, then align rsp downwards.
    __ enter();
    __ movptr(rcx, parameter_size);              // parameter counter
    __ shlptr(rcx, Interpreter::logStackElementSize); // convert parameter count to bytes
    __ addptr(rcx, locals_count_in_bytes);       // reserve space for register saves
    __ subptr(rsp, rcx);
    __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack

    // save rdi, rsi, & rbx, according to C calling conventions
    __ movptr(saved_rdi, rdi);
    __ movptr(saved_rsi, rsi);
    __ movptr(saved_rbx, rbx);
    // save and initialize %mxcsr
    // The caller's MXCSR is always stored; the standard value is loaded only
    // if the masked current value differs from it.
    if (sse_save) {
      Label skip_ldmx;
      __ stmxcsr(mxcsr_save);
      __ movl(rax, mxcsr_save);
      __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
      ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
      __ cmp32(rax, mxcsr_std);
      __ jcc(Assembler::equal, skip_ldmx);
      __ ldmxcsr(mxcsr_std);
      __ bind(skip_ldmx);
    }

    // make sure the control word is correct.
    // (loaded unconditionally, unlike MXCSR above)
    __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));

#ifdef ASSERT
    // make sure we have no pending exceptions
    { Label L;
      __ movptr(rcx, thread);
      __ cmpptr(Address(rcx, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
      __ jcc(Assembler::equal, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ bind(L);
    }
#endif

    // pass parameters if any
    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    __ movl(rcx, parameter_size);  // parameter counter
    __ testl(rcx, rcx);
    __ jcc(Assembler::zero, parameters_done);

    // parameter passing loop

    Label loop;
    // Copy Java parameters in reverse order (receiver last)
    // Note that the argument order is inverted in the process
    // source is rdx[rcx: N-1..0]
    // dest   is rsp[rbx: 0..N-1]

    __ movptr(rdx, parameters);          // parameter pointer
    __ xorptr(rbx, rbx);

    __ BIND(loop);

    // get parameter
    // rcx still holds the 1-based count here, hence the -wordSize displacement.
    __ movptr(rax, Address(rdx, rcx, Interpreter::stackElementScale(), -wordSize));
    __ movptr(Address(rsp, rbx, Interpreter::stackElementScale(),
                    Interpreter::expr_offset_in_bytes(0)), rax);          // store parameter
    __ increment(rbx);
    __ decrement(rcx);
    __ jcc(Assembler::notZero, loop);

    // call Java function
    // Registers at the call: rbx = Method*, rax = entry point, rsi = sender sp.
    __ BIND(parameters_done);
    __ movptr(rbx, method);           // get Method*
    __ movptr(rax, entry_point);      // get entry_point
    __ mov(rsi, rsp);                 // set sender sp
    BLOCK_COMMENT("call Java function");
    __ call(rax);

    // The pc right after the call is handed back to the caller of this
    // generator through the return_address out-parameter.
    BLOCK_COMMENT("call_stub_return_address:");
    return_address = __ pc();

#ifdef COMPILER2
    // Without SSE2, FPU stack slots 1..7 are freed; slot 0 is freed as well
    // unless it carries the result (T_DOUBLE, or T_FLOAT when UseSSE == 0).
    {
      Label L_skip;
      if (UseSSE >= 2) {
        __ verify_FPU(0, "call_stub_return");
      } else {
        for (int i = 1; i < 8; i++) {
          __ ffree(i);
        }

        // UseSSE <= 1 so double result should be left on TOS
        __ movl(rsi, result_type);
        __ cmpl(rsi, T_DOUBLE);
        __ jcc(Assembler::equal, L_skip);
        if (UseSSE == 0) {
          // UseSSE == 0 so float result should be left on TOS
          __ cmpl(rsi, T_FLOAT);
          __ jcc(Assembler::equal, L_skip);
        }
        __ ffree(0);
      }
      __ BIND(L_skip);
    }
#endif // COMPILER2

    // store result depending on type
    // (everything that is not T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // rdi = address to store the result to, rsi = result type.
    __ movptr(rdi, result);
    Label is_long, is_float, is_double, exit;
    __ movl(rsi, result_type);
    __ cmpl(rsi, T_LONG);
    __ jcc(Assembler::equal, is_long);
    __ cmpl(rsi, T_FLOAT);
    __ jcc(Assembler::equal, is_float);
    __ cmpl(rsi, T_DOUBLE);
    __ jcc(Assembler::equal, is_double);

    // handle T_INT case
    __ movl(Address(rdi, 0), rax);
    // Common exit; the out-of-line result handlers below jump back here.
    __ BIND(exit);

    // check that FPU stack is empty
    __ verify_FPU(0, "generate_call_stub");

    // pop parameters
    // rsp is reset to the bottom of the four-word save area (rbp - 4 words).
    __ lea(rsp, rsp_after_call);

    // restore %mxcsr
    if (sse_save) {
      __ ldmxcsr(mxcsr_save);
    }

    // restore rdi, rsi and rbx,
    __ movptr(rbx, saved_rbx);
    __ movptr(rsi, saved_rsi);
    __ movptr(rdi, saved_rdi);
    // Drop the four-word save area (same size as locals_count_in_bytes).
    __ addptr(rsp, 4*wordSize);

    // return
    __ pop(rbp);
    __ ret(0);

    // handle return types different from T_INT
    // T_LONG: low word taken from rax, high word from rdx.
    __ BIND(is_long);
    __ movl(Address(rdi, 0 * wordSize), rax);
    __ movl(Address(rdi, 1 * wordSize), rdx);
    __ jmp(exit);

    __ BIND(is_float);
    // interpreter uses xmm0 for return values
    // (with UseSSE == 0 the value is popped from the FPU stack instead)
    if (UseSSE >= 1) {
      __ movflt(Address(rdi, 0), xmm0);
    } else {
      __ fstp_s(Address(rdi, 0));
    }
    __ jmp(exit);

    __ BIND(is_double);
    // interpreter uses xmm0 for return values
    // (with UseSSE < 2 the value is popped from the FPU stack instead)
    if (UseSSE >= 2) {
      __ movdbl(Address(rdi, 0), xmm0);
    } else {
      __ fstp_d(Address(rdi, 0));
    }
    __ jmp(exit);

    return start;
  }
323
324
325  //------------------------------------------------------------------------------------------------------------------------
326  // Return point for a Java call if there's an exception thrown in Java code.
327  // The exception is caught and transformed into a pending exception stored in
328  // JavaThread that can be tested from within the VM.
329  //
330  // Note: Usually the parameters are removed by the callee. In case of an exception
331  //       crossing an activation frame boundary, that is not the case if the callee
332  //       is compiled code => need to setup the rsp.
333  //
  // rax: exception oop
335
336  address generate_catch_exception() {
337    StubCodeMark mark(this, "StubRoutines", "catch_exception");
338    const Address rsp_after_call(rbp, -4 * wordSize); // same as in generate_call_stub()!
339    const Address thread        (rbp,  9 * wordSize); // same as in generate_call_stub()!
340    address start = __ pc();
341
342    // get thread directly
343    __ movptr(rcx, thread);
344#ifdef ASSERT
345    // verify that threads correspond
346    { Label L;
347      __ get_thread(rbx);
348      __ cmpptr(rbx, rcx);
349      __ jcc(Assembler::equal, L);
350      __ stop("StubRoutines::catch_exception: threads must correspond");
351      __ bind(L);
352    }
353#endif
354    // set pending exception
355    __ verify_oop(rax);
356    __ movptr(Address(rcx, Thread::pending_exception_offset()), rax          );
357    __ lea(Address(rcx, Thread::exception_file_offset   ()),
358           ExternalAddress((address)__FILE__));
359    __ movl(Address(rcx, Thread::exception_line_offset   ()), __LINE__ );
360    // complete return to VM
361    assert(StubRoutines::_call_stub_return_address != NULL, "_call_stub_return_address must have been generated before");
362    __ jump(RuntimeAddress(StubRoutines::_call_stub_return_address));
363
364    return start;
365  }
366
367
368  //------------------------------------------------------------------------------------------------------------------------
369  // Continuation point for runtime calls returning with a pending exception.
370  // The pending exception check happened in the runtime or native call stub.
371  // The pending exception in Thread is converted into a Java-level exception.
372  //
373  // Contract with Java-level exception handlers:
374  // rax: exception
375  // rdx: throwing pc
376  //
377  // NOTE: At entry of this stub, exception-pc must be on stack !!
378
379  address generate_forward_exception() {
380    StubCodeMark mark(this, "StubRoutines", "forward exception");
381    address start = __ pc();
382    const Register thread = rcx;
383
384    // other registers used in this stub
385    const Register exception_oop = rax;
386    const Register handler_addr  = rbx;
387    const Register exception_pc  = rdx;
388
389    // Upon entry, the sp points to the return address returning into Java
390    // (interpreted or compiled) code; i.e., the return address becomes the
391    // throwing pc.
392    //
393    // Arguments pushed before the runtime call are still on the stack but
394    // the exception handler will reset the stack pointer -> ignore them.
395    // A potential result in registers can be ignored as well.
396
397#ifdef ASSERT
398    // make sure this code is only executed if there is a pending exception
399    { Label L;
400      __ get_thread(thread);
401      __ cmpptr(Address(thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
402      __ jcc(Assembler::notEqual, L);
403      __ stop("StubRoutines::forward exception: no pending exception (1)");
404      __ bind(L);
405    }
406#endif
407
408    // compute exception handler into rbx,
409    __ get_thread(thread);
410    __ movptr(exception_pc, Address(rsp, 0));
411    BLOCK_COMMENT("call exception_handler_for_return_address");
412    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), thread, exception_pc);
413    __ mov(handler_addr, rax);
414
415    // setup rax & rdx, remove return address & clear pending exception
416    __ get_thread(thread);
417    __ pop(exception_pc);
418    __ movptr(exception_oop, Address(thread, Thread::pending_exception_offset()));
419    __ movptr(Address(thread, Thread::pending_exception_offset()), NULL_WORD);
420
421#ifdef ASSERT
422    // make sure exception is set
423    { Label L;
424      __ testptr(exception_oop, exception_oop);
425      __ jcc(Assembler::notEqual, L);
426      __ stop("StubRoutines::forward exception: no pending exception (2)");
427      __ bind(L);
428    }
429#endif
430
431    // Verify that there is really a valid exception in RAX.
432    __ verify_oop(exception_oop);
433
434    // continue at exception handler (return address removed)
435    // rax: exception
436    // rbx: exception handler
437    // rdx: throwing pc
438    __ jmp(handler_addr);
439
440    return start;
441  }
442
443
444  //----------------------------------------------------------------------------------------------------
445  // Support for jint Atomic::xchg(jint exchange_value, volatile jint* dest)
446  //
447  // xchg exists as far back as 8086, lock needed for MP only
448  // Stack layout immediately after call:
449  //
450  // 0 [ret addr ] <--- rsp
451  // 1 [  ex     ]
452  // 2 [  dest   ]
453  //
454  // Result:   *dest <- ex, return (old *dest)
455  //
456  // Note: win32 does not currently use this code
457
458  address generate_atomic_xchg() {
459    StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
460    address start = __ pc();
461
462    __ push(rdx);
463    Address exchange(rsp, 2 * wordSize);
464    Address dest_addr(rsp, 3 * wordSize);
465    __ movl(rax, exchange);
466    __ movptr(rdx, dest_addr);
467    __ xchgl(rax, Address(rdx, 0));
468    __ pop(rdx);
469    __ ret(0);
470
471    return start;
472  }
473
474  //----------------------------------------------------------------------------------------------------
475  // Support for void verify_mxcsr()
476  //
477  // This routine is used with -Xcheck:jni to verify that native
478  // JNI code does not return to Java code without restoring the
479  // MXCSR register to our expected state.
480
481
482  address generate_verify_mxcsr() {
483    StubCodeMark mark(this, "StubRoutines", "verify_mxcsr");
484    address start = __ pc();
485
486    const Address mxcsr_save(rsp, 0);
487
488    if (CheckJNICalls && UseSSE > 0 ) {
489      Label ok_ret;
490      ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
491      __ push(rax);
492      __ subptr(rsp, wordSize);      // allocate a temp location
493      __ stmxcsr(mxcsr_save);
494      __ movl(rax, mxcsr_save);
495      __ andl(rax, MXCSR_MASK);
496      __ cmp32(rax, mxcsr_std);
497      __ jcc(Assembler::equal, ok_ret);
498
499      __ warn("MXCSR changed by native JNI code.");
500
501      __ ldmxcsr(mxcsr_std);
502
503      __ bind(ok_ret);
504      __ addptr(rsp, wordSize);
505      __ pop(rax);
506    }
507
508    __ ret(0);
509
510    return start;
511  }
512
513
514  //---------------------------------------------------------------------------
515  // Support for void verify_fpu_cntrl_wrd()
516  //
517  // This routine is used with -Xcheck:jni to verify that native
518  // JNI code does not return to Java code without restoring the
519  // FP control word to our expected state.
520
521  address generate_verify_fpu_cntrl_wrd() {
522    StubCodeMark mark(this, "StubRoutines", "verify_spcw");
523    address start = __ pc();
524
525    const Address fpu_cntrl_wrd_save(rsp, 0);
526
527    if (CheckJNICalls) {
528      Label ok_ret;
529      __ push(rax);
530      __ subptr(rsp, wordSize);      // allocate a temp location
531      __ fnstcw(fpu_cntrl_wrd_save);
532      __ movl(rax, fpu_cntrl_wrd_save);
533      __ andl(rax, FPU_CNTRL_WRD_MASK);
534      ExternalAddress fpu_std(StubRoutines::addr_fpu_cntrl_wrd_std());
535      __ cmp32(rax, fpu_std);
536      __ jcc(Assembler::equal, ok_ret);
537
538      __ warn("Floating point control word changed by native JNI code.");
539
540      __ fldcw(fpu_std);
541
542      __ bind(ok_ret);
543      __ addptr(rsp, wordSize);
544      __ pop(rax);
545    }
546
547    __ ret(0);
548
549    return start;
550  }
551
552  //---------------------------------------------------------------------------
553  // Wrapper for slow-case handling of double-to-integer conversion
554  // d2i or f2i fast case failed either because it is nan or because
555  // of under/overflow.
556  // Input:  FPU TOS: float value
557  // Output: rax, (rdx): integer (long) result
558
559  address generate_d2i_wrapper(BasicType t, address fcn) {
560    StubCodeMark mark(this, "StubRoutines", "d2i_wrapper");
561    address start = __ pc();
562
563  // Capture info about frame layout
564  enum layout { FPUState_off         = 0,
565                rbp_off              = FPUStateSizeInWords,
566                rdi_off,
567                rsi_off,
568                rcx_off,
569                rbx_off,
570                saved_argument_off,
571                saved_argument_off2, // 2nd half of double
572                framesize
573  };
574
575  assert(FPUStateSizeInWords == 27, "update stack layout");
576
577    // Save outgoing argument to stack across push_FPU_state()
578    __ subptr(rsp, wordSize * 2);
579    __ fstp_d(Address(rsp, 0));
580
581    // Save CPU & FPU state
582    __ push(rbx);
583    __ push(rcx);
584    __ push(rsi);
585    __ push(rdi);
586    __ push(rbp);
587    __ push_FPU_state();
588
589    // push_FPU_state() resets the FP top of stack
590    // Load original double into FP top of stack
591    __ fld_d(Address(rsp, saved_argument_off * wordSize));
592    // Store double into stack as outgoing argument
593    __ subptr(rsp, wordSize*2);
594    __ fst_d(Address(rsp, 0));
595
596    // Prepare FPU for doing math in C-land
597    __ empty_FPU_stack();
598    // Call the C code to massage the double.  Result in EAX
599    if (t == T_INT)
600      { BLOCK_COMMENT("SharedRuntime::d2i"); }
601    else if (t == T_LONG)
602      { BLOCK_COMMENT("SharedRuntime::d2l"); }
603    __ call_VM_leaf( fcn, 2 );
604
605    // Restore CPU & FPU state
606    __ pop_FPU_state();
607    __ pop(rbp);
608    __ pop(rdi);
609    __ pop(rsi);
610    __ pop(rcx);
611    __ pop(rbx);
612    __ addptr(rsp, wordSize * 2);
613
614    __ ret(0);
615
616    return start;
617  }
618
619
620  //---------------------------------------------------------------------------
621  // The following routine generates a subroutine to throw an asynchronous
622  // UnknownError when an unsafe access gets a fault that could not be
623  // reasonably prevented by the programmer.  (Example: SIGBUS/OBJERR.)
624  address generate_handler_for_unsafe_access() {
625    StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
626    address start = __ pc();
627
628    __ push(0);                       // hole for return address-to-be
629    __ pusha();                       // push registers
630    Address next_pc(rsp, RegisterImpl::number_of_registers * BytesPerWord);
631    BLOCK_COMMENT("call handle_unsafe_access");
632    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, handle_unsafe_access)));
633    __ movptr(next_pc, rax);          // stuff next address
634    __ popa();
635    __ ret(0);                        // jump to next address
636
637    return start;
638  }
639
640
641  //----------------------------------------------------------------------------------------------------
642  // Non-destructive plausibility checks for oops
643
644  address generate_verify_oop() {
645    StubCodeMark mark(this, "StubRoutines", "verify_oop");
646    address start = __ pc();
647
648    // Incoming arguments on stack after saving rax,:
649    //
650    // [tos    ]: saved rdx
651    // [tos + 1]: saved EFLAGS
652    // [tos + 2]: return address
653    // [tos + 3]: char* error message
654    // [tos + 4]: oop   object to verify
655    // [tos + 5]: saved rax, - saved by caller and bashed
656
657    Label exit, error;
658    __ pushf();
659    __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
660    __ push(rdx);                                // save rdx
661    // make sure object is 'reasonable'
662    __ movptr(rax, Address(rsp, 4 * wordSize));    // get object
663    __ testptr(rax, rax);
664    __ jcc(Assembler::zero, exit);               // if obj is NULL it is ok
665
666    // Check if the oop is in the right area of memory
667    const int oop_mask = Universe::verify_oop_mask();
668    const int oop_bits = Universe::verify_oop_bits();
669    __ mov(rdx, rax);
670    __ andptr(rdx, oop_mask);
671    __ cmpptr(rdx, oop_bits);
672    __ jcc(Assembler::notZero, error);
673
674    // make sure klass is 'reasonable', which is not zero.
675    __ movptr(rax, Address(rax, oopDesc::klass_offset_in_bytes())); // get klass
676    __ testptr(rax, rax);
677    __ jcc(Assembler::zero, error);              // if klass is NULL it is broken
678
679    // return if everything seems ok
680    __ bind(exit);
681    __ movptr(rax, Address(rsp, 5 * wordSize));  // get saved rax, back
682    __ pop(rdx);                                 // restore rdx
683    __ popf();                                   // restore EFLAGS
684    __ ret(3 * wordSize);                        // pop arguments
685
686    // handle errors
687    __ bind(error);
688    __ movptr(rax, Address(rsp, 5 * wordSize));  // get saved rax, back
689    __ pop(rdx);                                 // get saved rdx back
690    __ popf();                                   // get saved EFLAGS off stack -- will be ignored
691    __ pusha();                                  // push registers (eip = return address & msg are already pushed)
692    BLOCK_COMMENT("call MacroAssembler::debug");
693    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
694    __ popa();
695    __ ret(3 * wordSize);                        // pop arguments
696    return start;
697  }
698
699  //
700  //  Generate pre-barrier for array stores
701  //
702  //  Input:
703  //     start   -  starting address
704  //     count   -  element count
705  void  gen_write_ref_array_pre_barrier(Register start, Register count, bool uninitialized_target) {
706    assert_different_registers(start, count);
707    BarrierSet* bs = Universe::heap()->barrier_set();
708    switch (bs->kind()) {
709      case BarrierSet::G1SATBCTLogging:
710        // With G1, don't generate the call if we statically know that the target in uninitialized
711        if (!uninitialized_target) {
712           __ pusha();                      // push registers
713           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre),
714                           start, count);
715           __ popa();
716         }
717        break;
718      case BarrierSet::CardTableModRef:
719      case BarrierSet::CardTableExtension:
720      case BarrierSet::ModRef:
721        break;
722      default      :
723        ShouldNotReachHere();
724
725    }
726  }
727
728
729  //
730  // Generate a post-barrier for an array store
731  //
732  //     start    -  starting address
733  //     count    -  element count
734  //
735  //  The two input registers are overwritten.
736  //
737  void  gen_write_ref_array_post_barrier(Register start, Register count) {
738    BarrierSet* bs = Universe::heap()->barrier_set();
739    assert_different_registers(start, count);
740    switch (bs->kind()) {
741      case BarrierSet::G1SATBCTLogging:
742        {
743          __ pusha();                      // push registers
744          __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post),
745                          start, count);
746          __ popa();
747        }
748        break;
749
750      case BarrierSet::CardTableModRef:
751      case BarrierSet::CardTableExtension:
752        {
753          CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs);
754          assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
755
756          Label L_loop;
757          const Register end = count;  // elements count; end == start+count-1
758          assert_different_registers(start, end);
759
760          __ lea(end,  Address(start, count, Address::times_ptr, -wordSize));
761          __ shrptr(start, CardTableModRefBS::card_shift);
762          __ shrptr(end,   CardTableModRefBS::card_shift);
763          __ subptr(end, start); // end --> count
764        __ BIND(L_loop);
765          intptr_t disp = (intptr_t) ct->byte_map_base;
766          Address cardtable(start, count, Address::times_1, disp);
767          __ movb(cardtable, 0);
768          __ decrement(count);
769          __ jcc(Assembler::greaterEqual, L_loop);
770        }
771        break;
772      case BarrierSet::ModRef:
773        break;
774      default      :
775        ShouldNotReachHere();
776
777    }
778  }
779
780
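  // Copy forward in 64-byte chunks using XMM registers, then copy whatever
  // remains one qword (8 bytes) at a time.
  //
  // Inputs:
  //   from        - source array address
  //   to_from     - destination array address minus 'from' (byte difference)
  //   qword_count - number of 8-byte elements to copy; a non-negative count
  //                 that the loops below count down
  //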
788  void xmm_copy_forward(Register from, Register to_from, Register qword_count) {
789    assert( UseSSE >= 2, "supported cpu only" );
790    Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
791    // Copy 64-byte chunks
792    __ jmpb(L_copy_64_bytes);
793    __ align(OptoLoopAlignment);
794  __ BIND(L_copy_64_bytes_loop);
795
796    if (UseUnalignedLoadStores) {
797      if (UseAVX >= 2) {
798        __ vmovdqu(xmm0, Address(from,  0));
799        __ vmovdqu(Address(from, to_from, Address::times_1,  0), xmm0);
800        __ vmovdqu(xmm1, Address(from, 32));
801        __ vmovdqu(Address(from, to_from, Address::times_1, 32), xmm1);
802      } else {
803        __ movdqu(xmm0, Address(from, 0));
804        __ movdqu(Address(from, to_from, Address::times_1, 0), xmm0);
805        __ movdqu(xmm1, Address(from, 16));
806        __ movdqu(Address(from, to_from, Address::times_1, 16), xmm1);
807        __ movdqu(xmm2, Address(from, 32));
808        __ movdqu(Address(from, to_from, Address::times_1, 32), xmm2);
809        __ movdqu(xmm3, Address(from, 48));
810        __ movdqu(Address(from, to_from, Address::times_1, 48), xmm3);
811      }
812    } else {
813      __ movq(xmm0, Address(from, 0));
814      __ movq(Address(from, to_from, Address::times_1, 0), xmm0);
815      __ movq(xmm1, Address(from, 8));
816      __ movq(Address(from, to_from, Address::times_1, 8), xmm1);
817      __ movq(xmm2, Address(from, 16));
818      __ movq(Address(from, to_from, Address::times_1, 16), xmm2);
819      __ movq(xmm3, Address(from, 24));
820      __ movq(Address(from, to_from, Address::times_1, 24), xmm3);
821      __ movq(xmm4, Address(from, 32));
822      __ movq(Address(from, to_from, Address::times_1, 32), xmm4);
823      __ movq(xmm5, Address(from, 40));
824      __ movq(Address(from, to_from, Address::times_1, 40), xmm5);
825      __ movq(xmm6, Address(from, 48));
826      __ movq(Address(from, to_from, Address::times_1, 48), xmm6);
827      __ movq(xmm7, Address(from, 56));
828      __ movq(Address(from, to_from, Address::times_1, 56), xmm7);
829    }
830
831    __ addl(from, 64);
832  __ BIND(L_copy_64_bytes);
833    __ subl(qword_count, 8);
834    __ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop);
835
836    if (UseUnalignedLoadStores && (UseAVX >= 2)) {
837      // clean upper bits of YMM registers
838      __ vzeroupper();
839    }
840    __ addl(qword_count, 8);
841    __ jccb(Assembler::zero, L_exit);
842    //
843    // length is too short, just copy qwords
844    //
845  __ BIND(L_copy_8_bytes);
846    __ movq(xmm0, Address(from, 0));
847    __ movq(Address(from, to_from, Address::times_1), xmm0);
848    __ addl(from, 8);
849    __ decrement(qword_count);
850    __ jcc(Assembler::greater, L_copy_8_bytes);
851  __ BIND(L_exit);
852  }
853
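  // Copy forward in 64-byte chunks using MMX registers, then copy whatever
  // remains one qword (8 bytes) at a time.
  //
  // Inputs:
  //   from        - source array address
  //   to_from     - destination array address minus 'from' (byte difference)
  //   qword_count - number of 8-byte elements to copy; a non-negative count
  //                 that the loops below count down
  //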
  // Emits a forward copy of qword_count 8-byte elements using MMX registers.
  //
  //   from        - register with the source address; advanced as data is
  //                 copied
  //   to_from     - register with (destination - source); every store goes to
  //                 from + to_from + offset
  //   qword_count - register with the number of 8-byte elements.  The code
  //                 counts this value down, so it is consumed as a
  //                 non-negative count.
  //
  // emms is emitted on every path through this code, including the one where
  // nothing is copied.
  void mmx_copy_forward(Register from, Register to_from, Register qword_count) {
    assert( VM_Version::supports_mmx(), "supported cpu only" );
    Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
    // Copy 64-byte chunks
    // (the loop is entered at its condition check, L_copy_64_bytes)
    __ jmpb(L_copy_64_bytes);
    __ align(OptoLoopAlignment);
  __ BIND(L_copy_64_bytes_loop);
    // Eight qwords per iteration through mmx0..mmx7; loads and stores are
    // interleaved, each store following its load by a few instructions.
    __ movq(mmx0, Address(from, 0));
    __ movq(mmx1, Address(from, 8));
    __ movq(mmx2, Address(from, 16));
    __ movq(Address(from, to_from, Address::times_1, 0), mmx0);
    __ movq(mmx3, Address(from, 24));
    __ movq(Address(from, to_from, Address::times_1, 8), mmx1);
    __ movq(mmx4, Address(from, 32));
    __ movq(Address(from, to_from, Address::times_1, 16), mmx2);
    __ movq(mmx5, Address(from, 40));
    __ movq(Address(from, to_from, Address::times_1, 24), mmx3);
    __ movq(mmx6, Address(from, 48));
    __ movq(Address(from, to_from, Address::times_1, 32), mmx4);
    __ movq(mmx7, Address(from, 56));
    __ movq(Address(from, to_from, Address::times_1, 40), mmx5);
    __ movq(Address(from, to_from, Address::times_1, 48), mmx6);
    __ movq(Address(from, to_from, Address::times_1, 56), mmx7);
    __ addptr(from, 64);
  __ BIND(L_copy_64_bytes);
    // Loop while at least eight qwords are left; afterwards undo the last
    // subtraction, which leaves the remaining qword count (0..7).
    __ subl(qword_count, 8);
    __ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop);
    __ addl(qword_count, 8);
    __ jccb(Assembler::zero, L_exit);
    //
    // length is too short, just copy qwords
    //
  __ BIND(L_copy_8_bytes);
    __ movq(mmx0, Address(from, 0));
    __ movq(Address(from, to_from, Address::times_1), mmx0);
    __ addptr(from, 8);
    __ decrement(qword_count);
    __ jcc(Assembler::greater, L_copy_8_bytes);
  __ BIND(L_exit);
    __ emms();
  }
902  // Generate a forward (low-to-high address) array copy stub.
     //
     // Generator arguments:
     //   t                  - element type; T_OBJECT adds the GC pre/post barriers,
     //                        T_BYTE and T_SHORT add the sub-dword head/tail copies
     //   aligned            - if false (and UseUnalignedLoadStores is off), byte and
     //                        short copies first align the source to 4 bytes
     //   sf                 - element scale factor (shift = times_ptr - sf)
     //   entry              - if not NULL, receives the pc just after the argument
     //                        loads (entry point used by the conjoint stub)
     //   name               - stub name handed to StubCodeMark
     //   dest_uninitialized - passed on to gen_write_ref_array_pre_barrier
     //
     // The generated stub loads from, to and count from the stack into
     // rsi, rdi and rcx and returns 0 in rax.
     //
     // Returns the start address of the generated code.
903  address generate_disjoint_copy(BasicType t, bool aligned,
904                                 Address::ScaleFactor sf,
905                                 address* entry, const char *name,
906                                 bool dest_uninitialized = false) {
907    __ align(CodeEntryAlignment);
908    StubCodeMark mark(this, "StubRoutines", name);
909    address start = __ pc();
910
911    Label L_0_count, L_exit, L_skip_align1, L_skip_align2, L_copy_byte;
912    Label L_copy_2_bytes, L_copy_4_bytes, L_copy_64_bytes;
913
914    int shift = Address::times_ptr - sf; // 1<<shift elements make up one 4-byte dword
915
916    const Register from     = rsi;  // source array address
917    const Register to       = rdi;  // destination array address
918    const Register count    = rcx;  // elements count
919    const Register to_from  = to;   // (to - from)
920    const Register saved_to = rdx;  // saved destination array address
921
922    __ enter(); // required for proper stackwalking of RuntimeStub frame
923    __ push(rsi);
924    __ push(rdi);
925    __ movptr(from , Address(rsp, 12+ 4));
926    __ movptr(to   , Address(rsp, 12+ 8));
927    __ movl(count, Address(rsp, 12+ 12));
928
929    if (entry != NULL) {
930      *entry = __ pc(); // Entry point from conjoint arraycopy stub.
931      BLOCK_COMMENT("Entry:");
932    }
933
934    if (t == T_OBJECT) {
935      __ testl(count, count);
936      __ jcc(Assembler::zero, L_0_count); // empty oop copy skips both barriers
937      gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
938      __ mov(saved_to, to);          // save 'to'
939    }
940
941    __ subptr(to, from); // to --> to_from
942    __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
943    __ jcc(Assembler::below, L_copy_4_bytes); // use unsigned cmp
944    if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
945      // align source address at 4 bytes address boundary
946      if (t == T_BYTE) {
947        // One byte misalignment happens only for byte arrays
948        __ testl(from, 1);
949        __ jccb(Assembler::zero, L_skip_align1);
950        __ movb(rax, Address(from, 0));
951        __ movb(Address(from, to_from, Address::times_1, 0), rax);
952        __ increment(from);
953        __ decrement(count);
954      __ BIND(L_skip_align1);
955      }
956      // Two bytes misalignment happens only for byte and short (char) arrays
957      __ testl(from, 2);
958      __ jccb(Assembler::zero, L_skip_align2);
959      __ movw(rax, Address(from, 0));
960      __ movw(Address(from, to_from, Address::times_1, 0), rax);
961      __ addptr(from, 2);
962      __ subl(count, 1<<(shift-1)); // one 2-byte word worth of elements
963    __ BIND(L_skip_align2);
964    }
965    if (!VM_Version::supports_mmx()) {
966      __ mov(rax, count);      // save 'count'
967      __ shrl(count, shift); // dword count
968      __ addptr(to_from, from);// restore 'to'
969      __ rep_mov();
970      __ subptr(to_from, from);// restore 'to_from'
971      __ mov(count, rax);      // restore 'count'
972      __ jmpb(L_copy_2_bytes); // all dwords were copied
973    } else {
974      if (!UseUnalignedLoadStores) {
975        // align to 8 bytes, we know we are 4 byte aligned to start
976        __ testptr(from, 4);
977        __ jccb(Assembler::zero, L_copy_64_bytes);
978        __ movl(rax, Address(from, 0));
979        __ movl(Address(from, to_from, Address::times_1, 0), rax);
980        __ addptr(from, 4);
981        __ subl(count, 1<<shift);
982      }
983    __ BIND(L_copy_64_bytes);
984      __ mov(rax, count);
985      __ shrl(rax, shift+1);  // 8 bytes chunk count
986      //
987      // Copy 8-byte chunks through XMM or MMX registers, 8 per iteration of the loop
988      //
989      if (UseXMMForArrayCopy) {
990        xmm_copy_forward(from, to_from, rax);
991      } else {
992        mmx_copy_forward(from, to_from, rax);
993      }
994    }
995    // copy trailing dword
996  __ BIND(L_copy_4_bytes);
997    __ testl(count, 1<<shift);
998    __ jccb(Assembler::zero, L_copy_2_bytes);
999    __ movl(rax, Address(from, 0));
1000    __ movl(Address(from, to_from, Address::times_1, 0), rax);
1001    if (t == T_BYTE || t == T_SHORT) {
1002      __ addptr(from, 4);
1003    __ BIND(L_copy_2_bytes);
1004      // copy trailing word
1005      __ testl(count, 1<<(shift-1));
1006      __ jccb(Assembler::zero, L_copy_byte);
1007      __ movw(rax, Address(from, 0));
1008      __ movw(Address(from, to_from, Address::times_1, 0), rax);
1009      if (t == T_BYTE) {
1010        __ addptr(from, 2);
1011      __ BIND(L_copy_byte);
1012        // copy trailing byte
1013        __ testl(count, 1);
1014        __ jccb(Assembler::zero, L_exit);
1015        __ movb(rax, Address(from, 0));
1016        __ movb(Address(from, to_from, Address::times_1, 0), rax);
1017      __ BIND(L_exit);
1018      } else {
1019      __ BIND(L_copy_byte); // no byte tail for 2-byte elements; bind the label here
1020      }
1021    } else {
1022    __ BIND(L_copy_2_bytes); // no sub-dword tail for this element type; bind the label here
1023    }
1024
1025    if (t == T_OBJECT) {
1026      __ movl(count, Address(rsp, 12+12)); // reread 'count'
1027      __ mov(to, saved_to); // restore 'to'
1028      gen_write_ref_array_post_barrier(to, count);
1029    __ BIND(L_0_count);
1030    }
1031    inc_copy_counter_np(t);
1032    __ pop(rdi);
1033    __ pop(rsi);
1034    __ leave(); // required for proper stackwalking of RuntimeStub frame
1035    __ xorptr(rax, rax); // return 0
1036    __ ret(0);
1037    return start;
1038  }
1039
1040  // Generate an array fill stub for element type t.
      //
      //   t, aligned - forwarded unchanged to the assembler's generate_fill
      //   name       - stub name handed to StubCodeMark
      //
      // The generated stub loads its three stack arguments into rdi (to),
      // rdx (value) and rsi (count); rax and xmm0 are passed as the remaining
      // generate_fill operands.  Returns the start address of the generated code.
1041  address generate_fill(BasicType t, bool aligned, const char *name) {
1042    __ align(CodeEntryAlignment);
1043    StubCodeMark mark(this, "StubRoutines", name);
1044    address start = __ pc();
1045
1046    BLOCK_COMMENT("Entry:");
1047
1048    const Register to       = rdi;  // array address to fill -- NOTE(review): was labelled 'source'; confirm
1049    const Register value    = rdx;  // value
1050    const Register count    = rsi;  // elements count
1051
1052    __ enter(); // required for proper stackwalking of RuntimeStub frame
1053    __ push(rsi);
1054    __ push(rdi);
1055    __ movptr(to   , Address(rsp, 12+ 4));
1056    __ movl(value, Address(rsp, 12+ 8));
1057    __ movl(count, Address(rsp, 12+ 12));
1058
1059    __ generate_fill(t, aligned, to, value, count, rax, xmm0);
1060
1061    __ pop(rdi);
1062    __ pop(rsi);
1063    __ leave(); // required for proper stackwalking of RuntimeStub frame
1064    __ ret(0);
1065    return start;
1066  }
1067  // Generate an array copy stub for possibly overlapping arrays.
      //
      // Generator arguments:
      //   t                  - element type; T_OBJECT adds the GC pre/post barriers,
      //                        T_BYTE and T_SHORT add the sub-dword copies
      //   aligned            - not referenced in this generator body
      //   sf                 - element scale factor (shift = times_ptr - sf)
      //   nooverlap_target   - code address jumped to when dst <= src or
      //                        dst >= src + count * elem_size; it expects its
      //                        arguments in rsi and rdi
      //   entry              - if not NULL, receives the pc just after the argument
      //                        loads (entry point used by the generic stub)
      //   name               - stub name handed to StubCodeMark
      //   dest_uninitialized - passed on to gen_write_ref_array_pre_barrier
      //
      // When the arrays do overlap the generated code copies from high to low
      // addresses and returns 0 in rax.
      //
      // Returns the start address of the generated code.
1068  address generate_conjoint_copy(BasicType t, bool aligned,
1069                                 Address::ScaleFactor sf,
1070                                 address nooverlap_target,
1071                                 address* entry, const char *name,
1072                                 bool dest_uninitialized = false) {
1073    __ align(CodeEntryAlignment);
1074    StubCodeMark mark(this, "StubRoutines", name);
1075    address start = __ pc();
1076
1077    Label L_0_count, L_exit, L_skip_align1, L_skip_align2, L_copy_byte;
1078    Label L_copy_2_bytes, L_copy_4_bytes, L_copy_8_bytes, L_copy_8_bytes_loop;
1079
1080    int shift = Address::times_ptr - sf; // 1<<shift elements make up one 4-byte dword
1081
1082    const Register src   = rax;  // source array address
1083    const Register dst   = rdx;  // destination array address
1084    const Register from  = rsi;  // source array address
1085    const Register to    = rdi;  // destination array address
1086    const Register count = rcx;  // elements count
1087    const Register end   = rax;  // array end address (same register as src)
1088
1089    __ enter(); // required for proper stackwalking of RuntimeStub frame
1090    __ push(rsi);
1091    __ push(rdi);
1092    __ movptr(src  , Address(rsp, 12+ 4));   // from
1093    __ movptr(dst  , Address(rsp, 12+ 8));   // to
1094    __ movl2ptr(count, Address(rsp, 12+12)); // count
1095
1096    if (entry != NULL) {
1097      *entry = __ pc(); // Entry point from generic arraycopy stub.
1098      BLOCK_COMMENT("Entry:");
1099    }
1100
1101    // nooverlap_target expects arguments in rsi and rdi.
1102    __ mov(from, src);
1103    __ mov(to  , dst);
1104
1105    // arrays overlap test: dispatch to disjoint stub if necessary.
1106    RuntimeAddress nooverlap(nooverlap_target);
1107    __ cmpptr(dst, src);
1108    __ lea(end, Address(src, count, sf, 0)); // src + count * elem_size
1109    __ jump_cc(Assembler::belowEqual, nooverlap); // dst <= src (tests the cmpptr above)
1110    __ cmpptr(dst, end);
1111    __ jump_cc(Assembler::aboveEqual, nooverlap); // dst starts at or past the source end
1112
1113    if (t == T_OBJECT) {
1114      __ testl(count, count);
1115      __ jcc(Assembler::zero, L_0_count); // empty oop copy skips both barriers
1116      gen_write_ref_array_pre_barrier(dst, count, dest_uninitialized);
1117    }
1118
1119    // copy from high to low
1120    __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
1121    __ jcc(Assembler::below, L_copy_4_bytes); // use unsigned cmp
1122    if (t == T_BYTE || t == T_SHORT) {
1123      // Align the end of destination array at 4 bytes address boundary
1124      __ lea(end, Address(dst, count, sf, 0));
1125      if (t == T_BYTE) {
1126        // One byte misalignment happens only for byte arrays
1127        __ testl(end, 1);
1128        __ jccb(Assembler::zero, L_skip_align1);
1129        __ decrement(count);
1130        __ movb(rdx, Address(from, count, sf, 0)); // rdx (dst) becomes a scratch register; 'to' still holds the destination
1131        __ movb(Address(to, count, sf, 0), rdx);
1132      __ BIND(L_skip_align1);
1133      }
1134      // Two bytes misalignment happens only for byte and short (char) arrays
1135      __ testl(end, 2);
1136      __ jccb(Assembler::zero, L_skip_align2);
1137      __ subptr(count, 1<<(shift-1));
1138      __ movw(rdx, Address(from, count, sf, 0));
1139      __ movw(Address(to, count, sf, 0), rdx);
1140    __ BIND(L_skip_align2);
1141      __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
1142      __ jcc(Assembler::below, L_copy_4_bytes);
1143    }
1144
1145    if (!VM_Version::supports_mmx()) {
1146      __ std();
1147      __ mov(rax, count); // Save 'count'
1148      __ mov(rdx, to);    // Save 'to'
1149      __ lea(rsi, Address(from, count, sf, -4)); // address of the last dword of the source
1150      __ lea(rdi, Address(to  , count, sf, -4)); // address of the last dword of the destination
1151      __ shrptr(count, shift); // dword count
1152      __ rep_mov();
1153      __ cld();
1154      __ mov(count, rax); // restore 'count'
1155      __ andl(count, (1<<shift)-1);      // mask the number of rest elements
1156      __ movptr(from, Address(rsp, 12+4)); // reread 'from'
1157      __ mov(to, rdx);   // restore 'to'
1158      __ jmpb(L_copy_2_bytes); // all dword were copied
1159   } else {
1160      // Align to 8 bytes the end of array. It is aligned to 4 bytes already.
1161      __ testptr(end, 4);
1162      __ jccb(Assembler::zero, L_copy_8_bytes);
1163      __ subl(count, 1<<shift);
1164      __ movl(rdx, Address(from, count, sf, 0));
1165      __ movl(Address(to, count, sf, 0), rdx);
1166      __ jmpb(L_copy_8_bytes);
1167
1168      __ align(OptoLoopAlignment);
1169      // Move 8 bytes
1170    __ BIND(L_copy_8_bytes_loop);
1171      if (UseXMMForArrayCopy) {
1172        __ movq(xmm0, Address(from, count, sf, 0));
1173        __ movq(Address(to, count, sf, 0), xmm0);
1174      } else {
1175        __ movq(mmx0, Address(from, count, sf, 0));
1176        __ movq(Address(to, count, sf, 0), mmx0);
1177      }
1178    __ BIND(L_copy_8_bytes);
1179      __ subl(count, 2<<shift);
1180      __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
1181      __ addl(count, 2<<shift); // undo the last subtraction to get the leftover element count
1182      if (!UseXMMForArrayCopy) {
1183        __ emms();
1184      }
1185    }
1186  __ BIND(L_copy_4_bytes);
1187    // copy prefix dword
1188    __ testl(count, 1<<shift);
1189    __ jccb(Assembler::zero, L_copy_2_bytes);
1190    __ movl(rdx, Address(from, count, sf, -4));
1191    __ movl(Address(to, count, sf, -4), rdx);
1192
1193    if (t == T_BYTE || t == T_SHORT) {
1194        __ subl(count, (1<<shift));
1195      __ BIND(L_copy_2_bytes);
1196        // copy prefix word
1197        __ testl(count, 1<<(shift-1));
1198        __ jccb(Assembler::zero, L_copy_byte);
1199        __ movw(rdx, Address(from, count, sf, -2));
1200        __ movw(Address(to, count, sf, -2), rdx);
1201        if (t == T_BYTE) {
1202          __ subl(count, 1<<(shift-1));
1203        __ BIND(L_copy_byte);
1204          // copy prefix byte
1205          __ testl(count, 1);
1206          __ jccb(Assembler::zero, L_exit);
1207          __ movb(rdx, Address(from, 0));
1208          __ movb(Address(to, 0), rdx);
1209        __ BIND(L_exit);
1210        } else {
1211        __ BIND(L_copy_byte); // no byte prefix for 2-byte elements; bind the label here
1212        }
1213    } else {
1214    __ BIND(L_copy_2_bytes); // no sub-dword prefix for this element type; bind the label here
1215    }
1216    if (t == T_OBJECT) {
1217      __ movl2ptr(count, Address(rsp, 12+12)); // reread count
1218      gen_write_ref_array_post_barrier(to, count);
1219    __ BIND(L_0_count);
1220    }
1221    inc_copy_counter_np(t);
1222    __ pop(rdi);
1223    __ pop(rsi);
1224    __ leave(); // required for proper stackwalking of RuntimeStub frame
1225    __ xorptr(rax, rax); // return 0
1226    __ ret(0);
1227    return start;
1228  }
1229
1230  // Generate a forward copy stub for 8-byte (long) elements.
      //
      //   entry - receives the pc just after the argument loads; it is written
      //           unconditionally, so it must not be NULL
      //   name  - stub name handed to StubCodeMark
      //
      // The generated stub loads from, to and count from the stack into
      // rax, rdx and rcx.  It copies through XMM or MMX registers when MMX is
      // supported and through fild_d/fistp_d otherwise, then returns 0 in rax.
      //
      // Returns the start address of the generated code.
1231  address generate_disjoint_long_copy(address* entry, const char *name) {
1232    __ align(CodeEntryAlignment);
1233    StubCodeMark mark(this, "StubRoutines", name);
1234    address start = __ pc();
1235
1236    Label L_copy_8_bytes, L_copy_8_bytes_loop;
1237    const Register from       = rax;  // source array address
1238    const Register to         = rdx;  // destination array address
1239    const Register count      = rcx;  // elements count
1240    const Register to_from    = rdx;  // (to - from)
1241
1242    __ enter(); // required for proper stackwalking of RuntimeStub frame
1243    __ movptr(from , Address(rsp, 8+0));       // from
1244    __ movptr(to   , Address(rsp, 8+4));       // to
1245    __ movl2ptr(count, Address(rsp, 8+8));     // count
1246
1247    *entry = __ pc(); // Entry point from conjoint arraycopy stub.
1248    BLOCK_COMMENT("Entry:");
1249
1250    __ subptr(to, from); // to --> to_from
1251    if (VM_Version::supports_mmx()) {
1252      if (UseXMMForArrayCopy) {
1253        xmm_copy_forward(from, to_from, count);
1254      } else {
1255        mmx_copy_forward(from, to_from, count);
1256      }
1257    } else {
1258      __ jmpb(L_copy_8_bytes); // enter the loop at its count test
1259      __ align(OptoLoopAlignment);
1260    __ BIND(L_copy_8_bytes_loop);
1261      __ fild_d(Address(from, 0));
1262      __ fistp_d(Address(from, to_from, Address::times_1)); // from + to_from is the destination
1263      __ addptr(from, 8);
1264    __ BIND(L_copy_8_bytes);
1265      __ decrement(count);
1266      __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
1267    }
1268    inc_copy_counter_np(T_LONG);
1269    __ leave(); // required for proper stackwalking of RuntimeStub frame
1270    __ xorptr(rax, rax); // return 0
1271    __ ret(0);
1272    return start;
1273  }
1274  // Generate a copy stub for possibly overlapping arrays of 8-byte (long) elements.
      //
      //   nooverlap_target - code address jumped to when to <= from or
      //                      to >= from + count * 8
      //   entry            - receives the pc just after the argument loads; it is
      //                      written unconditionally, so it must not be NULL
      //   name             - stub name handed to StubCodeMark
      //
      // Otherwise the generated code copies one element at a time from the
      // highest index down to index 0 and returns 0 in rax.
      //
      // Returns the start address of the generated code.
1275  address generate_conjoint_long_copy(address nooverlap_target,
1276                                      address* entry, const char *name) {
1277    __ align(CodeEntryAlignment);
1278    StubCodeMark mark(this, "StubRoutines", name);
1279    address start = __ pc();
1280
1281    Label L_copy_8_bytes, L_copy_8_bytes_loop;
1282    const Register from       = rax;  // source array address
1283    const Register to         = rdx;  // destination array address
1284    const Register count      = rcx;  // elements count
1285    const Register end_from   = rax;  // source array end address
1286
1287    __ enter(); // required for proper stackwalking of RuntimeStub frame
1288    __ movptr(from , Address(rsp, 8+0));       // from
1289    __ movptr(to   , Address(rsp, 8+4));       // to
1290    __ movl2ptr(count, Address(rsp, 8+8));     // count
1291
1292    *entry = __ pc(); // Entry point from generic arraycopy stub.
1293    BLOCK_COMMENT("Entry:");
1294
1295    // arrays overlap test
1296    __ cmpptr(to, from);
1297    RuntimeAddress nooverlap(nooverlap_target);
1298    __ jump_cc(Assembler::belowEqual, nooverlap);
1299    __ lea(end_from, Address(from, count, Address::times_8, 0));
1300    __ cmpptr(to, end_from);
1301    __ movptr(from, Address(rsp, 8));  // reload from: end_from shares rax with it
1302    __ jump_cc(Assembler::aboveEqual, nooverlap);
1303
1304    __ jmpb(L_copy_8_bytes); // enter the loop at its count test
1305
1306    __ align(OptoLoopAlignment);
1307  __ BIND(L_copy_8_bytes_loop);
1308    if (VM_Version::supports_mmx()) {
1309      if (UseXMMForArrayCopy) {
1310        __ movq(xmm0, Address(from, count, Address::times_8));
1311        __ movq(Address(to, count, Address::times_8), xmm0);
1312      } else {
1313        __ movq(mmx0, Address(from, count, Address::times_8));
1314        __ movq(Address(to, count, Address::times_8), mmx0);
1315      }
1316    } else {
1317      __ fild_d(Address(from, count, Address::times_8));
1318      __ fistp_d(Address(to, count, Address::times_8));
1319    }
1320  __ BIND(L_copy_8_bytes);
1321    __ decrement(count); // count now indexes the next element to copy
1322    __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
1323
1324    if (VM_Version::supports_mmx() && !UseXMMForArrayCopy) {
1325      __ emms(); // only the MMX variant of the loop needs this
1326    }
1327    inc_copy_counter_np(T_LONG);
1328    __ leave(); // required for proper stackwalking of RuntimeStub frame
1329    __ xorptr(rax, rax); // return 0
1330    __ ret(0);
1331    return start;
1332  }
1333
1334
1335  // Helper for generating a dynamic type check.
1336  // The sub_klass must be one of {rbx, rdx, rsi}.
1337  // The temp is killed.
      //
      //   sub_klass               - register holding the klass being tested
      //   super_check_offset_addr - address of the super_check_offset value
      //   super_klass_addr        - address of the super klass value
      //   temp                    - scratch register, must differ from sub_klass
      //   L_success, L_failure    - branch targets; for the branches emitted via
      //                             LOCAL_JCC a NULL pointer redirects the branch
      //                             to the local fall-through label instead
1338  void generate_type_check(Register sub_klass,
1339                           Address& super_check_offset_addr,
1340                           Address& super_klass_addr,
1341                           Register temp,
1342                           Label* L_success, Label* L_failure) {
1343    BLOCK_COMMENT("type_check:");
1344
1345    Label L_fallthrough;
1346#define LOCAL_JCC(assembler_con, label_ptr)                             \
1347    if (label_ptr != NULL)  __ jcc(assembler_con, *(label_ptr));        \
1348    else                    __ jcc(assembler_con, L_fallthrough) /*omit semi*/
1349
1350    // The following is a strange variation of the fast path which requires
1351    // one less register, because needed values are on the argument stack.
1352    // __ check_klass_subtype_fast_path(sub_klass, *super_klass*, temp,
1353    //                                  L_success, L_failure, NULL);
1354    assert_different_registers(sub_klass, temp);
1355
1356    int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1357
1358    // if the pointers are equal, we are done (e.g., String[] elements)
1359    __ cmpptr(sub_klass, super_klass_addr);
1360    LOCAL_JCC(Assembler::equal, L_success);
1361
1362    // check the supertype display:
1363    __ movl2ptr(temp, super_check_offset_addr);
1364    Address super_check_addr(sub_klass, temp, Address::times_1, 0);
1365    __ movptr(temp, super_check_addr); // load displayed supertype
1366    __ cmpptr(temp, super_klass_addr); // test the super type
1367    LOCAL_JCC(Assembler::equal, L_success);
1368
1369    // if it was a primary super, we can just fail immediately
1370    __ cmpl(super_check_offset_addr, sc_offset);
1371    LOCAL_JCC(Assembler::notEqual, L_failure);
1372
1373    // The repne_scan instruction uses fixed registers, which will get spilled.
1374    // We happen to know this works best when super_klass is in rax.
1375    Register super_klass = temp; // the super klass value is loaded into temp for the slow path
1376    __ movptr(super_klass, super_klass_addr);
1377    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg,
1378                                     L_success, L_failure);
1379
1380    __ bind(L_fallthrough);
1381
1382    if (L_success == NULL) { BLOCK_COMMENT("L_success:"); }
1383    if (L_failure == NULL) { BLOCK_COMMENT("L_failure:"); }
1384
1385#undef LOCAL_JCC
1386  }
1387
1388  //
1389  //  Generate checkcasting array copy stub
1390  //
1391  //  Input:
1392  //    4(rsp)   - source array address
1393  //    8(rsp)   - destination array address
1394  //   12(rsp)   - element count, can be zero
1395  //   16(rsp)   - size_t ckoff (super_check_offset)
1396  //   20(rsp)   - oop ckval (super_klass)
1397  //
1398  //  Output:
1399  //    rax, ==  0  -  success
1400  //    rax, == -1^K - failure, where K is partial transfer count
1401  //
      //  Generator arguments:
      //    name               - stub name handed to StubCodeMark
      //    entry              - if not NULL, receives the pc just after the
      //                         argument loads
      //    dest_uninitialized - passed on to gen_write_ref_array_pre_barrier
      //
      //  Returns the start address of the generated code.
      //
1402  address generate_checkcast_copy(const char *name, address* entry, bool dest_uninitialized = false) {
1403    __ align(CodeEntryAlignment);
1404    StubCodeMark mark(this, "StubRoutines", name);
1405    address start = __ pc();
1406
1407    Label L_load_element, L_store_element, L_do_card_marks, L_done;
1408
1409    // register use:
1410    //  rax, rdx, rcx -- loop control (end_from, end_to, count)
1411    //  rdi, rsi      -- element access (oop, klass)
1412    //  rbx,           -- temp
1413    const Register from       = rax;    // source array address
1414    const Register to         = rdx;    // destination array address
1415    const Register length     = rcx;    // elements count
1416    const Register elem       = rdi;    // each oop copied
1417    const Register elem_klass = rsi;    // each elem._klass (sub_klass)
1418    const Register temp       = rbx;    // lone remaining temp
1419
1420    __ enter(); // required for proper stackwalking of RuntimeStub frame
1421
1422    __ push(rsi);
1423    __ push(rdi);
1424    __ push(rbx);
1425
      // The offsets below add 16 to the entry-relative offsets listed in the
      // header, to account for the words pushed above.
1426    Address   from_arg(rsp, 16+ 4);     // from
1427    Address     to_arg(rsp, 16+ 8);     // to
1428    Address length_arg(rsp, 16+12);     // elements count
1429    Address  ckoff_arg(rsp, 16+16);     // super_check_offset
1430    Address  ckval_arg(rsp, 16+20);     // super_klass
1431
1432    // Load up:
1433    __ movptr(from,     from_arg);
1434    __ movptr(to,         to_arg);
1435    __ movl2ptr(length, length_arg);
1436
1437    if (entry != NULL) {
1438      *entry = __ pc(); // Entry point from generic arraycopy stub.
1439      BLOCK_COMMENT("Entry:");
1440    }
1441
1442    //---------------------------------------------------------------
1443    // Assembler stub will be used for this call to arraycopy
1444    // if the two arrays are subtypes of Object[] but the
1445    // destination array type is not equal to or a supertype
1446    // of the source type.  Each element must be separately
1447    // checked.
1448
1449    // Loop-invariant addresses.  They are exclusive end pointers.
1450    Address end_from_addr(from, length, Address::times_ptr, 0);
1451    Address   end_to_addr(to,   length, Address::times_ptr, 0);
1452
1453    Register end_from = from;           // re-use
1454    Register end_to   = to;             // re-use
1455    Register count    = length;         // re-use
1456
1457    // Loop-variant addresses.  They assume post-incremented count < 0.
1458    Address from_element_addr(end_from, count, Address::times_ptr, 0);
1459    Address   to_element_addr(end_to,   count, Address::times_ptr, 0);
1460    Address elem_klass_addr(elem, oopDesc::klass_offset_in_bytes());
1461
1462    // Copy from low to high addresses, indexed from the end of each array.
1463    gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
1464    __ lea(end_from, end_from_addr);
1465    __ lea(end_to,   end_to_addr);
1466    assert(length == count, "");        // else fix next line:
1467    __ negptr(count);                   // negate and test the length
1468    __ jccb(Assembler::notZero, L_load_element);
1469
1470    // Empty array:  Nothing to do.
1471    __ xorptr(rax, rax);                  // return 0 on (trivial) success
1472    __ jmp(L_done);
1473
1474    // ======== begin loop ========
1475    // (Loop is rotated; its entry is L_load_element.)
1476    // Loop control:
1477    //   for (count = -count; count != 0; count++)
1478    // Base pointers end_from, end_to are exclusive end pointers (biased by count elements, times_ptr).
1479    __ align(OptoLoopAlignment);
1480
1481    __ BIND(L_store_element);
1482    __ movptr(to_element_addr, elem);     // store the oop
1483    __ increment(count);                // increment the count toward zero
1484    __ jccb(Assembler::zero, L_do_card_marks);
1485
1486    // ======== loop entry is here ========
1487    __ BIND(L_load_element);
1488    __ movptr(elem, from_element_addr);   // load the oop
1489    __ testptr(elem, elem);
1490    __ jccb(Assembler::zero, L_store_element); // a null element is stored without a type check
1491
1492    // (Could do a trick here:  Remember last successful non-null
1493    // element stored and make a quick oop equality check on it.)
1494
1495    __ movptr(elem_klass, elem_klass_addr); // query the object klass
1496    generate_type_check(elem_klass, ckoff_arg, ckval_arg, temp,
1497                        &L_store_element, NULL);
1498    // (On fall-through, we have failed the element type check.)
1499    // ======== end loop ========
1500
1501    // It was a real error; we must depend on the caller to finish the job.
1502    // Register "count" = -1 * number of *remaining* oops, length_arg = *total* oops.
1503    // Emit GC store barriers for the oops we have copied (length_arg + count),
1504    // and report their number to the caller.
1505    assert_different_registers(to, count, rax);
1506    Label L_post_barrier;
1507    __ addl(count, length_arg);         // transfers = (length - remaining)
1508    __ movl2ptr(rax, count);            // save the value
1509    __ notptr(rax);                     // report (-1^K) to caller (does not affect flags)
1510    __ jccb(Assembler::notZero, L_post_barrier); // tests the result of the addl above
1511    __ jmp(L_done); // K == 0, nothing was copied, skip post barrier
1512
1513    // Come here on success only.
1514    __ BIND(L_do_card_marks);
1515    __ xorptr(rax, rax);                // return 0 on success
1516    __ movl2ptr(count, length_arg);
1517
1518    __ BIND(L_post_barrier);
1519    __ movptr(to, to_arg);              // reload
1520    gen_write_ref_array_post_barrier(to, count);
1521
1522    // Common exit point (success or failure).
1523    __ BIND(L_done);
1524    __ pop(rbx);
1525    __ pop(rdi);
1526    __ pop(rsi);
1527    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1528    __ leave(); // required for proper stackwalking of RuntimeStub frame
1529    __ ret(0);
1530
1531    return start;
1532  }
1533
1534  //
1535  //  Generate 'unsafe' array copy stub
1536  //  Though just as safe as the other stubs, it takes an unscaled
1537  //  size_t argument instead of an element count.
1538  //
1539  //  Input:
1540  //    4(rsp)   - source array address
1541  //    8(rsp)   - destination array address
1542  //   12(rsp)   - byte count, can be zero
1543  //
1544  //  Output:
1545  //    rax, ==  0  -  success
1546  //    rax, == -1  -  need to call System.arraycopy
1547  //
1548  // Examines the alignment of the operands and dispatches
1549  // to a long, int, short, or byte copy loop.
1550  //
      //  Generator arguments:
      //    name             - stub name handed to StubCodeMark
      //    byte_copy_entry  - jump target when some operand has its low bit set
      //    short_copy_entry - jump target when all operands are 2-byte aligned
      //    int_copy_entry   - jump target when all operands are 4-byte aligned
      //    long_copy_entry  - jump target when all operands are 8-byte aligned
      //
      //  Returns the start address of the generated code.
      //
1551  address generate_unsafe_copy(const char *name,
1552                               address byte_copy_entry,
1553                               address short_copy_entry,
1554                               address int_copy_entry,
1555                               address long_copy_entry) {
1556
1557    Label L_long_aligned, L_int_aligned, L_short_aligned;
1558
1559    __ align(CodeEntryAlignment);
1560    StubCodeMark mark(this, "StubRoutines", name);
1561    address start = __ pc();
1562
1563    const Register from       = rax;  // source array address
1564    const Register to         = rdx;  // destination array address
1565    const Register count      = rcx;  // byte count, scaled to an element count before dispatch
1566
1567    __ enter(); // required for proper stackwalking of RuntimeStub frame
1568    __ push(rsi);
1569    __ push(rdi);
1570    Address  from_arg(rsp, 12+ 4);      // from
1571    Address    to_arg(rsp, 12+ 8);      // to
1572    Address count_arg(rsp, 12+12);      // byte count
1573
1574    // Load up:
1575    __ movptr(from ,  from_arg);
1576    __ movptr(to   ,    to_arg);
1577    __ movl2ptr(count, count_arg);
1578
1579    // bump this on entry, not on exit:
1580    inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1581
1582    const Register bits = rsi;
1583    __ mov(bits, from);
1584    __ orptr(bits, to);
1585    __ orptr(bits, count); // bits = from | to | count, so one test covers all three
1586
1587    __ testl(bits, BytesPerLong-1);
1588    __ jccb(Assembler::zero, L_long_aligned);
1589
1590    __ testl(bits, BytesPerInt-1);
1591    __ jccb(Assembler::zero, L_int_aligned);
1592
1593    __ testl(bits, BytesPerShort-1);
1594    __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry)); // count is already in bytes
1595
1596    __ BIND(L_short_aligned); // reached by fall-through; nothing in this function jumps here
1597    __ shrptr(count, LogBytesPerShort); // size => short_count
1598    __ movl(count_arg, count);          // update 'count'
1599    __ jump(RuntimeAddress(short_copy_entry));
1600
1601    __ BIND(L_int_aligned);
1602    __ shrptr(count, LogBytesPerInt); // size => int_count
1603    __ movl(count_arg, count);          // update 'count'
1604    __ jump(RuntimeAddress(int_copy_entry));
1605
1606    __ BIND(L_long_aligned);
1607    __ shrptr(count, LogBytesPerLong); // size => qword_count
1608    __ movl(count_arg, count);          // update 'count'
1609    __ pop(rdi); // Do pops here since jlong_arraycopy stub does not do it.
1610    __ pop(rsi);
1611    __ jump(RuntimeAddress(long_copy_entry));
1612
1613    return start;
1614  }
1615
1616
1617  // Perform range checks on the proposed arraycopy.
1618  // Smashes src_pos and dst_pos.  (Uses them up for temps.)
      //
      //   src, dst         - array oops whose length fields are read
      //   src_pos, dst_pos - start positions; overwritten with pos + length
      //   length           - address holding the element count
      //   L_failed         - branch target when either end position is above
      //                      the corresponding array length
1619  void arraycopy_range_checks(Register src,
1620                              Register src_pos,
1621                              Register dst,
1622                              Register dst_pos,
1623                              Address& length,
1624                              Label& L_failed) {
1625    BLOCK_COMMENT("arraycopy_range_checks:");
1626    const Register src_end = src_pos;   // source array end position
1627    const Register dst_end = dst_pos;   // destination array end position
1628    __ addl(src_end, length); // src_pos + length
1629    __ addl(dst_end, length); // dst_pos + length
1630
1631    //  if (src_pos + length > arrayOop(src)->length() ) FAIL;
1632    __ cmpl(src_end, Address(src, arrayOopDesc::length_offset_in_bytes()));
1633    __ jcc(Assembler::above, L_failed);
1634
1635    //  if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
1636    __ cmpl(dst_end, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1637    __ jcc(Assembler::above, L_failed);
1638
1639    BLOCK_COMMENT("arraycopy_range_checks done");
1640  }
1641
1642
1643  //
1644  //  Generate generic array copy stubs
1645  //
1646  //  Input:
1647  //     4(rsp)    -  src oop
1648  //     8(rsp)    -  src_pos
1649  //    12(rsp)    -  dst oop
1650  //    16(rsp)    -  dst_pos
1651  //    20(rsp)    -  element count
1652  //
1653  //  Output:
1654  //    rax, ==  0  -  success
1655  //    rax, == -1^K - failure, where K is partial transfer count
1656  //
1657  address generate_generic_copy(const char *name,
1658                                address entry_jbyte_arraycopy,
1659                                address entry_jshort_arraycopy,
1660                                address entry_jint_arraycopy,
1661                                address entry_oop_arraycopy,
1662                                address entry_jlong_arraycopy,
1663                                address entry_checkcast_arraycopy) {
        // Emits the generic arraycopy stub and returns its entry address.
        //
        // The stub validates the five stack arguments (checks (1)-(8) listed
        // below) and then tail-jumps into one of the specialised copy stubs
        // whose entry points are passed in:
        //   - typeArray source: dispatch on log2(element size) to the
        //     jbyte/jshort/jint/jlong entry,
        //   - objArray source, same klass as dst: entry_oop_arraycopy,
        //   - objArray source, different dst klass: array type check first,
        //     then either the plain oop copy or entry_checkcast_arraycopy.
        // Any failed check returns -1 in rax through L_failed.
1664    Label L_failed, L_failed_0, L_objArray;
1665
        // Pad with nops so that the 5-byte 'jmp L_failed' emitted below ends
        // exactly on a CodeEntryAlignment boundary; the stub entry can then
        // follow it directly (see the assert after the jmp).
1666    { int modulus = CodeEntryAlignment;
1667      int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
1668      int advance = target - (__ offset() % modulus);
1669      if (advance < 0)  advance += modulus;
1670      if (advance > 0)  __ nop(advance);
1671    }
1672    StubCodeMark mark(this, "StubRoutines", name);
1673
1674    // Short-hop target to L_failed.  Makes for denser prologue code.
1675    __ BIND(L_failed_0);
1676    __ jmp(L_failed);
1677    assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");
1678
1679    __ align(CodeEntryAlignment);
1680    address start = __ pc();
1681
1682    __ enter(); // required for proper stackwalking of RuntimeStub frame
1683    __ push(rsi);
1684    __ push(rdi);
1685
1686    // bump this on entry, not on exit:
1687    inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1688
1689    // Input values
        // (rsp-relative; the extra 12 covers the three words pushed since
        //  entry: the frame pointer saved by enter(), rsi and rdi)
1690    Address SRC     (rsp, 12+ 4);
1691    Address SRC_POS (rsp, 12+ 8);
1692    Address DST     (rsp, 12+12);
1693    Address DST_POS (rsp, 12+16);
1694    Address LENGTH  (rsp, 12+20);
1695
1696    //-----------------------------------------------------------------------
1697    // Assembler stub will be used for this call to arraycopy
1698    // if the following conditions are met:
1699    //
1700    // (1) src and dst must not be null.
1701    // (2) src_pos must not be negative.
1702    // (3) dst_pos must not be negative.
1703    // (4) length  must not be negative.
1704    // (5) src klass and dst klass should be the same and not NULL.
1705    // (6) src and dst should be arrays.
1706    // (7) src_pos + length must not exceed length of src.
1707    // (8) dst_pos + length must not exceed length of dst.
1708    //
1709
1710    const Register src     = rax;       // source array oop
1711    const Register src_pos = rsi;
1712    const Register dst     = rdx;       // destination array oop
1713    const Register dst_pos = rdi;
1714    const Register length  = rcx;       // transfer count
1715
1716    //  if (src == NULL) return -1;
1717    __ movptr(src, SRC);      // src oop
1718    __ testptr(src, src);
1719    __ jccb(Assembler::zero, L_failed_0);
1720
1721    //  if (src_pos < 0) return -1;
1722    __ movl2ptr(src_pos, SRC_POS);  // src_pos
1723    __ testl(src_pos, src_pos);
1724    __ jccb(Assembler::negative, L_failed_0);
1725
1726    //  if (dst == NULL) return -1;
1727    __ movptr(dst, DST);      // dst oop
1728    __ testptr(dst, dst);
1729    __ jccb(Assembler::zero, L_failed_0);
1730
1731    //  if (dst_pos < 0) return -1;
1732    __ movl2ptr(dst_pos, DST_POS);  // dst_pos
1733    __ testl(dst_pos, dst_pos);
1734    __ jccb(Assembler::negative, L_failed_0);
1735
1736    //  if (length < 0) return -1;
1737    __ movl2ptr(length, LENGTH);   // length
1738    __ testl(length, length);
1739    __ jccb(Assembler::negative, L_failed_0);
1740
1741    //  Load src->klass().  A NULL klass is not turned into a -1 return here;
        //  debug builds merely assert below that neither klass is NULL.
        //  rcx_src_klass aliases 'length' (both rcx): the length loaded above
        //  was only needed for the sign check and is re-read from LENGTH later.
1742    Address src_klass_addr(src, oopDesc::klass_offset_in_bytes());
1743    Address dst_klass_addr(dst, oopDesc::klass_offset_in_bytes());
1744    const Register rcx_src_klass = rcx;    // array klass
1745    __ movptr(rcx_src_klass, Address(src, oopDesc::klass_offset_in_bytes()));
1746
1747#ifdef ASSERT
1748    //  assert(src->klass() != NULL);
1749    BLOCK_COMMENT("assert klasses not null");
1750    { Label L1, L2;
1751      __ testptr(rcx_src_klass, rcx_src_klass);
1752      __ jccb(Assembler::notZero, L2);   // it is broken if klass is NULL
1753      __ bind(L1);
1754      __ stop("broken null klass");
1755      __ bind(L2);
1756      __ cmpptr(dst_klass_addr, (int32_t)NULL_WORD);
1757      __ jccb(Assembler::equal, L1);      // this would be broken also
1758      BLOCK_COMMENT("assert done");
1759    }
1760#endif //ASSERT
1761
1762    // Load layout helper (32-bits)
1763    //
1764    //  |array_tag|     | header_size | element_type |     |log2_element_size|
1765    // 32        30    24            16              8     2                 0
1766    //
1767    //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
1768    //
1769
1770    int lh_offset = in_bytes(Klass::layout_helper_offset());
1771    Address src_klass_lh_addr(rcx_src_klass, lh_offset);
1772
1773    // Handle objArrays completely differently...
1774    jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
1775    __ cmpl(src_klass_lh_addr, objArray_lh);
1776    __ jcc(Assembler::equal, L_objArray);
1777
1778    //  if (src->klass() != dst->klass()) return -1;
1779    __ cmpptr(rcx_src_klass, dst_klass_addr);
1780    __ jccb(Assembler::notEqual, L_failed_0);
1781
        // From here on rcx holds the layout helper instead of the klass.
1782    const Register rcx_lh = rcx;  // layout helper
1783    assert(rcx_lh == rcx_src_klass, "known alias");
1784    __ movl(rcx_lh, src_klass_lh_addr);
1785
1786    //  if (!src->is_Array()) return -1;
1787    __ cmpl(rcx_lh, Klass::_lh_neutral_value);
1788    __ jcc(Assembler::greaterEqual, L_failed_0); // signed cmp
1789
1790    // At this point, it is known to be a typeArray (array_tag 0x3).
1791#ifdef ASSERT
1792    { Label L;
1793      __ cmpl(rcx_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
1794      __ jcc(Assembler::greaterEqual, L); // signed cmp
1795      __ stop("must be a primitive array");
1796      __ bind(L);
1797    }
1798#endif
1799
        // The length is handed over as the LENGTH stack operand because rcx
        // currently holds the layout helper, not the element count.
1800    assert_different_registers(src, src_pos, dst, dst_pos, rcx_lh);
1801    arraycopy_range_checks(src, src_pos, dst, dst_pos, LENGTH, L_failed);
1802
1803    // TypeArrayKlass
1804    //
1805    // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
1806    // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
1807    //
1808    const Register rsi_offset = rsi; // array offset
1809    const Register src_array  = src; // src array offset
1810    const Register dst_array  = dst; // dst array offset
1811    const Register rdi_elsize = rdi; // log2 element size
1812
        // Extract the header size field from the layout helper and advance
        // both oops past the array header; then reduce rcx to log2(elsize).
1813    __ mov(rsi_offset, rcx_lh);
1814    __ shrptr(rsi_offset, Klass::_lh_header_size_shift);
1815    __ andptr(rsi_offset, Klass::_lh_header_size_mask);   // array_offset
1816    __ addptr(src_array, rsi_offset);  // src array offset
1817    __ addptr(dst_array, rsi_offset);  // dst array offset
1818    __ andptr(rcx_lh, Klass::_lh_log2_element_size_mask); // log2 elsize
1819
1820    // next registers should be set before the jump to corresponding stub
1821    const Register from       = src; // source array address
1822    const Register to         = dst; // destination array address
1823    const Register count      = rcx; // elements count
1824    // some of them should be duplicated on stack
        // (FROM, TO and COUNT are the same stack words as the incoming
        //  SRC, SRC_POS and DST slots: 12+4, 12+8 and 12+12)
1825#define FROM   Address(rsp, 12+ 4)
1826#define TO     Address(rsp, 12+ 8)   // Not used now
1827#define COUNT  Address(rsp, 12+12)   // Only for oop arraycopy
1828
        // src_pos/dst_pos were smashed by the range checks, so both are
        // re-read from the stack before being scaled.
1829    BLOCK_COMMENT("scale indexes to element size");
1830    __ movl2ptr(rsi, SRC_POS);  // src_pos
1831    __ shlptr(rsi);             // src_pos << rcx (log2 elsize)
1832    assert(src_array == from, "");
1833    __ addptr(from, rsi);       // from = src_array + SRC_POS << log2 elsize
1834    __ movl2ptr(rdi, DST_POS);  // dst_pos
1835    __ shlptr(rdi);             // dst_pos << rcx (log2 elsize)
1836    assert(dst_array == to, "");
1837    __ addptr(to,  rdi);        // to   = dst_array + DST_POS << log2 elsize
1838    __ movptr(FROM, from);      // src_addr
1839    __ mov(rdi_elsize, rcx_lh); // log2 elsize
1840    __ movl2ptr(count, LENGTH); // elements count
1841
1842    BLOCK_COMMENT("choose copy loop based on element size");
1843    __ cmpl(rdi_elsize, 0);
1844
1845    __ jump_cc(Assembler::equal, RuntimeAddress(entry_jbyte_arraycopy));
1846    __ cmpl(rdi_elsize, LogBytesPerShort);
1847    __ jump_cc(Assembler::equal, RuntimeAddress(entry_jshort_arraycopy));
1848    __ cmpl(rdi_elsize, LogBytesPerInt);
1849    __ jump_cc(Assembler::equal, RuntimeAddress(entry_jint_arraycopy));
        // Anything left is treated as a long copy; only debug builds verify it.
1850#ifdef ASSERT
1851    __ cmpl(rdi_elsize, LogBytesPerLong);
1852    __ jccb(Assembler::notEqual, L_failed);
1853#endif
1854    __ pop(rdi); // Do pops here since jlong_arraycopy stub does not do it.
1855    __ pop(rsi);
1856    __ jump(RuntimeAddress(entry_jlong_arraycopy));
1857
        // Common failure exit: rax = -1, undo the prologue, return.
1858  __ BIND(L_failed);
1859    __ xorptr(rax, rax);
1860    __ notptr(rax); // return -1
1861    __ pop(rdi);
1862    __ pop(rsi);
1863    __ leave(); // required for proper stackwalking of RuntimeStub frame
1864    __ ret(0);
1865
1866    // ObjArrayKlass
1867  __ BIND(L_objArray);
1868    // live at this point:  rcx_src_klass, src[_pos], dst[_pos]
1869
1870    Label L_plain_copy, L_checkcast_copy;
1871    //  test array classes for subtyping
1872    __ cmpptr(rcx_src_klass, dst_klass_addr); // usual case is exact equality
1873    __ jccb(Assembler::notEqual, L_checkcast_copy);
1874
1875    // Identically typed arrays can be copied without element-wise checks.
1876    assert_different_registers(src, src_pos, dst, dst_pos, rcx_src_klass);
1877    arraycopy_range_checks(src, src_pos, dst, dst_pos, LENGTH, L_failed);
1878
        // Also reached from the checkcast path once the array type check has
        // passed.  Everything except src/dst is reloaded from the stack.
1879  __ BIND(L_plain_copy);
1880    __ movl2ptr(count, LENGTH); // elements count
1881    __ movl2ptr(src_pos, SRC_POS);  // reload src_pos
1882    __ lea(from, Address(src, src_pos, Address::times_ptr,
1883                 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
1884    __ movl2ptr(dst_pos, DST_POS);  // reload dst_pos
1885    __ lea(to,   Address(dst, dst_pos, Address::times_ptr,
1886                 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
1887    __ movptr(FROM,  from);   // src_addr
1888    __ movptr(TO,    to);     // dst_addr
1889    __ movl(COUNT, count);  // count
1890    __ jump(RuntimeAddress(entry_oop_arraycopy));
1891
1892  __ BIND(L_checkcast_copy);
1893    // live at this point:  rcx_src_klass, dst[_pos], src[_pos]
1894    {
1895      // Handy offsets:
1896      int  ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
1897      int sco_offset = in_bytes(Klass::super_check_offset_offset());
1898
1899      Register rsi_dst_klass = rsi;
1900      Register rdi_temp      = rdi;
1901      assert(rsi_dst_klass == src_pos, "expected alias w/ src_pos");
1902      assert(rdi_temp      == dst_pos, "expected alias w/ dst_pos");
1903      Address dst_klass_lh_addr(rsi_dst_klass, lh_offset);
1904
1905      // Before looking at dst.length, make sure dst is also an objArray.
1906      __ movptr(rsi_dst_klass, dst_klass_addr);
1907      __ cmpl(dst_klass_lh_addr, objArray_lh);
1908      __ jccb(Assembler::notEqual, L_failed);
1909
1910      // It is safe to examine both src.length and dst.length.
1911      __ movl2ptr(src_pos, SRC_POS);        // reload rsi
1912      arraycopy_range_checks(src, src_pos, dst, dst_pos, LENGTH, L_failed);
1913      // (Now src_pos and dst_pos are killed, but not src and dst.)
1914
1915      // We'll need this temp (don't forget to pop it after the type check).
1916      __ push(rbx);
1917      Register rbx_src_klass = rbx;
1918
1919      __ mov(rbx_src_klass, rcx_src_klass); // spill away from rcx
1920      __ movptr(rsi_dst_klass, dst_klass_addr);
1921      Address super_check_offset_addr(rsi_dst_klass, sco_offset);
1922      Label L_fail_array_check;
1923      generate_type_check(rbx_src_klass,
1924                          super_check_offset_addr, dst_klass_addr,
1925                          rdi_temp, NULL, &L_fail_array_check);
1926      // (On fall-through, we have passed the array type check.)
1927      __ pop(rbx);
1928      __ jmp(L_plain_copy);
1929
1930      __ BIND(L_fail_array_check);
1931      // Reshuffle arguments so we can call checkcast_arraycopy:
1932
1933      // match initial saves for checkcast_arraycopy
1934      // push(rsi);    // already done; see above
1935      // push(rdi);    // already done; see above
1936      // push(rbx);    // already done; see above
1937
1938      // Marshal outgoing arguments now, freeing registers.
          // All offsets are 16-based because rbx is still on the stack here.
1939      Address   from_arg(rsp, 16+ 4);   // from
1940      Address     to_arg(rsp, 16+ 8);   // to
1941      Address length_arg(rsp, 16+12);   // elements count
1942      Address  ckoff_arg(rsp, 16+16);   // super_check_offset
1943      Address  ckval_arg(rsp, 16+20);   // super_klass
1944
          // Incoming values at the rbx-adjusted offsets.  These are the very
          // same stack words as to_arg, ckoff_arg and ckval_arg above, so
          // every incoming value is loaded into a register before the
          // outgoing slot that overlays it is written.
1945      Address SRC_POS_arg(rsp, 16+ 8);
1946      Address DST_POS_arg(rsp, 16+16);
1947      Address  LENGTH_arg(rsp, 16+20);
1948      // push rbx, changed the incoming offsets (why not just use rbp,??)
1949      // assert(SRC_POS_arg.disp() == SRC_POS.disp() + 4, "");
1950
1951      __ movptr(rbx, Address(rsi_dst_klass, ek_offset));
1952      __ movl2ptr(length, LENGTH_arg);    // reload elements count
1953      __ movl2ptr(src_pos, SRC_POS_arg);  // reload src_pos
1954      __ movl2ptr(dst_pos, DST_POS_arg);  // reload dst_pos
1955
1956      __ movptr(ckval_arg, rbx);          // destination element type
1957      __ movl(rbx, Address(rbx, sco_offset));
1958      __ movl(ckoff_arg, rbx);          // corresponding class check offset
1959
1960      __ movl(length_arg, length);      // outgoing length argument
1961
1962      __ lea(from, Address(src, src_pos, Address::times_ptr,
1963                            arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
1964      __ movptr(from_arg, from);
1965
1966      __ lea(to, Address(dst, dst_pos, Address::times_ptr,
1967                          arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
1968      __ movptr(to_arg, to);
1969      __ jump(RuntimeAddress(entry_checkcast_arraycopy));
1970    }
1971
1972    return start;
1973  }
1974
1975  void generate_arraycopy_stubs() {
        // Generates every arraycopy and fill stub and records the entry points
        // in StubRoutines.  The statement order below is also the order in
        // which the stubs are emitted, and 'entry' links each disjoint stub to
        // the conjoint stub generated right after it.

        // Scratch: filled in by each disjoint generator (passed as &entry) and
        // handed by value to the following conjoint generator.
        // NOTE(review): presumably the conjoint stub branches to this entry
        // when the ranges do not overlap -- confirm in generate_conjoint_copy.
1976    address entry;
        // Entry points exported by the conjoint generators; they are consumed
        // by generate_unsafe_copy and generate_generic_copy at the bottom.
1977    address entry_jbyte_arraycopy;
1978    address entry_jshort_arraycopy;
1979    address entry_jint_arraycopy;
1980    address entry_oop_arraycopy;
1981    address entry_jlong_arraycopy;
1982    address entry_checkcast_arraycopy;
1983
        // jbyte: arrayof_ variants pass 'true' as the second argument, plain
        // variants 'false' (NOTE(review): looks like an 'aligned' flag --
        // confirm against generate_disjoint_copy's signature).
1984    StubRoutines::_arrayof_jbyte_disjoint_arraycopy =
1985        generate_disjoint_copy(T_BYTE,  true, Address::times_1, &entry,
1986                               "arrayof_jbyte_disjoint_arraycopy");
1987    StubRoutines::_arrayof_jbyte_arraycopy =
1988        generate_conjoint_copy(T_BYTE,  true, Address::times_1,  entry,
1989                               NULL, "arrayof_jbyte_arraycopy");
1990    StubRoutines::_jbyte_disjoint_arraycopy =
1991        generate_disjoint_copy(T_BYTE, false, Address::times_1, &entry,
1992                               "jbyte_disjoint_arraycopy");
1993    StubRoutines::_jbyte_arraycopy =
1994        generate_conjoint_copy(T_BYTE, false, Address::times_1,  entry,
1995                               &entry_jbyte_arraycopy, "jbyte_arraycopy");
1996
        // jshort: same four-stub pattern as jbyte.
1997    StubRoutines::_arrayof_jshort_disjoint_arraycopy =
1998        generate_disjoint_copy(T_SHORT,  true, Address::times_2, &entry,
1999                               "arrayof_jshort_disjoint_arraycopy");
2000    StubRoutines::_arrayof_jshort_arraycopy =
2001        generate_conjoint_copy(T_SHORT,  true, Address::times_2,  entry,
2002                               NULL, "arrayof_jshort_arraycopy");
2003    StubRoutines::_jshort_disjoint_arraycopy =
2004        generate_disjoint_copy(T_SHORT, false, Address::times_2, &entry,
2005                               "jshort_disjoint_arraycopy");
2006    StubRoutines::_jshort_arraycopy =
2007        generate_conjoint_copy(T_SHORT, false, Address::times_2,  entry,
2008                               &entry_jshort_arraycopy, "jshort_arraycopy");
2009
2010    // Next arrays are always aligned on 4 bytes at least.
        // Only one pair of stubs is generated per type from here on; the
        // arrayof_ entries for jint/oop/jlong are aliased to them further down.
2011    StubRoutines::_jint_disjoint_arraycopy =
2012        generate_disjoint_copy(T_INT, true, Address::times_4, &entry,
2013                               "jint_disjoint_arraycopy");
2014    StubRoutines::_jint_arraycopy =
2015        generate_conjoint_copy(T_INT, true, Address::times_4,  entry,
2016                               &entry_jint_arraycopy, "jint_arraycopy");
2017
2018    StubRoutines::_oop_disjoint_arraycopy =
2019        generate_disjoint_copy(T_OBJECT, true, Address::times_ptr, &entry,
2020                               "oop_disjoint_arraycopy");
2021    StubRoutines::_oop_arraycopy =
2022        generate_conjoint_copy(T_OBJECT, true, Address::times_ptr,  entry,
2023                               &entry_oop_arraycopy, "oop_arraycopy");
2024
        // Variants of the oop stubs generated with dest_uninitialized = true.
2025    StubRoutines::_oop_disjoint_arraycopy_uninit =
2026        generate_disjoint_copy(T_OBJECT, true, Address::times_ptr, &entry,
2027                               "oop_disjoint_arraycopy_uninit",
2028                               /*dest_uninitialized*/true);
2029    StubRoutines::_oop_arraycopy_uninit =
2030        generate_conjoint_copy(T_OBJECT, true, Address::times_ptr,  entry,
2031                               NULL, "oop_arraycopy_uninit",
2032                               /*dest_uninitialized*/true);
2033
        // jlong copies have their own dedicated generators.
2034    StubRoutines::_jlong_disjoint_arraycopy =
2035        generate_disjoint_long_copy(&entry, "jlong_disjoint_arraycopy");
2036    StubRoutines::_jlong_arraycopy =
2037        generate_conjoint_long_copy(entry, &entry_jlong_arraycopy,
2038                                    "jlong_arraycopy");
2039
        // Fill stubs; the bool argument is false for the plain and true for
        // the arrayof_ flavour.
2040    StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2041    StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2042    StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2043    StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2044    StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2045    StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2046
        // arrayof_ aliases for the types that got a single pair of stubs above.
2047    StubRoutines::_arrayof_jint_disjoint_arraycopy       = StubRoutines::_jint_disjoint_arraycopy;
2048    StubRoutines::_arrayof_oop_disjoint_arraycopy        = StubRoutines::_oop_disjoint_arraycopy;
2049    StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = StubRoutines::_oop_disjoint_arraycopy_uninit;
2050    StubRoutines::_arrayof_jlong_disjoint_arraycopy      = StubRoutines::_jlong_disjoint_arraycopy;
2051
2052    StubRoutines::_arrayof_jint_arraycopy       = StubRoutines::_jint_arraycopy;
2053    StubRoutines::_arrayof_oop_arraycopy        = StubRoutines::_oop_arraycopy;
2054    StubRoutines::_arrayof_oop_arraycopy_uninit = StubRoutines::_oop_arraycopy_uninit;
2055    StubRoutines::_arrayof_jlong_arraycopy      = StubRoutines::_jlong_arraycopy;
2056
        // Only the initialized-destination checkcast stub exports its entry.
2057    StubRoutines::_checkcast_arraycopy =
2058        generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2059    StubRoutines::_checkcast_arraycopy_uninit =
2060        generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, /*dest_uninitialized*/true);
2061
        // The dispatching stubs come last because they need the entry points
        // collected above.
2062    StubRoutines::_unsafe_arraycopy =
2063        generate_unsafe_copy("unsafe_arraycopy",
2064                               entry_jbyte_arraycopy,
2065                               entry_jshort_arraycopy,
2066                               entry_jint_arraycopy,
2067                               entry_jlong_arraycopy);
2068
2069    StubRoutines::_generic_arraycopy =
2070        generate_generic_copy("generic_arraycopy",
2071                               entry_jbyte_arraycopy,
2072                               entry_jshort_arraycopy,
2073                               entry_jint_arraycopy,
2074                               entry_oop_arraycopy,
2075                               entry_jlong_arraycopy,
2076                               entry_checkcast_arraycopy);
2077  }
2078
2079  void generate_math_stubs() {
2080    {
2081      StubCodeMark mark(this, "StubRoutines", "log");
2082      StubRoutines::_intrinsic_log = (double (*)(double)) __ pc();
2083
2084      __ fld_d(Address(rsp, 4));
2085      __ flog();
2086      __ ret(0);
2087    }
2088    {
2089      StubCodeMark mark(this, "StubRoutines", "log10");
2090      StubRoutines::_intrinsic_log10 = (double (*)(double)) __ pc();
2091
2092      __ fld_d(Address(rsp, 4));
2093      __ flog10();
2094      __ ret(0);
2095    }
2096    {
2097      StubCodeMark mark(this, "StubRoutines", "sin");
2098      StubRoutines::_intrinsic_sin = (double (*)(double))  __ pc();
2099
2100      __ fld_d(Address(rsp, 4));
2101      __ trigfunc('s');
2102      __ ret(0);
2103    }
2104    {
2105      StubCodeMark mark(this, "StubRoutines", "cos");
2106      StubRoutines::_intrinsic_cos = (double (*)(double)) __ pc();
2107
2108      __ fld_d(Address(rsp, 4));
2109      __ trigfunc('c');
2110      __ ret(0);
2111    }
2112    {
2113      StubCodeMark mark(this, "StubRoutines", "tan");
2114      StubRoutines::_intrinsic_tan = (double (*)(double)) __ pc();
2115
2116      __ fld_d(Address(rsp, 4));
2117      __ trigfunc('t');
2118      __ ret(0);
2119    }
2120    {
2121      StubCodeMark mark(this, "StubRoutines", "exp");
2122      StubRoutines::_intrinsic_exp = (double (*)(double)) __ pc();
2123
2124      __ fld_d(Address(rsp, 4));
2125      __ exp_with_fallback(0);
2126      __ ret(0);
2127    }
2128    {
2129      StubCodeMark mark(this, "StubRoutines", "pow");
2130      StubRoutines::_intrinsic_pow = (double (*)(double,double)) __ pc();
2131
2132      __ fld_d(Address(rsp, 12));
2133      __ fld_d(Address(rsp, 4));
2134      __ pow_with_fallback(0);
2135      __ ret(0);
2136    }
2137  }
2138
2139  // AES intrinsic stubs
2140  enum {AESBlockSize = 16};  // bytes in one AES block (128 bits)
2141
2142  address generate_key_shuffle_mask() {
2143    __ align(16);
2144    StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
2145    address start = __ pc();
2146    __ emit_data(0x00010203, relocInfo::none, 0 );
2147    __ emit_data(0x04050607, relocInfo::none, 0 );
2148    __ emit_data(0x08090a0b, relocInfo::none, 0 );
2149    __ emit_data(0x0c0d0e0f, relocInfo::none, 0 );
2150    return start;
2151  }
2152
2153  // Utility routine for loading a 128-bit key word in little endian format
2154  // can optionally specify that the shuffle mask is already in an xmmregister
2155  void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
2156    __ movdqu(xmmdst, Address(key, offset));
2157    if (xmm_shuf_mask != NULL) {
2158      __ pshufb(xmmdst, xmm_shuf_mask);
2159    } else {
2160      __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2161    }
2162  }
2163
2164  // aesenc using specified key+offset
2165  // can optionally specify that the shuffle mask is already in an xmmregister
2166  void aes_enc_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
2167    load_key(xmmtmp, key, offset, xmm_shuf_mask);
2168    __ aesenc(xmmdst, xmmtmp);
2169  }
2170
2171  // aesdec using specified key+offset
2172  // can optionally specify that the shuffle mask is already in an xmmregister
2173  void aes_dec_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
2174    load_key(xmmtmp, key, offset, xmm_shuf_mask);
2175    __ aesdec(xmmdst, xmmtmp);
2176  }
2177
2178
2179  // Arguments:
2180  //
2181  // Inputs:
2182  //   c_rarg0   - source byte array address
2183  //   c_rarg1   - destination byte array address
2184  //   c_rarg2   - K (key) in little endian int array
2185  //
2186  address generate_aescrypt_encryptBlock() {
        // Emits the stub that encrypts a single 16-byte block and returns the
        // stub's entry address.  The stub reads its three arguments (source,
        // destination, key) from the stack relative to rbp, writes the
        // encrypted block to the destination and returns 0 in rax.
2187    assert(UseAES, "need AES instructions and misaligned SSE support");
2188    __ align(CodeEntryAlignment);
2189    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2190    Label L_doLast;
2191    address start = __ pc();
2192
        // 'from' and 'to' deliberately share rdx: 'to' is only loaded after
        // the input block has been read into xmm_result.
2193    const Register from        = rdx;      // source array address
2194    const Register to          = rdx;      // destination array address
2195    const Register key         = rcx;      // key array address
2196    const Register keylen      = rax;
        // Stack arguments as seen after enter(): first argument at 8(rbp).
2197    const Address  from_param(rbp, 8+0);
2198    const Address  to_param  (rbp, 8+4);
2199    const Address  key_param (rbp, 8+8);
2200
2201    const XMMRegister xmm_result = xmm0;
2202    const XMMRegister xmm_key_shuf_mask = xmm1;
2203    const XMMRegister xmm_temp1  = xmm2;
2204    const XMMRegister xmm_temp2  = xmm3;
2205    const XMMRegister xmm_temp3  = xmm4;
2206    const XMMRegister xmm_temp4  = xmm5;
2207
2208    __ enter();   // required for proper stackwalking of RuntimeStub frame
2209    __ movptr(from, from_param);
2210    __ movptr(key, key_param);
2211
2212    // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
        // The length is read at a negative displacement from 'key'
        // (length offset minus int-array base offset); presumably 'key'
        // points at element 0 of the int[] -- confirm against the caller.
2213    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2214
2215    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2216    __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input
2217    __ movptr(to, to_param);
2218
2219    // For encryption, the java expanded key ordering is just what we need
2220
        // Key word 0x00 is xor-ed in before the first aesenc.
2221    load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
2222    __ pxor(xmm_result, xmm_temp1);
2223
        // Key words are loaded in groups of four ahead of the aesenc
        // instructions that consume them.
2224    load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
2225    load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
2226    load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
2227    load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
2228
2229    __ aesenc(xmm_result, xmm_temp1);
2230    __ aesenc(xmm_result, xmm_temp2);
2231    __ aesenc(xmm_result, xmm_temp3);
2232    __ aesenc(xmm_result, xmm_temp4);
2233
2234    load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
2235    load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
2236    load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
2237    load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
2238
2239    __ aesenc(xmm_result, xmm_temp1);
2240    __ aesenc(xmm_result, xmm_temp2);
2241    __ aesenc(xmm_result, xmm_temp3);
2242    __ aesenc(xmm_result, xmm_temp4);
2243
        // From here on xmm_temp1/xmm_temp2 always hold the next two key
        // words; L_doLast consumes whichever pair is current.
2244    load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
2245    load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
2246
        // keylen == 44: key words 0x90 and 0xa0 are the last two.
2247    __ cmpl(keylen, 44);
2248    __ jccb(Assembler::equal, L_doLast);
2249
2250    __ aesenc(xmm_result, xmm_temp1);
2251    __ aesenc(xmm_result, xmm_temp2);
2252
2253    load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
2254    load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
2255
        // keylen == 52: key words 0xb0 and 0xc0 are the last two.
2256    __ cmpl(keylen, 52);
2257    __ jccb(Assembler::equal, L_doLast);
2258
2259    __ aesenc(xmm_result, xmm_temp1);
2260    __ aesenc(xmm_result, xmm_temp2);
2261
        // Otherwise (keylen 60) key words 0xd0 and 0xe0 finish the block.
2262    load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
2263    load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
2264
2265    __ BIND(L_doLast);
2266    __ aesenc(xmm_result, xmm_temp1);
2267    __ aesenclast(xmm_result, xmm_temp2);
2268    __ movdqu(Address(to, 0), xmm_result);        // store the result
2269    __ xorptr(rax, rax); // return 0
2270    __ leave(); // required for proper stackwalking of RuntimeStub frame
2271    __ ret(0);
2272
2273    return start;
2274  }
2275
2276
2277  // Arguments:
2278  //
2279  // Inputs:
2280  //   c_rarg0   - source byte array address
2281  //   c_rarg1   - destination byte array address
2282  //   c_rarg2   - K (key) in little endian int array
2283  //
2284  address generate_aescrypt_decryptBlock() {
        // Emits the stub that decrypts a single 16-byte block and returns the
        // stub's entry address.  The stub reads its three arguments (source,
        // destination, key) from the stack relative to rbp, writes the
        // decrypted block to the destination and returns 0 in rax.
2285    assert(UseAES, "need AES instructions and misaligned SSE support");
2286    __ align(CodeEntryAlignment);
2287    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2288    Label L_doLast;
2289    address start = __ pc();
2290
        // 'from' and 'to' deliberately share rdx: 'to' is only loaded after
        // the input block has been read into xmm_result.
2291    const Register from        = rdx;      // source array address
2292    const Register to          = rdx;      // destination array address
2293    const Register key         = rcx;      // key array address
2294    const Register keylen      = rax;
        // Stack arguments as seen after enter(): first argument at 8(rbp).
2295    const Address  from_param(rbp, 8+0);
2296    const Address  to_param  (rbp, 8+4);
2297    const Address  key_param (rbp, 8+8);
2298
2299    const XMMRegister xmm_result = xmm0;
2300    const XMMRegister xmm_key_shuf_mask = xmm1;
2301    const XMMRegister xmm_temp1  = xmm2;
2302    const XMMRegister xmm_temp2  = xmm3;
2303    const XMMRegister xmm_temp3  = xmm4;
2304    const XMMRegister xmm_temp4  = xmm5;
2305
2306    __ enter(); // required for proper stackwalking of RuntimeStub frame
2307    __ movptr(from, from_param);
2308    __ movptr(key, key_param);
2309
2310    // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
        // The length is read at a negative displacement from 'key'
        // (length offset minus int-array base offset); presumably 'key'
        // points at element 0 of the int[] -- confirm against the caller.
2311    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2312
2313    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2314    __ movdqu(xmm_result, Address(from, 0));
2315    __ movptr(to, to_param);
2316
2317    // for decryption java expanded key ordering is rotated one position from what we want
2318    // so we start from 0x10 here and hit 0x00 last
2319    // we don't know if the key is aligned, hence not using load-execute form
2320    load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
2321    load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
2322    load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
2323    load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
2324
        // Key word 0x10 is xor-ed in; the following ones go through aesdec.
2325    __ pxor  (xmm_result, xmm_temp1);
2326    __ aesdec(xmm_result, xmm_temp2);
2327    __ aesdec(xmm_result, xmm_temp3);
2328    __ aesdec(xmm_result, xmm_temp4);
2329
2330    load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
2331    load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
2332    load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
2333    load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
2334
2335    __ aesdec(xmm_result, xmm_temp1);
2336    __ aesdec(xmm_result, xmm_temp2);
2337    __ aesdec(xmm_result, xmm_temp3);
2338    __ aesdec(xmm_result, xmm_temp4);
2339
        // xmm_temp3 keeps key word 0x00 for the final aesdeclast, whatever
        // the key length; xmm_temp1/xmm_temp2 hold the next two key words.
2340    load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
2341    load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
2342    load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);
2343
        // keylen == 44: key words 0x90 and 0xa0 are the last aesdec pair.
2344    __ cmpl(keylen, 44);
2345    __ jccb(Assembler::equal, L_doLast);
2346
2347    __ aesdec(xmm_result, xmm_temp1);
2348    __ aesdec(xmm_result, xmm_temp2);
2349
2350    load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
2351    load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
2352
        // keylen == 52: key words 0xb0 and 0xc0 are the last aesdec pair.
2353    __ cmpl(keylen, 52);
2354    __ jccb(Assembler::equal, L_doLast);
2355
2356    __ aesdec(xmm_result, xmm_temp1);
2357    __ aesdec(xmm_result, xmm_temp2);
2358
        // Otherwise (keylen 60) key words 0xd0 and 0xe0 are the last pair.
2359    load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
2360    load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
2361
2362    __ BIND(L_doLast);
2363    __ aesdec(xmm_result, xmm_temp1);
2364    __ aesdec(xmm_result, xmm_temp2);
2365
2366    // for decryption the aesdeclast operation is always on key+0x00
2367    __ aesdeclast(xmm_result, xmm_temp3);
2368    __ movdqu(Address(to, 0), xmm_result);  // store the result
2369    __ xorptr(rax, rax); // return 0
2370    __ leave(); // required for proper stackwalking of RuntimeStub frame
2371    __ ret(0);
2372
2373    return start;
2374  }
2375
2376  void handleSOERegisters(bool saving) {
2377    const int saveFrameSizeInBytes = 4 * wordSize;
2378    const Address saved_rbx     (rbp, -3 * wordSize);
2379    const Address saved_rsi     (rbp, -2 * wordSize);
2380    const Address saved_rdi     (rbp, -1 * wordSize);
2381
2382    if (saving) {
2383      __ subptr(rsp, saveFrameSizeInBytes);
2384      __ movptr(saved_rsi, rsi);
2385      __ movptr(saved_rdi, rdi);
2386      __ movptr(saved_rbx, rbx);
2387    } else {
2388      // restoring
2389      __ movptr(rsi, saved_rsi);
2390      __ movptr(rdi, saved_rdi);
2391      __ movptr(rbx, saved_rbx);
2392    }
2393  }
2394
2395  // Arguments:
2396  //
2397  // Inputs:
2398  //   c_rarg0   - source byte array address
2399  //   c_rarg1   - destination byte array address
2400  //   c_rarg2   - K (key) in little endian int array
2401  //   c_rarg3   - r vector byte array address
2402  //   c_rarg4   - input length
2403  //
2404  // Output:
2405  //   rax       - input length
2406  //
2407  address generate_cipherBlockChaining_encryptAESCrypt() {
2408    assert(UseAES, "need AES instructions and misaligned SSE support");
2409    __ align(CodeEntryAlignment);
2410    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2411    address start = __ pc();
2412
2413    Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
2414    const Register from        = rsi;      // source array address
2415    const Register to          = rdx;      // destination array address
2416    const Register key         = rcx;      // key array address
2417    const Register rvec        = rdi;      // r byte array initialized from initvector array address
2418                                           // and left with the results of the last encryption block
2419    const Register len_reg     = rbx;      // src len (must be multiple of blocksize 16)
2420    const Register pos         = rax;
2421
2422    // xmm register assignments for the loops below
2423    const XMMRegister xmm_result = xmm0;
2424    const XMMRegister xmm_temp   = xmm1;
2425    // first 6 keys preloaded into xmm2-xmm7
2426    const int XMM_REG_NUM_KEY_FIRST = 2;
2427    const int XMM_REG_NUM_KEY_LAST  = 7;
2428    const XMMRegister xmm_key0   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
2429
2430    __ enter(); // required for proper stackwalking of RuntimeStub frame
2431    handleSOERegisters(true /*saving*/);
2432
2433    // load registers from incoming parameters
2434    const Address  from_param(rbp, 8+0);
2435    const Address  to_param  (rbp, 8+4);
2436    const Address  key_param (rbp, 8+8);
2437    const Address  rvec_param (rbp, 8+12);
2438    const Address  len_param  (rbp, 8+16);
2439    __ movptr(from , from_param);
2440    __ movptr(to   , to_param);
2441    __ movptr(key  , key_param);
2442    __ movptr(rvec , rvec_param);
2443    __ movptr(len_reg , len_param);
2444
2445    const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
2446    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2447    // load up xmm regs 2 thru 7 with keys 0-5
2448    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
2449      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
2450      offset += 0x10;
2451    }
2452
2453    __ movdqu(xmm_result, Address(rvec, 0x00));   // initialize xmm_result with r vec
2454
2455    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
2456    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2457    __ cmpl(rax, 44);
2458    __ jcc(Assembler::notEqual, L_key_192_256);
2459
2460    // 128 bit code follows here
2461    __ movl(pos, 0);
2462    __ align(OptoLoopAlignment);
2463    __ BIND(L_loopTop_128);
2464    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
2465    __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector
2466
2467    __ pxor  (xmm_result, xmm_key0);                                // do the aes rounds
2468    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
2469      __ aesenc(xmm_result, as_XMMRegister(rnum));
2470    }
2471    for (int key_offset = 0x60; key_offset <= 0x90; key_offset += 0x10) {
2472      aes_enc_key(xmm_result, xmm_temp, key, key_offset);
2473    }
2474    load_key(xmm_temp, key, 0xa0);
2475    __ aesenclast(xmm_result, xmm_temp);
2476
2477    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
2478    // no need to store r to memory until we exit
2479    __ addptr(pos, AESBlockSize);
2480    __ subptr(len_reg, AESBlockSize);
2481    __ jcc(Assembler::notEqual, L_loopTop_128);
2482
2483    __ BIND(L_exit);
2484    __ movdqu(Address(rvec, 0), xmm_result);     // final value of r stored in rvec of CipherBlockChaining object
2485
2486    handleSOERegisters(false /*restoring*/);
2487    __ movptr(rax, len_param); // return length
2488    __ leave();                                  // required for proper stackwalking of RuntimeStub frame
2489    __ ret(0);
2490
2491    __ BIND(L_key_192_256);
2492    // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
2493    __ cmpl(rax, 52);
2494    __ jcc(Assembler::notEqual, L_key_256);
2495
2496    // 192-bit code follows here (could be changed to use more xmm registers)
2497    __ movl(pos, 0);
2498    __ align(OptoLoopAlignment);
2499    __ BIND(L_loopTop_192);
2500    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
2501    __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector
2502
2503    __ pxor  (xmm_result, xmm_key0);                                // do the aes rounds
2504    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
2505      __ aesenc(xmm_result, as_XMMRegister(rnum));
2506    }
2507    for (int key_offset = 0x60; key_offset <= 0xb0; key_offset += 0x10) {
2508      aes_enc_key(xmm_result, xmm_temp, key, key_offset);
2509    }
2510    load_key(xmm_temp, key, 0xc0);
2511    __ aesenclast(xmm_result, xmm_temp);
2512
2513    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);   // store into the next 16 bytes of output
2514    // no need to store r to memory until we exit
2515    __ addptr(pos, AESBlockSize);
2516    __ subptr(len_reg, AESBlockSize);
2517    __ jcc(Assembler::notEqual, L_loopTop_192);
2518    __ jmp(L_exit);
2519
2520    __ BIND(L_key_256);
2521    // 256-bit code follows here (could be changed to use more xmm registers)
2522    __ movl(pos, 0);
2523    __ align(OptoLoopAlignment);
2524    __ BIND(L_loopTop_256);
2525    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
2526    __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector
2527
2528    __ pxor  (xmm_result, xmm_key0);                                // do the aes rounds
2529    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
2530      __ aesenc(xmm_result, as_XMMRegister(rnum));
2531    }
2532    for (int key_offset = 0x60; key_offset <= 0xd0; key_offset += 0x10) {
2533      aes_enc_key(xmm_result, xmm_temp, key, key_offset);
2534    }
2535    load_key(xmm_temp, key, 0xe0);
2536    __ aesenclast(xmm_result, xmm_temp);
2537
2538    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);   // store into the next 16 bytes of output
2539    // no need to store r to memory until we exit
2540    __ addptr(pos, AESBlockSize);
2541    __ subptr(len_reg, AESBlockSize);
2542    __ jcc(Assembler::notEqual, L_loopTop_256);
2543    __ jmp(L_exit);
2544
2545    return start;
2546  }
2547
2548
2549  // CBC AES Decryption.
2550  // In 32-bit stub, because of lack of registers we do not try to parallelize 4 blocks at a time.
2551  //
2552  // Arguments:
2553  //
2554  // Inputs:
2555  //   c_rarg0   - source byte array address
2556  //   c_rarg1   - destination byte array address
2557  //   c_rarg2   - K (key) in little endian int array
2558  //   c_rarg3   - r vector byte array address
2559  //   c_rarg4   - input length
2560  //
2561  // Output:
2562  //   rax       - input length
2563  //
2564
2565  address generate_cipherBlockChaining_decryptAESCrypt() {
2566    assert(UseAES, "need AES instructions and misaligned SSE support");
2567    __ align(CodeEntryAlignment);
2568    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2569    address start = __ pc();
2570
2571    Label L_exit, L_key_192_256, L_key_256;
2572    Label L_singleBlock_loopTop_128;
2573    Label L_singleBlock_loopTop_192, L_singleBlock_loopTop_256;
2574    const Register from        = rsi;      // source array address
2575    const Register to          = rdx;      // destination array address
2576    const Register key         = rcx;      // key array address
2577    const Register rvec        = rdi;      // r byte array initialized from initvector array address
2578                                           // and left with the results of the last encryption block
2579    const Register len_reg     = rbx;      // src len (must be multiple of blocksize 16)
2580    const Register pos         = rax;
2581
2582    // xmm register assignments for the loops below
2583    const XMMRegister xmm_result = xmm0;
2584    const XMMRegister xmm_temp   = xmm1;
2585    // first 6 keys preloaded into xmm2-xmm7
2586    const int XMM_REG_NUM_KEY_FIRST = 2;
2587    const int XMM_REG_NUM_KEY_LAST  = 7;
2588    const int FIRST_NON_REG_KEY_offset = 0x70;
2589    const XMMRegister xmm_key_first   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
2590
2591    __ enter(); // required for proper stackwalking of RuntimeStub frame
2592    handleSOERegisters(true /*saving*/);
2593
2594    // load registers from incoming parameters
2595    const Address  from_param(rbp, 8+0);
2596    const Address  to_param  (rbp, 8+4);
2597    const Address  key_param (rbp, 8+8);
2598    const Address  rvec_param (rbp, 8+12);
2599    const Address  len_param  (rbp, 8+16);
2600    __ movptr(from , from_param);
2601    __ movptr(to   , to_param);
2602    __ movptr(key  , key_param);
2603    __ movptr(rvec , rvec_param);
2604    __ movptr(len_reg , len_param);
2605
2606    // the java expanded key ordering is rotated one position from what we want
2607    // so we start from 0x10 here and hit 0x00 last
2608    const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
2609    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2610    // load up xmm regs 2 thru 6 with first 5 keys
2611    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
2612      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
2613      offset += 0x10;
2614    }
2615
2616    // inside here, use the rvec register to point to previous block cipher
2617    // with which we xor at the end of each newly decrypted block
2618    const Register  prev_block_cipher_ptr = rvec;
2619
2620    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
2621    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2622    __ cmpl(rax, 44);
2623    __ jcc(Assembler::notEqual, L_key_192_256);
2624
2625
2626    // 128-bit code follows here, parallelized
2627    __ movl(pos, 0);
2628    __ align(OptoLoopAlignment);
2629    __ BIND(L_singleBlock_loopTop_128);
2630    __ cmpptr(len_reg, 0);           // any blocks left??
2631    __ jcc(Assembler::equal, L_exit);
2632    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
2633    __ pxor  (xmm_result, xmm_key_first);                             // do the aes dec rounds
2634    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
2635      __ aesdec(xmm_result, as_XMMRegister(rnum));
2636    }
2637    for (int key_offset = FIRST_NON_REG_KEY_offset; key_offset <= 0xa0; key_offset += 0x10) {   // 128-bit runs up to key offset a0
2638      aes_dec_key(xmm_result, xmm_temp, key, key_offset);
2639    }
2640    load_key(xmm_temp, key, 0x00);                                     // final key is stored in java expanded array at offset 0
2641    __ aesdeclast(xmm_result, xmm_temp);
2642    __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00));
2643    __ pxor  (xmm_result, xmm_temp);                                  // xor with the current r vector
2644    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
2645    // no need to store r to memory until we exit
2646    __ lea(prev_block_cipher_ptr, Address(from, pos, Address::times_1, 0));     // set up new ptr
2647    __ addptr(pos, AESBlockSize);
2648    __ subptr(len_reg, AESBlockSize);
2649    __ jmp(L_singleBlock_loopTop_128);
2650
2651
2652    __ BIND(L_exit);
2653    __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00));
2654    __ movptr(rvec , rvec_param);                                     // restore this since used in loop
2655    __ movdqu(Address(rvec, 0), xmm_temp);                            // final value of r stored in rvec of CipherBlockChaining object
2656    handleSOERegisters(false /*restoring*/);
2657    __ movptr(rax, len_param); // return length
2658    __ leave();                                                       // required for proper stackwalking of RuntimeStub frame
2659    __ ret(0);
2660
2661
2662    __ BIND(L_key_192_256);
2663    // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
2664    __ cmpl(rax, 52);
2665    __ jcc(Assembler::notEqual, L_key_256);
2666
2667    // 192-bit code follows here (could be optimized to use parallelism)
2668    __ movl(pos, 0);
2669    __ align(OptoLoopAlignment);
2670    __ BIND(L_singleBlock_loopTop_192);
2671    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
2672    __ pxor  (xmm_result, xmm_key_first);                             // do the aes dec rounds
2673    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
2674      __ aesdec(xmm_result, as_XMMRegister(rnum));
2675    }
2676    for (int key_offset = FIRST_NON_REG_KEY_offset; key_offset <= 0xc0; key_offset += 0x10) {   // 192-bit runs up to key offset c0
2677      aes_dec_key(xmm_result, xmm_temp, key, key_offset);
2678    }
2679    load_key(xmm_temp, key, 0x00);                                     // final key is stored in java expanded array at offset 0
2680    __ aesdeclast(xmm_result, xmm_temp);
2681    __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00));
2682    __ pxor  (xmm_result, xmm_temp);                                  // xor with the current r vector
2683    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
2684    // no need to store r to memory until we exit
2685    __ lea(prev_block_cipher_ptr, Address(from, pos, Address::times_1, 0));     // set up new ptr
2686    __ addptr(pos, AESBlockSize);
2687    __ subptr(len_reg, AESBlockSize);
2688    __ jcc(Assembler::notEqual,L_singleBlock_loopTop_192);
2689    __ jmp(L_exit);
2690
2691    __ BIND(L_key_256);
2692    // 256-bit code follows here (could be optimized to use parallelism)
2693    __ movl(pos, 0);
2694    __ align(OptoLoopAlignment);
2695    __ BIND(L_singleBlock_loopTop_256);
2696    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
2697    __ pxor  (xmm_result, xmm_key_first);                             // do the aes dec rounds
2698    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
2699      __ aesdec(xmm_result, as_XMMRegister(rnum));
2700    }
2701    for (int key_offset = FIRST_NON_REG_KEY_offset; key_offset <= 0xe0; key_offset += 0x10) {   // 256-bit runs up to key offset e0
2702      aes_dec_key(xmm_result, xmm_temp, key, key_offset);
2703    }
2704    load_key(xmm_temp, key, 0x00);                                     // final key is stored in java expanded array at offset 0
2705    __ aesdeclast(xmm_result, xmm_temp);
2706    __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00));
2707    __ pxor  (xmm_result, xmm_temp);                                  // xor with the current r vector
2708    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
2709    // no need to store r to memory until we exit
2710    __ lea(prev_block_cipher_ptr, Address(from, pos, Address::times_1, 0));     // set up new ptr
2711    __ addptr(pos, AESBlockSize);
2712    __ subptr(len_reg, AESBlockSize);
2713    __ jcc(Assembler::notEqual,L_singleBlock_loopTop_256);
2714    __ jmp(L_exit);
2715
2716    return start;
2717  }
2718
2719  /**
2720   *  Arguments:
2721   *
2722   * Inputs:
2723   *   rsp(4)   - int crc
2724   *   rsp(8)   - byte* buf
2725   *   rsp(12)  - int length
2726   *
2727   * Ouput:
2728   *       rax   - int crc result
2729   */
2730  address generate_updateBytesCRC32() {
2731    assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");
2732
2733    __ align(CodeEntryAlignment);
2734    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
2735
2736    address start = __ pc();
2737
2738    const Register crc   = rdx;  // crc
2739    const Register buf   = rsi;  // source java byte array address
2740    const Register len   = rcx;  // length
2741    const Register table = rdi;  // crc_table address (reuse register)
2742    const Register tmp   = rbx;
2743    assert_different_registers(crc, buf, len, table, tmp, rax);
2744
2745    BLOCK_COMMENT("Entry:");
2746    __ enter(); // required for proper stackwalking of RuntimeStub frame
2747    __ push(rsi);
2748    __ push(rdi);
2749    __ push(rbx);
2750
2751    Address crc_arg(rbp, 8 + 0);
2752    Address buf_arg(rbp, 8 + 4);
2753    Address len_arg(rbp, 8 + 8);
2754
2755    // Load up:
2756    __ movl(crc,   crc_arg);
2757    __ movptr(buf, buf_arg);
2758    __ movl(len,   len_arg);
2759
2760    __ kernel_crc32(crc, buf, len, table, tmp);
2761
2762    __ movl(rax, crc);
2763    __ pop(rbx);
2764    __ pop(rdi);
2765    __ pop(rsi);
2766    __ leave(); // required for proper stackwalking of RuntimeStub frame
2767    __ ret(0);
2768
2769    return start;
2770  }
2771
2772  // Safefetch stubs.
2773  void generate_safefetch(const char* name, int size, address* entry,
2774                          address* fault_pc, address* continuation_pc) {
2775    // safefetch signatures:
2776    //   int      SafeFetch32(int*      adr, int      errValue);
2777    //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
2778
2779    StubCodeMark mark(this, "StubRoutines", name);
2780
2781    // Entry point, pc or function descriptor.
2782    *entry = __ pc();
2783
2784    __ movl(rax, Address(rsp, 0x8));
2785    __ movl(rcx, Address(rsp, 0x4));
2786    // Load *adr into eax, may fault.
2787    *fault_pc = __ pc();
2788    switch (size) {
2789      case 4:
2790        // int32_t
2791        __ movl(rax, Address(rcx, 0));
2792        break;
2793      case 8:
2794        // int64_t
2795        Unimplemented();
2796        break;
2797      default:
2798        ShouldNotReachHere();
2799    }
2800
2801    // Return errValue or *adr.
2802    *continuation_pc = __ pc();
2803    __ ret(0);
2804  }
2805
2806 public:
2807  // Information about frame layout at time of blocking runtime call.
2808  // Note that we only have to preserve callee-saved registers since
2809  // the compilers are responsible for supplying a continuation point
2810  // if they expect all registers to be preserved.
2811  enum layout {
2812    thread_off,    // last_java_sp
2813    arg1_off,
2814    arg2_off,
2815    rbp_off,       // callee saved register
2816    ret_pc,
2817    framesize
2818  };
2819
2820 private:
2821
2822#undef  __
2823#define __ masm->
2824
2825  //------------------------------------------------------------------------------------------------------------------------
2826  // Continuation point for throwing of implicit exceptions that are not handled in
2827  // the current activation. Fabricates an exception oop and initiates normal
2828  // exception dispatching in this frame.
2829  //
2830  // Previously the compiler (c2) allowed for callee save registers on Java calls.
2831  // This is no longer true after adapter frames were removed but could possibly
2832  // be brought back in the future if the interpreter code was reworked and it
2833  // was deemed worthwhile. The comment below was left to describe what must
2834  // happen here if callee saves were resurrected. As it stands now this stub
2835  // could actually be a vanilla BufferBlob and have now oopMap at all.
2836  // Since it doesn't make much difference we've chosen to leave it the
2837  // way it was in the callee save days and keep the comment.
2838
2839  // If we need to preserve callee-saved values we need a callee-saved oop map and
2840  // therefore have to make these stubs into RuntimeStubs rather than BufferBlobs.
2841  // If the compiler needs all registers to be preserved between the fault
2842  // point and the exception handler then it must assume responsibility for that in
2843  // AbstractCompiler::continuation_for_implicit_null_exception or
2844  // continuation_for_implicit_division_by_zero_exception. All other implicit
2845  // exceptions (e.g., NullPointerException or AbstractMethodError on entry) are
2846  // either at call sites or otherwise assume that stack unwinding will be initiated,
2847  // so caller saved registers were assumed volatile in the compiler.
2848  address generate_throw_exception(const char* name, address runtime_entry,
2849                                   Register arg1 = noreg, Register arg2 = noreg) {
2850
2851    int insts_size = 256;
2852    int locs_size  = 32;
2853
2854    CodeBuffer code(name, insts_size, locs_size);
2855    OopMapSet* oop_maps  = new OopMapSet();
2856    MacroAssembler* masm = new MacroAssembler(&code);
2857
2858    address start = __ pc();
2859
2860    // This is an inlined and slightly modified version of call_VM
2861    // which has the ability to fetch the return PC out of
2862    // thread-local storage and also sets up last_Java_sp slightly
2863    // differently than the real call_VM
2864    Register java_thread = rbx;
2865    __ get_thread(java_thread);
2866
2867    __ enter(); // required for proper stackwalking of RuntimeStub frame
2868
2869    // pc and rbp, already pushed
2870    __ subptr(rsp, (framesize-2) * wordSize); // prolog
2871
2872    // Frame is now completed as far as size and linkage.
2873
2874    int frame_complete = __ pc() - start;
2875
2876    // push java thread (becomes first argument of C function)
2877    __ movptr(Address(rsp, thread_off * wordSize), java_thread);
2878    if (arg1 != noreg) {
2879      __ movptr(Address(rsp, arg1_off * wordSize), arg1);
2880    }
2881    if (arg2 != noreg) {
2882      assert(arg1 != noreg, "missing reg arg");
2883      __ movptr(Address(rsp, arg2_off * wordSize), arg2);
2884    }
2885
2886    // Set up last_Java_sp and last_Java_fp
2887    __ set_last_Java_frame(java_thread, rsp, rbp, NULL);
2888
2889    // Call runtime
2890    BLOCK_COMMENT("call runtime_entry");
2891    __ call(RuntimeAddress(runtime_entry));
2892    // Generate oop map
2893    OopMap* map =  new OopMap(framesize, 0);
2894    oop_maps->add_gc_map(__ pc() - start, map);
2895
2896    // restore the thread (cannot use the pushed argument since arguments
2897    // may be overwritten by C code generated by an optimizing compiler);
2898    // however can use the register value directly if it is callee saved.
2899    __ get_thread(java_thread);
2900
2901    __ reset_last_Java_frame(java_thread, true, false);
2902
2903    __ leave(); // required for proper stackwalking of RuntimeStub frame
2904
2905    // check for pending exceptions
2906#ifdef ASSERT
2907    Label L;
2908    __ cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
2909    __ jcc(Assembler::notEqual, L);
2910    __ should_not_reach_here();
2911    __ bind(L);
2912#endif /* ASSERT */
2913    __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2914
2915
2916    RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, framesize, oop_maps, false);
2917    return stub->entry_point();
2918  }
2919
2920
2921  void create_control_words() {
2922    // Round to nearest, 53-bit mode, exceptions masked
2923    StubRoutines::_fpu_cntrl_wrd_std   = 0x027F;
2924    // Round to zero, 53-bit mode, exception mased
2925    StubRoutines::_fpu_cntrl_wrd_trunc = 0x0D7F;
2926    // Round to nearest, 24-bit mode, exceptions masked
2927    StubRoutines::_fpu_cntrl_wrd_24    = 0x007F;
2928    // Round to nearest, 64-bit mode, exceptions masked
2929    StubRoutines::_fpu_cntrl_wrd_64    = 0x037F;
2930    // Round to nearest, 64-bit mode, exceptions masked
2931    StubRoutines::_mxcsr_std           = 0x1F80;
2932    // Note: the following two constants are 80-bit values
2933    //       layout is critical for correct loading by FPU.
2934    // Bias for strict fp multiply/divide
2935    StubRoutines::_fpu_subnormal_bias1[0]= 0x00000000; // 2^(-15360) == 0x03ff 8000 0000 0000 0000
2936    StubRoutines::_fpu_subnormal_bias1[1]= 0x80000000;
2937    StubRoutines::_fpu_subnormal_bias1[2]= 0x03ff;
2938    // Un-Bias for strict fp multiply/divide
2939    StubRoutines::_fpu_subnormal_bias2[0]= 0x00000000; // 2^(+15360) == 0x7bff 8000 0000 0000 0000
2940    StubRoutines::_fpu_subnormal_bias2[1]= 0x80000000;
2941    StubRoutines::_fpu_subnormal_bias2[2]= 0x7bff;
2942  }
2943
2944  //---------------------------------------------------------------------------
2945  // Initialization
2946
2947  void generate_initial() {
2948    // Generates all stubs and initializes the entry points
2949
2950    //------------------------------------------------------------------------------------------------------------------------
2951    // entry points that exist in all platforms
2952    // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller than
2953    //       the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp.
2954    StubRoutines::_forward_exception_entry      = generate_forward_exception();
2955
2956    StubRoutines::_call_stub_entry              =
2957      generate_call_stub(StubRoutines::_call_stub_return_address);
2958    // is referenced by megamorphic call
2959    StubRoutines::_catch_exception_entry        = generate_catch_exception();
2960
2961    // These are currently used by Solaris/Intel
2962    StubRoutines::_atomic_xchg_entry            = generate_atomic_xchg();
2963
2964    StubRoutines::_handler_for_unsafe_access_entry =
2965      generate_handler_for_unsafe_access();
2966
2967    // platform dependent
2968    create_control_words();
2969
2970    StubRoutines::x86::_verify_mxcsr_entry                 = generate_verify_mxcsr();
2971    StubRoutines::x86::_verify_fpu_cntrl_wrd_entry         = generate_verify_fpu_cntrl_wrd();
2972    StubRoutines::_d2i_wrapper                              = generate_d2i_wrapper(T_INT,
2973                                                                                   CAST_FROM_FN_PTR(address, SharedRuntime::d2i));
2974    StubRoutines::_d2l_wrapper                              = generate_d2i_wrapper(T_LONG,
2975                                                                                   CAST_FROM_FN_PTR(address, SharedRuntime::d2l));
2976
2977    // Build this early so it's available for the interpreter
2978    StubRoutines::_throw_StackOverflowError_entry          = generate_throw_exception("StackOverflowError throw_exception",           CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
2979
2980    if (UseCRC32Intrinsics) {
2981      // set table address before stub generation which use it
2982      StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
2983      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
2984    }
2985  }
2986
2987
2988  void generate_all() {
2989    // Generates all stubs and initializes the entry points
2990
2991    // These entry points require SharedInfo::stack0 to be set up in non-core builds
2992    // and need to be relocatable, so they each fabricate a RuntimeStub internally.
2993    StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError));
2994    StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError));
2995    StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call));
2996
2997    //------------------------------------------------------------------------------------------------------------------------
2998    // entry points that are platform specific
2999
3000    // support for verify_oop (must happen after universe_init)
3001    StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
3002
3003    // arraycopy stubs used by compilers
3004    generate_arraycopy_stubs();
3005
3006    generate_math_stubs();
3007
3008    // don't bother generating these AES intrinsic stubs unless global flag is set
3009    if (UseAESIntrinsics) {
3010      StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // might be needed by the others
3011
3012      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
3013      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
3014      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
3015      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
3016    }
3017
3018    // Safefetch stubs.
3019    generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
3020                                                   &StubRoutines::_safefetch32_fault_pc,
3021                                                   &StubRoutines::_safefetch32_continuation_pc);
3022    StubRoutines::_safefetchN_entry           = StubRoutines::_safefetch32_entry;
3023    StubRoutines::_safefetchN_fault_pc        = StubRoutines::_safefetch32_fault_pc;
3024    StubRoutines::_safefetchN_continuation_pc = StubRoutines::_safefetch32_continuation_pc;
3025  }
3026
3027
3028 public:
3029  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
3030    if (all) {
3031      generate_all();
3032    } else {
3033      generate_initial();
3034    }
3035  }
3036}; // end class declaration
3037
3038
3039void StubGenerator_generate(CodeBuffer* code, bool all) {
3040  StubGenerator g(code, all);
3041}
3042