stubGenerator_x86_32.cpp revision 579:0fbdb4381b99
1/*
2 * Copyright 1999-2009 Sun Microsystems, Inc.  All Rights Reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
20 * CA 95054 USA or visit www.sun.com if you need additional information or
21 * have any questions.
22 *
23 */
24
25#include "incls/_precompiled.incl"
26#include "incls/_stubGenerator_x86_32.cpp.incl"
27
28// Declaration and definition of StubGenerator (no .hpp file).
29// For a more detailed description of the stub routine structure
30// see the comment in stubRoutines.hpp
31
// `__` emits through the current macro assembler; `a__` emits through the
// same object viewed as a plain Assembler.
#define __ _masm->
#define a__ ((Assembler*)_masm)->

// BLOCK_COMMENT forwards its text to block_comment() in non-product builds
// and expands to nothing in product builds.
#ifndef PRODUCT
#define BLOCK_COMMENT(str) __ block_comment(str)
#else
#define BLOCK_COMMENT(str) /* nothing */
#endif

// Binds a label and then records "<label>:" as a block comment.
// Note: this expands to two statements and is meant to be used as
// `__ BIND(label);`.
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

const int MXCSR_MASK         = 0xFFC0;  // Mask out any pending exceptions
const int FPU_CNTRL_WRD_MASK = 0xFFFF;  // Keeps the low 16 bits of the value read back
45
46// -------------------------------------------------------------------------------------------------------------------------
47// Stub Code definitions
48
49static address handle_unsafe_access() {
50  JavaThread* thread = JavaThread::current();
51  address pc  = thread->saved_exception_pc();
52  // pc is the instruction which we must emulate
53  // doing a no-op is fine:  return garbage from the load
54  // therefore, compute npc
55  address npc = Assembler::locate_next_instruction(pc);
56
57  // request an async exception
58  thread->set_pending_unsafe_access_error();
59
60  // return address of next instruction to execute
61  return npc;
62}
63
64class StubGenerator: public StubCodeGenerator {
65 private:
66
67#ifdef PRODUCT
68#define inc_counter_np(counter) (0)
69#else
70  void inc_counter_np_(int& counter) {
71    __ incrementl(ExternalAddress((address)&counter));
72  }
73#define inc_counter_np(counter) \
74  BLOCK_COMMENT("inc_counter " #counter); \
75  inc_counter_np_(counter);
76#endif //PRODUCT
77
78  void inc_copy_counter_np(BasicType t) {
79#ifndef PRODUCT
80    switch (t) {
81    case T_BYTE:    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); return;
82    case T_SHORT:   inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); return;
83    case T_INT:     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); return;
84    case T_LONG:    inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); return;
85    case T_OBJECT:  inc_counter_np(SharedRuntime::_oop_array_copy_ctr); return;
86    }
87    ShouldNotReachHere();
88#endif //PRODUCT
89  }
90
91  //------------------------------------------------------------------------------------------------------------------------
92  // Call stubs are used to call Java from C
93  //
94  //    [ return_from_Java     ] <--- rsp
95  //    [ argument word n      ]
96  //      ...
97  // -N [ argument word 1      ]
98  // -7 [ Possible padding for stack alignment ]
99  // -6 [ Possible padding for stack alignment ]
100  // -5 [ Possible padding for stack alignment ]
  // -4 [ mxcsr save           ] <--- rsp_after_call
  // -3 [ saved rbx            ]
  // -2 [ saved rsi            ]
  // -1 [ saved rdi            ]
  //  0 [ saved rbp            ] <--- rbp
106  //  1 [ return address       ]
107  //  2 [ ptr. to call wrapper ]
108  //  3 [ result               ]
109  //  4 [ result_type          ]
110  //  5 [ method               ]
111  //  6 [ entry_point          ]
112  //  7 [ parameters           ]
113  //  8 [ parameter_size       ]
114  //  9 [ thread               ]
115
116
117  address generate_call_stub(address& return_address) {
118    StubCodeMark mark(this, "StubRoutines", "call_stub");
119    address start = __ pc();
120
121    // stub code parameters / addresses
122    assert(frame::entry_frame_call_wrapper_offset == 2, "adjust this code");
123    bool  sse_save = false;
124    const Address rsp_after_call(rbp, -4 * wordSize); // same as in generate_catch_exception()!
125    const int     locals_count_in_bytes  (4*wordSize);
126    const Address mxcsr_save    (rbp, -4 * wordSize);
127    const Address saved_rbx     (rbp, -3 * wordSize);
128    const Address saved_rsi     (rbp, -2 * wordSize);
129    const Address saved_rdi     (rbp, -1 * wordSize);
130    const Address result        (rbp,  3 * wordSize);
131    const Address result_type   (rbp,  4 * wordSize);
132    const Address method        (rbp,  5 * wordSize);
133    const Address entry_point   (rbp,  6 * wordSize);
134    const Address parameters    (rbp,  7 * wordSize);
135    const Address parameter_size(rbp,  8 * wordSize);
136    const Address thread        (rbp,  9 * wordSize); // same as in generate_catch_exception()!
137    sse_save =  UseSSE > 0;
138
139    // stub code
140    __ enter();
141    __ movptr(rcx, parameter_size);              // parameter counter
142    __ shlptr(rcx, Interpreter::logStackElementSize()); // convert parameter count to bytes
143    __ addptr(rcx, locals_count_in_bytes);       // reserve space for register saves
144    __ subptr(rsp, rcx);
145    __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
146
147    // save rdi, rsi, & rbx, according to C calling conventions
148    __ movptr(saved_rdi, rdi);
149    __ movptr(saved_rsi, rsi);
150    __ movptr(saved_rbx, rbx);
151    // save and initialize %mxcsr
152    if (sse_save) {
153      Label skip_ldmx;
154      __ stmxcsr(mxcsr_save);
155      __ movl(rax, mxcsr_save);
156      __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
157      ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
158      __ cmp32(rax, mxcsr_std);
159      __ jcc(Assembler::equal, skip_ldmx);
160      __ ldmxcsr(mxcsr_std);
161      __ bind(skip_ldmx);
162    }
163
164    // make sure the control word is correct.
165    __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
166
167#ifdef ASSERT
168    // make sure we have no pending exceptions
169    { Label L;
170      __ movptr(rcx, thread);
171      __ cmpptr(Address(rcx, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
172      __ jcc(Assembler::equal, L);
173      __ stop("StubRoutines::call_stub: entered with pending exception");
174      __ bind(L);
175    }
176#endif
177
178    // pass parameters if any
179    BLOCK_COMMENT("pass parameters if any");
180    Label parameters_done;
181    __ movl(rcx, parameter_size);  // parameter counter
182    __ testl(rcx, rcx);
183    __ jcc(Assembler::zero, parameters_done);
184
185    // parameter passing loop
186
187    Label loop;
188    // Copy Java parameters in reverse order (receiver last)
189    // Note that the argument order is inverted in the process
190    // source is rdx[rcx: N-1..0]
191    // dest   is rsp[rbx: 0..N-1]
192
193    __ movptr(rdx, parameters);          // parameter pointer
194    __ xorptr(rbx, rbx);
195
196    __ BIND(loop);
197    if (TaggedStackInterpreter) {
198      __ movptr(rax, Address(rdx, rcx, Interpreter::stackElementScale(),
199                      -2*wordSize));                          // get tag
200      __ movptr(Address(rsp, rbx, Interpreter::stackElementScale(),
201                      Interpreter::expr_tag_offset_in_bytes(0)), rax);     // store tag
202    }
203
204    // get parameter
205    __ movptr(rax, Address(rdx, rcx, Interpreter::stackElementScale(), -wordSize));
206    __ movptr(Address(rsp, rbx, Interpreter::stackElementScale(),
207                    Interpreter::expr_offset_in_bytes(0)), rax);          // store parameter
208    __ increment(rbx);
209    __ decrement(rcx);
210    __ jcc(Assembler::notZero, loop);
211
212    // call Java function
213    __ BIND(parameters_done);
214    __ movptr(rbx, method);           // get methodOop
215    __ movptr(rax, entry_point);      // get entry_point
216    __ mov(rsi, rsp);                 // set sender sp
217    BLOCK_COMMENT("call Java function");
218    __ call(rax);
219
220    BLOCK_COMMENT("call_stub_return_address:");
221    return_address = __ pc();
222
223    Label common_return;
224
225    __ BIND(common_return);
226
227    // store result depending on type
228    // (everything that is not T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
229    __ movptr(rdi, result);
230    Label is_long, is_float, is_double, exit;
231    __ movl(rsi, result_type);
232    __ cmpl(rsi, T_LONG);
233    __ jcc(Assembler::equal, is_long);
234    __ cmpl(rsi, T_FLOAT);
235    __ jcc(Assembler::equal, is_float);
236    __ cmpl(rsi, T_DOUBLE);
237    __ jcc(Assembler::equal, is_double);
238
239    // handle T_INT case
240    __ movl(Address(rdi, 0), rax);
241    __ BIND(exit);
242
243    // check that FPU stack is empty
244    __ verify_FPU(0, "generate_call_stub");
245
246    // pop parameters
247    __ lea(rsp, rsp_after_call);
248
249    // restore %mxcsr
250    if (sse_save) {
251      __ ldmxcsr(mxcsr_save);
252    }
253
254    // restore rdi, rsi and rbx,
255    __ movptr(rbx, saved_rbx);
256    __ movptr(rsi, saved_rsi);
257    __ movptr(rdi, saved_rdi);
258    __ addptr(rsp, 4*wordSize);
259
260    // return
261    __ pop(rbp);
262    __ ret(0);
263
264    // handle return types different from T_INT
265    __ BIND(is_long);
266    __ movl(Address(rdi, 0 * wordSize), rax);
267    __ movl(Address(rdi, 1 * wordSize), rdx);
268    __ jmp(exit);
269
270    __ BIND(is_float);
271    // interpreter uses xmm0 for return values
272    if (UseSSE >= 1) {
273      __ movflt(Address(rdi, 0), xmm0);
274    } else {
275      __ fstp_s(Address(rdi, 0));
276    }
277    __ jmp(exit);
278
279    __ BIND(is_double);
280    // interpreter uses xmm0 for return values
281    if (UseSSE >= 2) {
282      __ movdbl(Address(rdi, 0), xmm0);
283    } else {
284      __ fstp_d(Address(rdi, 0));
285    }
286    __ jmp(exit);
287
288    // If we call compiled code directly from the call stub we will
289    // need to adjust the return back to the call stub to a specialized
290    // piece of code that can handle compiled results and cleaning the fpu
291    // stack. compiled code will be set to return here instead of the
292    // return above that handles interpreter returns.
293
294    BLOCK_COMMENT("call_stub_compiled_return:");
295    StubRoutines::x86::set_call_stub_compiled_return( __ pc());
296
297#ifdef COMPILER2
298    if (UseSSE >= 2) {
299      __ verify_FPU(0, "call_stub_compiled_return");
300    } else {
301      for (int i = 1; i < 8; i++) {
302        __ ffree(i);
303      }
304
305      // UseSSE <= 1 so double result should be left on TOS
306      __ movl(rsi, result_type);
307      __ cmpl(rsi, T_DOUBLE);
308      __ jcc(Assembler::equal, common_return);
309      if (UseSSE == 0) {
310        // UseSSE == 0 so float result should be left on TOS
311        __ cmpl(rsi, T_FLOAT);
312        __ jcc(Assembler::equal, common_return);
313      }
314      __ ffree(0);
315    }
316#endif /* COMPILER2 */
317    __ jmp(common_return);
318
319    return start;
320  }
321
322
323  //------------------------------------------------------------------------------------------------------------------------
324  // Return point for a Java call if there's an exception thrown in Java code.
325  // The exception is caught and transformed into a pending exception stored in
326  // JavaThread that can be tested from within the VM.
327  //
328  // Note: Usually the parameters are removed by the callee. In case of an exception
329  //       crossing an activation frame boundary, that is not the case if the callee
330  //       is compiled code => need to setup the rsp.
331  //
  // rax: exception oop
333
334  address generate_catch_exception() {
335    StubCodeMark mark(this, "StubRoutines", "catch_exception");
336    const Address rsp_after_call(rbp, -4 * wordSize); // same as in generate_call_stub()!
337    const Address thread        (rbp,  9 * wordSize); // same as in generate_call_stub()!
338    address start = __ pc();
339
340    // get thread directly
341    __ movptr(rcx, thread);
342#ifdef ASSERT
343    // verify that threads correspond
344    { Label L;
345      __ get_thread(rbx);
346      __ cmpptr(rbx, rcx);
347      __ jcc(Assembler::equal, L);
348      __ stop("StubRoutines::catch_exception: threads must correspond");
349      __ bind(L);
350    }
351#endif
352    // set pending exception
353    __ verify_oop(rax);
354    __ movptr(Address(rcx, Thread::pending_exception_offset()), rax          );
355    __ lea(Address(rcx, Thread::exception_file_offset   ()),
356           ExternalAddress((address)__FILE__));
357    __ movl(Address(rcx, Thread::exception_line_offset   ()), __LINE__ );
358    // complete return to VM
359    assert(StubRoutines::_call_stub_return_address != NULL, "_call_stub_return_address must have been generated before");
360    __ jump(RuntimeAddress(StubRoutines::_call_stub_return_address));
361
362    return start;
363  }
364
365
366  //------------------------------------------------------------------------------------------------------------------------
367  // Continuation point for runtime calls returning with a pending exception.
368  // The pending exception check happened in the runtime or native call stub.
369  // The pending exception in Thread is converted into a Java-level exception.
370  //
371  // Contract with Java-level exception handlers:
  // rax: exception
  // rdx: throwing pc
374  //
375  // NOTE: At entry of this stub, exception-pc must be on stack !!
376
377  address generate_forward_exception() {
378    StubCodeMark mark(this, "StubRoutines", "forward exception");
379    address start = __ pc();
380
381    // Upon entry, the sp points to the return address returning into Java
382    // (interpreted or compiled) code; i.e., the return address becomes the
383    // throwing pc.
384    //
385    // Arguments pushed before the runtime call are still on the stack but
386    // the exception handler will reset the stack pointer -> ignore them.
387    // A potential result in registers can be ignored as well.
388
389#ifdef ASSERT
390    // make sure this code is only executed if there is a pending exception
391    { Label L;
392      __ get_thread(rcx);
393      __ cmpptr(Address(rcx, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
394      __ jcc(Assembler::notEqual, L);
395      __ stop("StubRoutines::forward exception: no pending exception (1)");
396      __ bind(L);
397    }
398#endif
399
400    // compute exception handler into rbx,
401    __ movptr(rax, Address(rsp, 0));
402    BLOCK_COMMENT("call exception_handler_for_return_address");
403    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rax);
404    __ mov(rbx, rax);
405
406    // setup rax, & rdx, remove return address & clear pending exception
407    __ get_thread(rcx);
408    __ pop(rdx);
409    __ movptr(rax, Address(rcx, Thread::pending_exception_offset()));
410    __ movptr(Address(rcx, Thread::pending_exception_offset()), NULL_WORD);
411
412#ifdef ASSERT
413    // make sure exception is set
414    { Label L;
415      __ testptr(rax, rax);
416      __ jcc(Assembler::notEqual, L);
417      __ stop("StubRoutines::forward exception: no pending exception (2)");
418      __ bind(L);
419    }
420#endif
421
422    // continue at exception handler (return address removed)
423    // rax,: exception
424    // rbx,: exception handler
425    // rdx: throwing pc
426    __ verify_oop(rax);
427    __ jmp(rbx);
428
429    return start;
430  }
431
432
433  //----------------------------------------------------------------------------------------------------
434  // Support for jint Atomic::xchg(jint exchange_value, volatile jint* dest)
435  //
436  // xchg exists as far back as 8086, lock needed for MP only
437  // Stack layout immediately after call:
438  //
439  // 0 [ret addr ] <--- rsp
440  // 1 [  ex     ]
441  // 2 [  dest   ]
442  //
443  // Result:   *dest <- ex, return (old *dest)
444  //
445  // Note: win32 does not currently use this code
446
447  address generate_atomic_xchg() {
448    StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
449    address start = __ pc();
450
451    __ push(rdx);
452    Address exchange(rsp, 2 * wordSize);
453    Address dest_addr(rsp, 3 * wordSize);
454    __ movl(rax, exchange);
455    __ movptr(rdx, dest_addr);
456    __ xchgl(rax, Address(rdx, 0));
457    __ pop(rdx);
458    __ ret(0);
459
460    return start;
461  }
462
463  //----------------------------------------------------------------------------------------------------
464  // Support for void verify_mxcsr()
465  //
466  // This routine is used with -Xcheck:jni to verify that native
467  // JNI code does not return to Java code without restoring the
468  // MXCSR register to our expected state.
469
470
471  address generate_verify_mxcsr() {
472    StubCodeMark mark(this, "StubRoutines", "verify_mxcsr");
473    address start = __ pc();
474
475    const Address mxcsr_save(rsp, 0);
476
477    if (CheckJNICalls && UseSSE > 0 ) {
478      Label ok_ret;
479      ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
480      __ push(rax);
481      __ subptr(rsp, wordSize);      // allocate a temp location
482      __ stmxcsr(mxcsr_save);
483      __ movl(rax, mxcsr_save);
484      __ andl(rax, MXCSR_MASK);
485      __ cmp32(rax, mxcsr_std);
486      __ jcc(Assembler::equal, ok_ret);
487
488      __ warn("MXCSR changed by native JNI code.");
489
490      __ ldmxcsr(mxcsr_std);
491
492      __ bind(ok_ret);
493      __ addptr(rsp, wordSize);
494      __ pop(rax);
495    }
496
497    __ ret(0);
498
499    return start;
500  }
501
502
503  //---------------------------------------------------------------------------
504  // Support for void verify_fpu_cntrl_wrd()
505  //
506  // This routine is used with -Xcheck:jni to verify that native
507  // JNI code does not return to Java code without restoring the
508  // FP control word to our expected state.
509
510  address generate_verify_fpu_cntrl_wrd() {
511    StubCodeMark mark(this, "StubRoutines", "verify_spcw");
512    address start = __ pc();
513
514    const Address fpu_cntrl_wrd_save(rsp, 0);
515
516    if (CheckJNICalls) {
517      Label ok_ret;
518      __ push(rax);
519      __ subptr(rsp, wordSize);      // allocate a temp location
520      __ fnstcw(fpu_cntrl_wrd_save);
521      __ movl(rax, fpu_cntrl_wrd_save);
522      __ andl(rax, FPU_CNTRL_WRD_MASK);
523      ExternalAddress fpu_std(StubRoutines::addr_fpu_cntrl_wrd_std());
524      __ cmp32(rax, fpu_std);
525      __ jcc(Assembler::equal, ok_ret);
526
527      __ warn("Floating point control word changed by native JNI code.");
528
529      __ fldcw(fpu_std);
530
531      __ bind(ok_ret);
532      __ addptr(rsp, wordSize);
533      __ pop(rax);
534    }
535
536    __ ret(0);
537
538    return start;
539  }
540
541  //---------------------------------------------------------------------------
542  // Wrapper for slow-case handling of double-to-integer conversion
543  // d2i or f2i fast case failed either because it is nan or because
544  // of under/overflow.
545  // Input:  FPU TOS: float value
546  // Output: rax, (rdx): integer (long) result
547
548  address generate_d2i_wrapper(BasicType t, address fcn) {
549    StubCodeMark mark(this, "StubRoutines", "d2i_wrapper");
550    address start = __ pc();
551
552  // Capture info about frame layout
553  enum layout { FPUState_off         = 0,
554                rbp_off              = FPUStateSizeInWords,
555                rdi_off,
556                rsi_off,
557                rcx_off,
558                rbx_off,
559                saved_argument_off,
560                saved_argument_off2, // 2nd half of double
561                framesize
562  };
563
564  assert(FPUStateSizeInWords == 27, "update stack layout");
565
566    // Save outgoing argument to stack across push_FPU_state()
567    __ subptr(rsp, wordSize * 2);
568    __ fstp_d(Address(rsp, 0));
569
570    // Save CPU & FPU state
571    __ push(rbx);
572    __ push(rcx);
573    __ push(rsi);
574    __ push(rdi);
575    __ push(rbp);
576    __ push_FPU_state();
577
578    // push_FPU_state() resets the FP top of stack
579    // Load original double into FP top of stack
580    __ fld_d(Address(rsp, saved_argument_off * wordSize));
581    // Store double into stack as outgoing argument
582    __ subptr(rsp, wordSize*2);
583    __ fst_d(Address(rsp, 0));
584
585    // Prepare FPU for doing math in C-land
586    __ empty_FPU_stack();
587    // Call the C code to massage the double.  Result in EAX
588    if (t == T_INT)
589      { BLOCK_COMMENT("SharedRuntime::d2i"); }
590    else if (t == T_LONG)
591      { BLOCK_COMMENT("SharedRuntime::d2l"); }
592    __ call_VM_leaf( fcn, 2 );
593
594    // Restore CPU & FPU state
595    __ pop_FPU_state();
596    __ pop(rbp);
597    __ pop(rdi);
598    __ pop(rsi);
599    __ pop(rcx);
600    __ pop(rbx);
601    __ addptr(rsp, wordSize * 2);
602
603    __ ret(0);
604
605    return start;
606  }
607
608
609  //---------------------------------------------------------------------------
610  // The following routine generates a subroutine to throw an asynchronous
611  // UnknownError when an unsafe access gets a fault that could not be
612  // reasonably prevented by the programmer.  (Example: SIGBUS/OBJERR.)
613  address generate_handler_for_unsafe_access() {
614    StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
615    address start = __ pc();
616
617    __ push(0);                       // hole for return address-to-be
618    __ pusha();                       // push registers
619    Address next_pc(rsp, RegisterImpl::number_of_registers * BytesPerWord);
620    BLOCK_COMMENT("call handle_unsafe_access");
621    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, handle_unsafe_access)));
622    __ movptr(next_pc, rax);          // stuff next address
623    __ popa();
624    __ ret(0);                        // jump to next address
625
626    return start;
627  }
628
629
630  //----------------------------------------------------------------------------------------------------
631  // Non-destructive plausibility checks for oops
632
633  address generate_verify_oop() {
634    StubCodeMark mark(this, "StubRoutines", "verify_oop");
635    address start = __ pc();
636
637    // Incoming arguments on stack after saving rax,:
638    //
639    // [tos    ]: saved rdx
640    // [tos + 1]: saved EFLAGS
641    // [tos + 2]: return address
642    // [tos + 3]: char* error message
643    // [tos + 4]: oop   object to verify
644    // [tos + 5]: saved rax, - saved by caller and bashed
645
646    Label exit, error;
647    __ pushf();
648    __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
649    __ push(rdx);                                // save rdx
650    // make sure object is 'reasonable'
651    __ movptr(rax, Address(rsp, 4 * wordSize));    // get object
652    __ testptr(rax, rax);
653    __ jcc(Assembler::zero, exit);               // if obj is NULL it is ok
654
655    // Check if the oop is in the right area of memory
656    const int oop_mask = Universe::verify_oop_mask();
657    const int oop_bits = Universe::verify_oop_bits();
658    __ mov(rdx, rax);
659    __ andptr(rdx, oop_mask);
660    __ cmpptr(rdx, oop_bits);
661    __ jcc(Assembler::notZero, error);
662
663    // make sure klass is 'reasonable'
664    __ movptr(rax, Address(rax, oopDesc::klass_offset_in_bytes())); // get klass
665    __ testptr(rax, rax);
666    __ jcc(Assembler::zero, error);              // if klass is NULL it is broken
667
668    // Check if the klass is in the right area of memory
669    const int klass_mask = Universe::verify_klass_mask();
670    const int klass_bits = Universe::verify_klass_bits();
671    __ mov(rdx, rax);
672    __ andptr(rdx, klass_mask);
673    __ cmpptr(rdx, klass_bits);
674    __ jcc(Assembler::notZero, error);
675
676    // make sure klass' klass is 'reasonable'
677    __ movptr(rax, Address(rax, oopDesc::klass_offset_in_bytes())); // get klass' klass
678    __ testptr(rax, rax);
679    __ jcc(Assembler::zero, error);              // if klass' klass is NULL it is broken
680
681    __ mov(rdx, rax);
682    __ andptr(rdx, klass_mask);
683    __ cmpptr(rdx, klass_bits);
684    __ jcc(Assembler::notZero, error);           // if klass not in right area
685                                                 // of memory it is broken too.
686
687    // return if everything seems ok
688    __ bind(exit);
689    __ movptr(rax, Address(rsp, 5 * wordSize));  // get saved rax, back
690    __ pop(rdx);                                 // restore rdx
691    __ popf();                                   // restore EFLAGS
692    __ ret(3 * wordSize);                        // pop arguments
693
694    // handle errors
695    __ bind(error);
696    __ movptr(rax, Address(rsp, 5 * wordSize));  // get saved rax, back
697    __ pop(rdx);                                 // get saved rdx back
698    __ popf();                                   // get saved EFLAGS off stack -- will be ignored
699    __ pusha();                                  // push registers (eip = return address & msg are already pushed)
700    BLOCK_COMMENT("call MacroAssembler::debug");
701    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
702    __ popa();
703    __ ret(3 * wordSize);                        // pop arguments
704    return start;
705  }
706
707  //
708  //  Generate pre-barrier for array stores
709  //
  //  Input:
  //     start   -  starting address
  //     count   -  element count
713  void  gen_write_ref_array_pre_barrier(Register start, Register count) {
714    assert_different_registers(start, count);
715    BarrierSet* bs = Universe::heap()->barrier_set();
716    switch (bs->kind()) {
717      case BarrierSet::G1SATBCT:
718      case BarrierSet::G1SATBCTLogging:
719        {
720          __ pusha();                      // push registers
721          __ push(count);
722          __ push(start);
723          __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre)));
724          __ addptr(rsp, 2*wordSize);
725          __ popa();
726        }
727        break;
728      case BarrierSet::CardTableModRef:
729      case BarrierSet::CardTableExtension:
730      case BarrierSet::ModRef:
731        break;
732      default      :
733        ShouldNotReachHere();
734
735    }
736  }
737
738
739  //
740  // Generate a post-barrier for an array store
741  //
742  //     start    -  starting address
743  //     count    -  element count
744  //
745  //  The two input registers are overwritten.
746  //
747  void  gen_write_ref_array_post_barrier(Register start, Register count) {
748    BarrierSet* bs = Universe::heap()->barrier_set();
749    assert_different_registers(start, count);
750    switch (bs->kind()) {
751      case BarrierSet::G1SATBCT:
752      case BarrierSet::G1SATBCTLogging:
753        {
754          __ pusha();                      // push registers
755          __ push(count);
756          __ push(start);
757          __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post)));
758          __ addptr(rsp, 2*wordSize);
759          __ popa();
760
761        }
762        break;
763
764      case BarrierSet::CardTableModRef:
765      case BarrierSet::CardTableExtension:
766        {
767          CardTableModRefBS* ct = (CardTableModRefBS*)bs;
768          assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
769
770          Label L_loop;
771          const Register end = count;  // elements count; end == start+count-1
772          assert_different_registers(start, end);
773
774          __ lea(end,  Address(start, count, Address::times_ptr, -wordSize));
775          __ shrptr(start, CardTableModRefBS::card_shift);
776          __ shrptr(end,   CardTableModRefBS::card_shift);
777          __ subptr(end, start); // end --> count
778        __ BIND(L_loop);
779          intptr_t disp = (intptr_t) ct->byte_map_base;
780          Address cardtable(start, count, Address::times_1, disp);
781          __ movb(cardtable, 0);
782          __ decrement(count);
783          __ jcc(Assembler::greaterEqual, L_loop);
784        }
785        break;
786      case BarrierSet::ModRef:
787        break;
788      default      :
789        ShouldNotReachHere();
790
791    }
792  }
793
794
  // Copy 64-byte chunks, then any remaining qwords, through XMM registers
  //
  // Inputs:
  //   from        - source array address
  //   to_from     - destination array address - from
  //   qword_count - 8-byte element count, positive (counted down to zero)
  //
802  void xmm_copy_forward(Register from, Register to_from, Register qword_count) {
803    assert( UseSSE >= 2, "supported cpu only" );
804    Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
805    // Copy 64-byte chunks
806    __ jmpb(L_copy_64_bytes);
807    __ align(16);
808  __ BIND(L_copy_64_bytes_loop);
809
810    if(UseUnalignedLoadStores) {
811      __ movdqu(xmm0, Address(from, 0));
812      __ movdqu(Address(from, to_from, Address::times_1, 0), xmm0);
813      __ movdqu(xmm1, Address(from, 16));
814      __ movdqu(Address(from, to_from, Address::times_1, 16), xmm1);
815      __ movdqu(xmm2, Address(from, 32));
816      __ movdqu(Address(from, to_from, Address::times_1, 32), xmm2);
817      __ movdqu(xmm3, Address(from, 48));
818      __ movdqu(Address(from, to_from, Address::times_1, 48), xmm3);
819
820    } else {
821      __ movq(xmm0, Address(from, 0));
822      __ movq(Address(from, to_from, Address::times_1, 0), xmm0);
823      __ movq(xmm1, Address(from, 8));
824      __ movq(Address(from, to_from, Address::times_1, 8), xmm1);
825      __ movq(xmm2, Address(from, 16));
826      __ movq(Address(from, to_from, Address::times_1, 16), xmm2);
827      __ movq(xmm3, Address(from, 24));
828      __ movq(Address(from, to_from, Address::times_1, 24), xmm3);
829      __ movq(xmm4, Address(from, 32));
830      __ movq(Address(from, to_from, Address::times_1, 32), xmm4);
831      __ movq(xmm5, Address(from, 40));
832      __ movq(Address(from, to_from, Address::times_1, 40), xmm5);
833      __ movq(xmm6, Address(from, 48));
834      __ movq(Address(from, to_from, Address::times_1, 48), xmm6);
835      __ movq(xmm7, Address(from, 56));
836      __ movq(Address(from, to_from, Address::times_1, 56), xmm7);
837    }
838
839    __ addl(from, 64);
840  __ BIND(L_copy_64_bytes);
841    __ subl(qword_count, 8);
842    __ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop);
843    __ addl(qword_count, 8);
844    __ jccb(Assembler::zero, L_exit);
845    //
846    // length is too short, just copy qwords
847    //
848  __ BIND(L_copy_8_bytes);
849    __ movq(xmm0, Address(from, 0));
850    __ movq(Address(from, to_from, Address::times_1), xmm0);
851    __ addl(from, 8);
852    __ decrement(qword_count);
853    __ jcc(Assembler::greater, L_copy_8_bytes);
854  __ BIND(L_exit);
855  }
856
  // Copy 64-byte chunks, then any remaining qwords, through MMX registers
  //
  // Inputs:
  //   from        - source array address
  //   to_from     - destination array address - from
  //   qword_count - 8-byte element count, positive (counted down to zero)
  //
864  void mmx_copy_forward(Register from, Register to_from, Register qword_count) {
865    assert( VM_Version::supports_mmx(), "supported cpu only" );
866    Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
867    // Copy 64-byte chunks
868    __ jmpb(L_copy_64_bytes);
869    __ align(16);
870  __ BIND(L_copy_64_bytes_loop);
871    __ movq(mmx0, Address(from, 0));
872    __ movq(mmx1, Address(from, 8));
873    __ movq(mmx2, Address(from, 16));
874    __ movq(Address(from, to_from, Address::times_1, 0), mmx0);
875    __ movq(mmx3, Address(from, 24));
876    __ movq(Address(from, to_from, Address::times_1, 8), mmx1);
877    __ movq(mmx4, Address(from, 32));
878    __ movq(Address(from, to_from, Address::times_1, 16), mmx2);
879    __ movq(mmx5, Address(from, 40));
880    __ movq(Address(from, to_from, Address::times_1, 24), mmx3);
881    __ movq(mmx6, Address(from, 48));
882    __ movq(Address(from, to_from, Address::times_1, 32), mmx4);
883    __ movq(mmx7, Address(from, 56));
884    __ movq(Address(from, to_from, Address::times_1, 40), mmx5);
885    __ movq(Address(from, to_from, Address::times_1, 48), mmx6);
886    __ movq(Address(from, to_from, Address::times_1, 56), mmx7);
887    __ addptr(from, 64);
888  __ BIND(L_copy_64_bytes);
889    __ subl(qword_count, 8);
890    __ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop);
891    __ addl(qword_count, 8);
892    __ jccb(Assembler::zero, L_exit);
893    //
894    // length is too short, just copy qwords
895    //
896  __ BIND(L_copy_8_bytes);
897    __ movq(mmx0, Address(from, 0));
898    __ movq(Address(from, to_from, Address::times_1), mmx0);
899    __ addptr(from, 8);
900    __ decrement(qword_count);
901    __ jcc(Assembler::greater, L_copy_8_bytes);
902  __ BIND(L_exit);
903    __ emms();
904  }
905
//
//  Generate an array copy stub that copies from low to high addresses.
//
//  Generator arguments:
//    t       - element type; T_BYTE and T_SHORT get alignment and tail code,
//              T_OBJECT gets the reference array pre/post barriers
//    aligned - when true the source alignment prologue is not emitted
//    sf      - element scale factor; only used to compute 'shift'
//    entry   - out: pc of the second entry point bound below.  Code entering
//              there must already have from/to/count in rsi/rdi/rcx (and,
//              for T_OBJECT, 'to' in rdx) and the same stack layout.
//    name    - stub name handed to StubCodeMark
//
//  Stub input, as read after enter() and the two pushes:
//    Address(rsp, 12+ 4) - source array address
//    Address(rsp, 12+ 8) - destination array address
//    Address(rsp, 12+12) - element count
//
//  Stub output:
//    rax == 0
//
//  Returns the start address of the generated stub.
//
906  address generate_disjoint_copy(BasicType t, bool aligned,
907                                 Address::ScaleFactor sf,
908                                 address* entry, const char *name) {
909    __ align(CodeEntryAlignment);
910    StubCodeMark mark(this, "StubRoutines", name);
911    address start = __ pc();
912
913    Label L_0_count, L_exit, L_skip_align1, L_skip_align2, L_copy_byte;
914    Label L_copy_2_bytes, L_copy_4_bytes, L_copy_64_bytes;
915
       // (1 << shift) elements make up 4 bytes and (2 << shift) elements make
       // up 8 bytes; compare the "< 8 bytes" test below.
916    int shift = Address::times_ptr - sf;
917
918    const Register from     = rsi;  // source array address
919    const Register to       = rdi;  // destination array address
920    const Register count    = rcx;  // elements count
921    const Register to_from  = to;   // (to - from)
922    const Register saved_to = rdx;  // saved destination array address
923
924    __ enter(); // required for proper stackwalking of RuntimeStub frame
925    __ push(rsi);
926    __ push(rdi);
927    __ movptr(from , Address(rsp, 12+ 4));
928    __ movptr(to   , Address(rsp, 12+ 8));
929    __ movl(count, Address(rsp, 12+ 12));
930    if (t == T_OBJECT) {
         // Nothing to copy and no barriers needed for an empty oop array.
931      __ testl(count, count);
932      __ jcc(Assembler::zero, L_0_count);
933      gen_write_ref_array_pre_barrier(to, count);
934      __ mov(saved_to, to);          // save 'to'
935    }
936
937    *entry = __ pc(); // Entry point from conjoint arraycopy stub.
938    BLOCK_COMMENT("Entry:");
939
       // From here on the destination is addressed as from + to_from, so only
       // 'from' has to be advanced while copying.
940    __ subptr(to, from); // to --> to_from
941    __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
942    __ jcc(Assembler::below, L_copy_4_bytes); // use unsigned cmp
943    if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
944      // align source address at 4 bytes address boundary
945      if (t == T_BYTE) {
946        // One byte misalignment happens only for byte arrays
947        __ testl(from, 1);
948        __ jccb(Assembler::zero, L_skip_align1);
949        __ movb(rax, Address(from, 0));
950        __ movb(Address(from, to_from, Address::times_1, 0), rax);
951        __ increment(from);
952        __ decrement(count);
953      __ BIND(L_skip_align1);
954      }
955      // Two bytes misalignment happens only for byte and short (char) arrays
956      __ testl(from, 2);
957      __ jccb(Assembler::zero, L_skip_align2);
958      __ movw(rax, Address(from, 0));
959      __ movw(Address(from, to_from, Address::times_1, 0), rax);
960      __ addptr(from, 2);
         // 1 << (shift-1) elements == 2 bytes
961      __ subl(count, 1<<(shift-1));
962    __ BIND(L_skip_align2);
963    }
964    if (!VM_Version::supports_mmx()) {
         // No MMX: copy all whole dwords with rep_mov, which needs the real
         // destination address in rdi and the dword count in rcx.
965      __ mov(rax, count);      // save 'count'
966      __ shrl(count, shift); // dword count
967      __ addptr(to_from, from);// restore 'to'
968      __ rep_mov();
969      __ subptr(to_from, from);// restore 'to_from'
970      __ mov(count, rax);      // restore 'count'
971      __ jmpb(L_copy_2_bytes); // all dwords were copied
972    } else {
973      if (!UseUnalignedLoadStores) {
974        // align to 8 bytes, we know we are 4 byte aligned to start
975        __ testptr(from, 4);
976        __ jccb(Assembler::zero, L_copy_64_bytes);
977        __ movl(rax, Address(from, 0));
978        __ movl(Address(from, to_from, Address::times_1, 0), rax);
979        __ addptr(from, 4);
980        __ subl(count, 1<<shift);
981      }
982    __ BIND(L_copy_64_bytes);
         // rax receives the qword count; 'count' itself is kept so that its
         // low bits can drive the tail copies below.
983      __ mov(rax, count);
984      __ shrl(rax, shift+1);  // 8 bytes chunk count
985      //
986      // Copy 8-byte chunks through MMX or XMM registers, 8 per iteration
         // of the loop
987      //
988      if (UseXMMForArrayCopy) {
989        xmm_copy_forward(from, to_from, rax);
990      } else {
991        mmx_copy_forward(from, to_from, rax);
992      }
993    }
       // The tail is selected by testing individual bits of 'count'; the labels
       // are bound in whichever branch is emitted for the element type.
994    // copy tailing dword
995  __ BIND(L_copy_4_bytes);
996    __ testl(count, 1<<shift);
997    __ jccb(Assembler::zero, L_copy_2_bytes);
998    __ movl(rax, Address(from, 0));
999    __ movl(Address(from, to_from, Address::times_1, 0), rax);
1000    if (t == T_BYTE || t == T_SHORT) {
1001      __ addptr(from, 4);
1002    __ BIND(L_copy_2_bytes);
1003      // copy tailing word
1004      __ testl(count, 1<<(shift-1));
1005      __ jccb(Assembler::zero, L_copy_byte);
1006      __ movw(rax, Address(from, 0));
1007      __ movw(Address(from, to_from, Address::times_1, 0), rax);
1008      if (t == T_BYTE) {
1009        __ addptr(from, 2);
1010      __ BIND(L_copy_byte);
1011        // copy tailing byte
1012        __ testl(count, 1);
1013        __ jccb(Assembler::zero, L_exit);
1014        __ movb(rax, Address(from, 0));
1015        __ movb(Address(from, to_from, Address::times_1, 0), rax);
1016      __ BIND(L_exit);
1017      } else {
            // T_SHORT: there is no byte tail, the label just marks the end
1018      __ BIND(L_copy_byte);
1019      }
1020    } else {
          // 4-byte elements: there is no word or byte tail
1021    __ BIND(L_copy_2_bytes);
1022    }
1023
1024    if (t == T_OBJECT) {
          // 'count' and 'to' were modified above; fetch the original values
          // for the post barrier.
1025      __ movl(count, Address(rsp, 12+12)); // reread 'count'
1026      __ mov(to, saved_to); // restore 'to'
1027      gen_write_ref_array_post_barrier(to, count);
1028    __ BIND(L_0_count);
1029    }
1030    inc_copy_counter_np(t);
1031    __ pop(rdi);
1032    __ pop(rsi);
1033    __ leave(); // required for proper stackwalking of RuntimeStub frame
1034    __ xorptr(rax, rax); // return 0
1035    __ ret(0);
1036    return start;
1037  }
1038
1039
//
//  Generate an array copy stub that tolerates overlapping arrays.
//
//  The stub first tests whether a forward copy is safe (dst <= src, or
//  dst >= src + count * element size) and in that case jumps to
//  nooverlap_target.  Otherwise it copies from high to low addresses.
//
//  Generator arguments:
//    t                - element type; T_BYTE and T_SHORT get alignment and
//                       tail code, T_OBJECT gets the reference array barriers
//    aligned          - not used by this generator
//    sf               - element scale factor
//    nooverlap_target - address jumped to when a forward copy is safe
//    entry            - out (may be NULL): pc of the second entry point.  Code
//                       entering there must have src/dst/count in rax/rdx/rcx
//                       and the same stack layout.
//    name             - stub name handed to StubCodeMark
//
//  Stub input, as read after enter() and the two pushes:
//    Address(rsp, 12+ 4) - source array address
//    Address(rsp, 12+ 8) - destination array address
//    Address(rsp, 12+12) - element count
//
//  Stub output:
//    rax == 0
//
//  Returns the start address of the generated stub.
//
1040  address generate_conjoint_copy(BasicType t, bool aligned,
1041                                 Address::ScaleFactor sf,
1042                                 address nooverlap_target,
1043                                 address* entry, const char *name) {
1044    __ align(CodeEntryAlignment);
1045    StubCodeMark mark(this, "StubRoutines", name);
1046    address start = __ pc();
1047
1048    Label L_0_count, L_exit, L_skip_align1, L_skip_align2, L_copy_byte;
1049    Label L_copy_2_bytes, L_copy_4_bytes, L_copy_8_bytes, L_copy_8_bytes_loop;
1050
        // (1 << shift) elements make up 4 bytes and (2 << shift) elements make
        // up 8 bytes; compare the "< 8 bytes" tests below.
1051    int shift = Address::times_ptr - sf;
1052
        // Note that 'src' and 'end' share rax, and that rdx ('dst') doubles as
        // the scratch register for the element moves further down.
1053    const Register src   = rax;  // source array address
1054    const Register dst   = rdx;  // destination array address
1055    const Register from  = rsi;  // source array address
1056    const Register to    = rdi;  // destination array address
1057    const Register count = rcx;  // elements count
1058    const Register end   = rax;  // array end address
1059
1060    __ enter(); // required for proper stackwalking of RuntimeStub frame
1061    __ push(rsi);
1062    __ push(rdi);
1063    __ movptr(src  , Address(rsp, 12+ 4));   // from
1064    __ movptr(dst  , Address(rsp, 12+ 8));   // to
1065    __ movl2ptr(count, Address(rsp, 12+12)); // count
1066    if (t == T_OBJECT) {
          // NOTE(review): this pre-barrier is emitted before *entry, so code
          // that enters at *entry does not execute it - confirm that such
          // callers do not need it or perform it themselves.
1067       gen_write_ref_array_pre_barrier(dst, count);
1068    }
1069
1070    if (entry != NULL) {
1071      *entry = __ pc(); // Entry point from generic arraycopy stub.
1072      BLOCK_COMMENT("Entry:");
1073    }
1074
1075    if (t == T_OBJECT) {
1076      __ testl(count, count);
1077      __ jcc(Assembler::zero, L_0_count);
1078    }
1079    __ mov(from, src);
1080    __ mov(to  , dst);
1081
1082    // arrays overlap test
1083    RuntimeAddress nooverlap(nooverlap_target);
1084    __ cmpptr(dst, src);
        // lea is placed between the compare and the jump that consumes it
1085    __ lea(end, Address(src, count, sf, 0)); // src + count * elem_size
1086    __ jump_cc(Assembler::belowEqual, nooverlap);
1087    __ cmpptr(dst, end);
1088    __ jump_cc(Assembler::aboveEqual, nooverlap);
1089
1090    // copy from high to low
        // 'from' and 'to' stay fixed; elements are addressed through 'count',
        // which is decreased as the copy proceeds.
1091    __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
1092    __ jcc(Assembler::below, L_copy_4_bytes); // use unsigned cmp
1093    if (t == T_BYTE || t == T_SHORT) {
1094      // Align the end of destination array at 4 bytes address boundary
1095      __ lea(end, Address(dst, count, sf, 0));
1096      if (t == T_BYTE) {
1097        // One byte misalignment happens only for byte arrays
1098        __ testl(end, 1);
1099        __ jccb(Assembler::zero, L_skip_align1);
1100        __ decrement(count);
1101        __ movb(rdx, Address(from, count, sf, 0));
1102        __ movb(Address(to, count, sf, 0), rdx);
1103      __ BIND(L_skip_align1);
1104      }
1105      // Two bytes misalignment happens only for byte and short (char) arrays
1106      __ testl(end, 2);
1107      __ jccb(Assembler::zero, L_skip_align2);
1108      __ subptr(count, 1<<(shift-1));
1109      __ movw(rdx, Address(from, count, sf, 0));
1110      __ movw(Address(to, count, sf, 0), rdx);
1111    __ BIND(L_skip_align2);
1112      __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
1113      __ jcc(Assembler::below, L_copy_4_bytes);
1114    }
1115
1116    if (!VM_Version::supports_mmx()) {
          // No MMX: copy all whole dwords backwards with rep_mov.  The
          // direction flag is set for the copy and cleared again afterwards;
          // rsi/rdi are pointed at the last dword of each array.
1117      __ std();
1118      __ mov(rax, count); // Save 'count'
1119      __ mov(rdx, to);    // Save 'to'
1120      __ lea(rsi, Address(from, count, sf, -4));
1121      __ lea(rdi, Address(to  , count, sf, -4));
1122      __ shrptr(count, shift); // dword count
1123      __ rep_mov();
1124      __ cld();
1125      __ mov(count, rax); // restore 'count'
1126      __ andl(count, (1<<shift)-1);      // mask the number of rest elements
1127      __ movptr(from, Address(rsp, 12+4)); // reread 'from'
1128      __ mov(to, rdx);   // restore 'to'
1129      __ jmpb(L_copy_2_bytes); // all dword were copied
1130   } else {
1131      // Align to 8 bytes the end of array. It is aligned to 4 bytes already.
1132      __ testptr(end, 4);
1133      __ jccb(Assembler::zero, L_copy_8_bytes);
1134      __ subl(count, 1<<shift);
1135      __ movl(rdx, Address(from, count, sf, 0));
1136      __ movl(Address(to, count, sf, 0), rdx);
1137      __ jmpb(L_copy_8_bytes);
1138
1139      __ align(16);
1140      // Move 8 bytes
1141    __ BIND(L_copy_8_bytes_loop);
1142      if (UseXMMForArrayCopy) {
1143        __ movq(xmm0, Address(from, count, sf, 0));
1144        __ movq(Address(to, count, sf, 0), xmm0);
1145      } else {
1146        __ movq(mmx0, Address(from, count, sf, 0));
1147        __ movq(Address(to, count, sf, 0), mmx0);
1148      }
1149    __ BIND(L_copy_8_bytes);
          // Step 'count' down by 8 bytes worth of elements; loop while that
          // did not go below zero, then undo the last step.
1150      __ subl(count, 2<<shift);
1151      __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
1152      __ addl(count, 2<<shift);
1153      if (!UseXMMForArrayCopy) {
1154        __ emms();
1155      }
1156    }
1157  __ BIND(L_copy_4_bytes);
1158    // copy prefix dword
1159    __ testl(count, 1<<shift);
1160    __ jccb(Assembler::zero, L_copy_2_bytes);
1161    __ movl(rdx, Address(from, count, sf, -4));
1162    __ movl(Address(to, count, sf, -4), rdx);
1163
1164    if (t == T_BYTE || t == T_SHORT) {
1165        __ subl(count, (1<<shift));
1166      __ BIND(L_copy_2_bytes);
1167        // copy prefix word
1168        __ testl(count, 1<<(shift-1));
1169        __ jccb(Assembler::zero, L_copy_byte);
1170        __ movw(rdx, Address(from, count, sf, -2));
1171        __ movw(Address(to, count, sf, -2), rdx);
1172        if (t == T_BYTE) {
1173          __ subl(count, 1<<(shift-1));
1174        __ BIND(L_copy_byte);
1175          // copy prefix byte
              // (only the very first byte can be left at this point)
1176          __ testl(count, 1);
1177          __ jccb(Assembler::zero, L_exit);
1178          __ movb(rdx, Address(from, 0));
1179          __ movb(Address(to, 0), rdx);
1180        __ BIND(L_exit);
1181        } else {
              // T_SHORT: there is no byte prefix, the label just marks the end
1182        __ BIND(L_copy_byte);
1183        }
1184    } else {
          // 4-byte elements: there is no word or byte prefix
1185    __ BIND(L_copy_2_bytes);
1186    }
1187    if (t == T_OBJECT) {
1188      __ movl2ptr(count, Address(rsp, 12+12)); // reread count
1189      gen_write_ref_array_post_barrier(to, count);
1190    __ BIND(L_0_count);
1191    }
1192    inc_copy_counter_np(t);
1193    __ pop(rdi);
1194    __ pop(rsi);
1195    __ leave(); // required for proper stackwalking of RuntimeStub frame
1196    __ xorptr(rax, rax); // return 0
1197    __ ret(0);
1198    return start;
1199  }
1200
1201
//
//  Generate a copy stub for 8-byte elements that copies from low to high
//  addresses.
//
//  Generator arguments:
//    entry - out: pc of the second entry point; code entering there must
//            have from/to/count in rax/rdx/rcx
//    name  - stub name handed to StubCodeMark
//
//  Stub input, as read after enter():
//    Address(rsp, 8+0) - source array address
//    Address(rsp, 8+4) - destination array address
//    Address(rsp, 8+8) - element count
//
//  Stub output:
//    rax == 0
//
//  Returns the start address of the generated stub.
//
1202  address generate_disjoint_long_copy(address* entry, const char *name) {
1203    __ align(CodeEntryAlignment);
1204    StubCodeMark mark(this, "StubRoutines", name);
1205    address start = __ pc();
1206
1207    Label L_copy_8_bytes, L_copy_8_bytes_loop;
        // 'to' and 'to_from' are the same register: 'to' is turned into the
        // difference (to - from) right after the entry point.
1208    const Register from       = rax;  // source array address
1209    const Register to         = rdx;  // destination array address
1210    const Register count      = rcx;  // elements count
1211    const Register to_from    = rdx;  // (to - from)
1212
1213    __ enter(); // required for proper stackwalking of RuntimeStub frame
1214    __ movptr(from , Address(rsp, 8+0));       // from
1215    __ movptr(to   , Address(rsp, 8+4));       // to
1216    __ movl2ptr(count, Address(rsp, 8+8));     // count
1217
1218    *entry = __ pc(); // Entry point from conjoint arraycopy stub.
1219    BLOCK_COMMENT("Entry:");
1220
1221    __ subptr(to, from); // to --> to_from
1222    if (VM_Version::supports_mmx()) {
          // The element count is already a qword count, so it is passed on
          // unchanged.
1223      if (UseXMMForArrayCopy) {
1224        xmm_copy_forward(from, to_from, count);
1225      } else {
1226        mmx_copy_forward(from, to_from, count);
1227      }
1228    } else {
          // No MMX: move each element with an fild_d / fistp_d pair.
          // The loop is rotated and entered at its count check.
1229      __ jmpb(L_copy_8_bytes);
1230      __ align(16);
1231    __ BIND(L_copy_8_bytes_loop);
1232      __ fild_d(Address(from, 0));
1233      __ fistp_d(Address(from, to_from, Address::times_1));
1234      __ addptr(from, 8);
1235    __ BIND(L_copy_8_bytes);
1236      __ decrement(count);
1237      __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
1238    }
1239    inc_copy_counter_np(T_LONG);
1240    __ leave(); // required for proper stackwalking of RuntimeStub frame
1241    __ xorptr(rax, rax); // return 0
1242    __ ret(0);
1243    return start;
1244  }
1245
//
//  Generate a copy stub for 8-byte elements that tolerates overlapping
//  arrays.
//
//  The stub jumps to nooverlap_target when to <= from or when
//  to >= from + count * 8; otherwise it copies from the last element down
//  to the first.
//
//  Generator arguments:
//    nooverlap_target - address jumped to when a forward copy is safe
//    entry            - out: pc of the second entry point; code entering
//                       there must have from/to/count in rax/rdx/rcx
//    name             - stub name handed to StubCodeMark
//
//  Stub input, as read after enter():
//    Address(rsp, 8+0) - source array address
//    Address(rsp, 8+4) - destination array address
//    Address(rsp, 8+8) - element count
//
//  Stub output:
//    rax == 0
//
//  Returns the start address of the generated stub.
//
1246  address generate_conjoint_long_copy(address nooverlap_target,
1247                                      address* entry, const char *name) {
1248    __ align(CodeEntryAlignment);
1249    StubCodeMark mark(this, "StubRoutines", name);
1250    address start = __ pc();
1251
1252    Label L_copy_8_bytes, L_copy_8_bytes_loop;
1253    const Register from       = rax;  // source array address
1254    const Register to         = rdx;  // destination array address
1255    const Register count      = rcx;  // elements count
1256    const Register end_from   = rax;  // source array end address
1257
1258    __ enter(); // required for proper stackwalking of RuntimeStub frame
1259    __ movptr(from , Address(rsp, 8+0));       // from
1260    __ movptr(to   , Address(rsp, 8+4));       // to
1261    __ movl2ptr(count, Address(rsp, 8+8));     // count
1262
1263    *entry = __ pc(); // Entry point from generic arraycopy stub.
1264    BLOCK_COMMENT("Entry:");
1265
1266    // arrays overlap test
1267    __ cmpptr(to, from);
1268    RuntimeAddress nooverlap(nooverlap_target);
1269    __ jump_cc(Assembler::belowEqual, nooverlap);
1270    __ lea(end_from, Address(from, count, Address::times_8, 0));
1271    __ cmpptr(to, end_from);
        // end_from shares rax with 'from', so 'from' is reloaded from the stack
        // between the compare and the jump that consumes it.
1272    __ movptr(from, Address(rsp, 8));  // from
1273    __ jump_cc(Assembler::aboveEqual, nooverlap);
1274
        // The loop is rotated and entered at its count check; 'count' is used
        // as the element index, running from count-1 down to 0.
1275    __ jmpb(L_copy_8_bytes);
1276
1277    __ align(16);
1278  __ BIND(L_copy_8_bytes_loop);
1279    if (VM_Version::supports_mmx()) {
1280      if (UseXMMForArrayCopy) {
1281        __ movq(xmm0, Address(from, count, Address::times_8));
1282        __ movq(Address(to, count, Address::times_8), xmm0);
1283      } else {
1284        __ movq(mmx0, Address(from, count, Address::times_8));
1285        __ movq(Address(to, count, Address::times_8), mmx0);
1286      }
1287    } else {
1288      __ fild_d(Address(from, count, Address::times_8));
1289      __ fistp_d(Address(to, count, Address::times_8));
1290    }
1291  __ BIND(L_copy_8_bytes);
1292    __ decrement(count);
1293    __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
1294
        // emms is only emitted when the loop above used an MMX register
1295    if (VM_Version::supports_mmx() && !UseXMMForArrayCopy) {
1296      __ emms();
1297    }
1298    inc_copy_counter_np(T_LONG);
1299    __ leave(); // required for proper stackwalking of RuntimeStub frame
1300    __ xorptr(rax, rax); // return 0
1301    __ ret(0);
1302    return start;
1303  }
1304
1305
1306  // Helper for generating a dynamic type check.
      // Emits code testing whether sub_klass is a subtype of the klass stored
      // at super_klass_addr.
      //
      //   sub_klass               - register holding the klass to test
      //   super_check_offset_addr - location of the super klass' check offset
      //   super_klass_addr        - location of the super klass; may be
      //                             rsp-relative (it is only evaluated while
      //                             rsp has the value it has on entry here)
      //   temp                    - scratch register
      //   L_success_ptr           - if NULL, success falls through and failure
      //                             jumps to *L_failure_ptr; otherwise success
      //                             jumps to *L_success_ptr and failure falls
      //                             through (L_failure_ptr is then ignored)
      //
1307  // The sub_klass must be one of {rbx, rdx, rsi}.
1308  // The temp is killed.
1309  void generate_type_check(Register sub_klass,
1310                           Address& super_check_offset_addr,
1311                           Address& super_klass_addr,
1312                           Register temp,
1313                           Label* L_success_ptr, Label* L_failure_ptr) {
1314    BLOCK_COMMENT("type_check:");
1315
1316    Label L_fallthrough;
1317    bool fall_through_on_success = (L_success_ptr == NULL);
1318    if (fall_through_on_success) {
1319      L_success_ptr = &L_fallthrough;
1320    } else {
1321      L_failure_ptr = &L_fallthrough;
1322    }
1323    Label& L_success = *L_success_ptr;
1324    Label& L_failure = *L_failure_ptr;
1325
1326    assert_different_registers(sub_klass, temp);
1327
1328    // a couple of useful fields in sub_klass:
1329    int ss_offset = (klassOopDesc::header_size() * HeapWordSize +
1330                     Klass::secondary_supers_offset_in_bytes());
1331    int sc_offset = (klassOopDesc::header_size() * HeapWordSize +
1332                     Klass::secondary_super_cache_offset_in_bytes());
1333    Address secondary_supers_addr(sub_klass, ss_offset);
1334    Address super_cache_addr(     sub_klass, sc_offset);
1335
1336    // if the pointers are equal, we are done (e.g., String[] elements)
1337    __ cmpptr(sub_klass, super_klass_addr);
1338    __ jcc(Assembler::equal, L_success);
1339
1340    // check the supertype display:
1341    __ movl2ptr(temp, super_check_offset_addr);
1342    Address super_check_addr(sub_klass, temp, Address::times_1, 0);
1343    __ movptr(temp, super_check_addr); // load displayed supertype
1344    __ cmpptr(temp, super_klass_addr); // test the super type
1345    __ jcc(Assembler::equal, L_success);
1346
1347    // if it was a primary super, we can just fail immediately
1348    __ cmpl(super_check_offset_addr, sc_offset);
1349    __ jcc(Assembler::notEqual, L_failure);
1350
1351    // Now do a linear scan of the secondary super-klass chain.
1352    // This code is rarely used, so simplicity is a virtue here.
1353    inc_counter_np(SharedRuntime::_partial_subtype_ctr);
1354    {
          // Fetch the super klass into temp *before* spilling anything.
          // super_klass_addr may be rsp-relative (the checkcast stub passes a
          // stack argument), and the three pushes below move rsp by 12 bytes,
          // so evaluating the same Address after them would read a different
          // stack slot.
          __ movptr(temp, super_klass_addr);

1355      // The repne_scan instruction uses fixed registers, which we must spill.
1356      // (We need a couple more temps in any case.)
1357      __ push(rax);
1358      __ push(rcx);
1359      __ push(rdi);
1360      assert_different_registers(sub_klass, rax, rcx, rdi);
1361
          // Put the scan key into rax right away: temp may itself be rcx or
          // rdi, which are overwritten just below.  mov leaves the flags alone.
          if (temp != rax) {
            __ mov(rax, temp);
          }

1362      __ movptr(rdi, secondary_supers_addr);
1363      // Load the array length.
1364      __ movl(rcx, Address(rdi, arrayOopDesc::length_offset_in_bytes()));
1365      // Skip to start of data.
1366      __ addptr(rdi, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1367      // Scan rcx words at [edi] for occurrence of rax,
1368      // Set NZ/Z based on last compare
1370      __ repne_scan();
1371
1372      // Unspill the temp. registers:
          // (pop does not change the flags set by the scan)
1373      __ pop(rdi);
1374      __ pop(rcx);
1375      __ pop(rax);
1376    }
1377    __ jcc(Assembler::notEqual, L_failure);
1378
1379    // Success.  Cache the super we found and proceed in triumph.
        // (rsp is back to its entry value here, so the Address is valid again)
1380    __ movptr(temp, super_klass_addr); // note: rax, is dead
1381    __ movptr(super_cache_addr, temp);
1382
1383    if (!fall_through_on_success)
1384      __ jmp(L_success);
1385
1386    // Fall through on failure!
1387    __ bind(L_fallthrough);
1388  }
1389
1390  //
1391  //  Generate checkcasting array copy stub
1392  //
1393  //  Input:
1394  //    4(rsp)   - source array address
1395  //    8(rsp)   - destination array address
1396  //   12(rsp)   - element count, can be zero
1397  //   16(rsp)   - size_t ckoff (super_check_offset)
1398  //   20(rsp)   - oop ckval (super_klass)
1399  //
1400  //  Output:
1401  //    rax, ==  0  -  success
1402  //    rax, == -1^K - failure, where K is partial transfer count
1403  //
      //  Generator arguments:
      //    name  - stub name handed to StubCodeMark
      //    entry - out: pc of the second entry point; code entering there must
      //            have from/to/length in rax/rdx/rcx and the same stack layout
      //
      //  Returns the start address of the generated stub.
      //
1404  address generate_checkcast_copy(const char *name, address* entry) {
1405    __ align(CodeEntryAlignment);
1406    StubCodeMark mark(this, "StubRoutines", name);
1407    address start = __ pc();
1408
1409    Label L_load_element, L_store_element, L_do_card_marks, L_done;
1410
1411    // register use:
1412    //  rax, rdx, rcx -- loop control (end_from, end_to, count)
1413    //  rdi, rsi      -- element access (oop, klass)
1414    //  rbx,           -- temp
1415    const Register from       = rax;    // source array address
1416    const Register to         = rdx;    // destination array address
1417    const Register length     = rcx;    // elements count
1418    const Register elem       = rdi;    // each oop copied
1419    const Register elem_klass = rsi;    // each elem._klass (sub_klass)
1420    const Register temp       = rbx;    // lone remaining temp
1421
1422    __ enter(); // required for proper stackwalking of RuntimeStub frame
1423
1424    __ push(rsi);
1425    __ push(rdi);
1426    __ push(rbx);
1427
        // The argument addresses are rsp-relative; they are only valid while
        // rsp has the value established by the three pushes above.
1428    Address   from_arg(rsp, 16+ 4);     // from
1429    Address     to_arg(rsp, 16+ 8);     // to
1430    Address length_arg(rsp, 16+12);     // elements count
1431    Address  ckoff_arg(rsp, 16+16);     // super_check_offset
1432    Address  ckval_arg(rsp, 16+20);     // super_klass
1433
1434    // Load up:
1435    __ movptr(from,     from_arg);
1436    __ movptr(to,         to_arg);
1437    __ movl2ptr(length, length_arg);
1438
1439    *entry = __ pc(); // Entry point from generic arraycopy stub.
1440    BLOCK_COMMENT("Entry:");
1441
1442    //---------------------------------------------------------------
1443    // Assembler stub will be used for this call to arraycopy
1444    // if the two arrays are subtypes of Object[] but the
1445    // destination array type is not equal to or a supertype
1446    // of the source type.  Each element must be separately
1447    // checked.
1448
1449    // Loop-invariant addresses.  They are exclusive end pointers.
1450    Address end_from_addr(from, length, Address::times_ptr, 0);
1451    Address   end_to_addr(to,   length, Address::times_ptr, 0);
1452
1453    Register end_from = from;           // re-use
1454    Register end_to   = to;             // re-use
1455    Register count    = length;         // re-use
1456
1457    // Loop-variant addresses.  They assume post-incremented count < 0.
1458    Address from_element_addr(end_from, count, Address::times_ptr, 0);
1459    Address   to_element_addr(end_to,   count, Address::times_ptr, 0);
1460    Address elem_klass_addr(elem, oopDesc::klass_offset_in_bytes());
1461
1462    // Copy from low to high addresses, indexed from the end of each array.
1463    gen_write_ref_array_pre_barrier(to, count);
1464    __ lea(end_from, end_from_addr);
1465    __ lea(end_to,   end_to_addr);
1466    assert(length == count, "");        // else fix next line:
1467    __ negptr(count);                   // negate and test the length
1468    __ jccb(Assembler::notZero, L_load_element);
1469
1470    // Empty array:  Nothing to do.
1471    __ xorptr(rax, rax);                  // return 0 on (trivial) success
1472    __ jmp(L_done);
1473
1474    // ======== begin loop ========
1475    // (Loop is rotated; its entry is L_load_element.)
1476    // Loop control:
1477    //   for (count = -count; count != 0; count++)
1478    // Base pointers src, dst are biased by count elements (times_ptr
        // scale), i.e. they point just past the last element.
1479    __ align(16);
1480
1481    __ BIND(L_store_element);
1482    __ movptr(to_element_addr, elem);     // store the oop
1483    __ increment(count);                // increment the count toward zero
1484    __ jccb(Assembler::zero, L_do_card_marks);
1485
1486    // ======== loop entry is here ========
1487    __ BIND(L_load_element);
1488    __ movptr(elem, from_element_addr);   // load the oop
        // a null element needs no type check and is stored right away
1489    __ testptr(elem, elem);
1490    __ jccb(Assembler::zero, L_store_element);
1491
1492    // (Could do a trick here:  Remember last successful non-null
1493    // element stored and make a quick oop equality check on it.)
1494
1495    __ movptr(elem_klass, elem_klass_addr); // query the object klass
        // Success jumps back to L_store_element; failure falls through
        // (the failure label argument is NULL).
1496    generate_type_check(elem_klass, ckoff_arg, ckval_arg, temp,
1497                        &L_store_element, NULL);
1498      // (On fall-through, we have failed the element type check.)
1499    // ======== end loop ========
1500
1501    // It was a real error; we must depend on the caller to finish the job.
1502    // Register "count" = -1 * number of *remaining* oops, length_arg = *total* oops.
1503    // Emit GC store barriers for the oops we have copied (length_arg + count),
1504    // and report their number to the caller.
1505    __ addl(count, length_arg);         // transfers = (length - remaining)
1506    __ movl2ptr(rax, count);            // save the value
1507    __ notptr(rax);                     // report (-1^K) to caller
1508    __ movptr(to, to_arg);              // reload
1509    assert_different_registers(to, count, rax);
1510    gen_write_ref_array_post_barrier(to, count);
1511    __ jmpb(L_done);
1512
1513    // Come here on success only.
1514    __ BIND(L_do_card_marks);
        // 'count' has reached zero and 'to' points past the array; reload
        // the original values for the post barrier.
1515    __ movl2ptr(count, length_arg);
1516    __ movptr(to, to_arg);                // reload
1517    gen_write_ref_array_post_barrier(to, count);
1518    __ xorptr(rax, rax);                  // return 0 on success
1519
1520    // Common exit point (success or failure).
1521    __ BIND(L_done);
1522    __ pop(rbx);
1523    __ pop(rdi);
1524    __ pop(rsi);
1525    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1526    __ leave(); // required for proper stackwalking of RuntimeStub frame
1527    __ ret(0);
1528
1529    return start;
1530  }
1531
1532  //
1533  //  Generate 'unsafe' array copy stub
1534  //  Though just as safe as the other stubs, it takes an unscaled
1535  //  size_t argument instead of an element count.
1536  //
1537  //  Input:
1538  //    4(rsp)   - source array address
1539  //    8(rsp)   - destination array address
1540  //   12(rsp)   - byte count, can be zero
1541  //
1542  //  Output:
1543  //    rax, ==  0  -  success
1544  //    rax, == -1  -  need to call System.arraycopy
      //  (This stub never sets a result itself; rax is whatever the copy stub
      //  it jumps to leaves there.)
1545  //
1546  // Examines the alignment of the operands and dispatches
1547  // to a long, int, short, or byte copy loop.
1548  //
      // The *_copy_entry arguments are the addresses jumped to for each
      // element size.  Returns the start address of the generated stub.
      //
1549  address generate_unsafe_copy(const char *name,
1550                               address byte_copy_entry,
1551                               address short_copy_entry,
1552                               address int_copy_entry,
1553                               address long_copy_entry) {
1554
1555    Label L_long_aligned, L_int_aligned, L_short_aligned;
1556
1557    __ align(CodeEntryAlignment);
1558    StubCodeMark mark(this, "StubRoutines", name);
1559    address start = __ pc();
1560
1561    const Register from       = rax;  // source array address
1562    const Register to         = rdx;  // destination array address
1563    const Register count      = rcx;  // byte count, scaled to elements below
1564
1565    __ enter(); // required for proper stackwalking of RuntimeStub frame
1566    __ push(rsi);
1567    __ push(rdi);
1568    Address  from_arg(rsp, 12+ 4);      // from
1569    Address    to_arg(rsp, 12+ 8);      // to
1570    Address count_arg(rsp, 12+12);      // byte count
1571
1572    // Load up:
1573    __ movptr(from ,  from_arg);
1574    __ movptr(to   ,    to_arg);
1575    __ movl2ptr(count, count_arg);
1576
1577    // bump this on entry, not on exit:
1578    inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1579
        // OR both addresses and the size together: the low bits of the result
        // are zero only if all three are multiples of the tested unit.
1580    const Register bits = rsi;
1581    __ mov(bits, from);
1582    __ orptr(bits, to);
1583    __ orptr(bits, count);
1584
1585    __ testl(bits, BytesPerLong-1);
1586    __ jccb(Assembler::zero, L_long_aligned);
1587
1588    __ testl(bits, BytesPerInt-1);
1589    __ jccb(Assembler::zero, L_int_aligned);
1590
1591    __ testl(bits, BytesPerShort-1);
1592    __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
1593
        // L_short_aligned is reached by falling through; nothing jumps to it.
        // Each case below rescales the count in the register and also writes
        // it back to its stack slot before jumping to the copy stub.
1594    __ BIND(L_short_aligned);
1595    __ shrptr(count, LogBytesPerShort); // size => short_count
1596    __ movl(count_arg, count);          // update 'count'
1597    __ jump(RuntimeAddress(short_copy_entry));
1598
1599    __ BIND(L_int_aligned);
1600    __ shrptr(count, LogBytesPerInt); // size => int_count
1601    __ movl(count_arg, count);          // update 'count'
1602    __ jump(RuntimeAddress(int_copy_entry));
1603
1604    __ BIND(L_long_aligned);
1605    __ shrptr(count, LogBytesPerLong); // size => qword_count
1606    __ movl(count_arg, count);          // update 'count'
1607    __ pop(rdi); // Do pops here since jlong_arraycopy stub does not do it.
1608    __ pop(rsi);
1609    __ jump(RuntimeAddress(long_copy_entry));
1610
1611    return start;
1612  }
1613
1614
1615  // Perform range checks on the proposed arraycopy.
      // Emits code that jumps to L_failed when src_pos + length exceeds the
      // length of src, or dst_pos + length exceeds the length of dst; otherwise
      // execution falls through.
      //
      //   src, dst         - registers holding the array oops
      //   src_pos, dst_pos - registers holding the start positions
      //   length           - location of the element count
      //   L_failed         - label to branch to when a check fails
      //
1616  // Smashes src_pos and dst_pos.  (Uses them up for temps.)
1617  void arraycopy_range_checks(Register src,
1618                              Register src_pos,
1619                              Register dst,
1620                              Register dst_pos,
1621                              Address& length,
1622                              Label& L_failed) {
1623    BLOCK_COMMENT("arraycopy_range_checks:");
        // The end positions are computed in place, in the position registers.
1624    const Register src_end = src_pos;   // source array end position
1625    const Register dst_end = dst_pos;   // destination array end position
1626    __ addl(src_end, length); // src_pos + length
1627    __ addl(dst_end, length); // dst_pos + length
1628
        // Both comparisons are unsigned ('above').
1629    //  if (src_pos + length > arrayOop(src)->length() ) FAIL;
1630    __ cmpl(src_end, Address(src, arrayOopDesc::length_offset_in_bytes()));
1631    __ jcc(Assembler::above, L_failed);
1632
1633    //  if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
1634    __ cmpl(dst_end, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1635    __ jcc(Assembler::above, L_failed);
1636
1637    BLOCK_COMMENT("arraycopy_range_checks done");
1638  }
1639
1640
1641  //
1642  //  Generate generic array copy stubs
1643  //
      //  The emitted stub validates its five stack arguments and then jumps to
      //  one of the specialized copy stubs whose entry points are passed in.
      //
1644  //  Input:
1645  //     4(rsp)    -  src oop
1646  //     8(rsp)    -  src_pos
1647  //    12(rsp)    -  dst oop
1648  //    16(rsp)    -  dst_pos
1649  //    20(rsp)    -  element count
1650  //
1651  //  Output:
1652  //    rax, ==  0  -  success
1653  //    rax, == -1^K - failure, where K is partial transfer count
      //                   (the argument checks emitted here fail with rax == -1)
1654  //
      //  Parameters:
      //    name                       - stub name handed to StubCodeMark
      //    entry_jbyte_arraycopy      - target when log2 element size == 0
      //    entry_jshort_arraycopy     - target when log2 element size == LogBytesPerShort
      //    entry_jint_arraycopy       - target when log2 element size == LogBytesPerInt
      //    entry_jlong_arraycopy      - target for the remaining primitive case
      //    entry_oop_arraycopy        - target for object arrays that need no
      //                                 per-element check
      //    entry_checkcast_arraycopy  - target for object arrays that failed the
      //                                 array-level type check
      //
      //  Returns the address of the (CodeEntryAlignment-aligned) stub entry.
      //
1655  address generate_generic_copy(const char *name,
1656                                address entry_jbyte_arraycopy,
1657                                address entry_jshort_arraycopy,
1658                                address entry_jint_arraycopy,
1659                                address entry_oop_arraycopy,
1660                                address entry_jlong_arraycopy,
1661                                address entry_checkcast_arraycopy) {
1662    Label L_failed, L_failed_0, L_objArray;
1663
        // Pad with nops so that the 5-byte jmp emitted below ends exactly on a
        // CodeEntryAlignment boundary.  The stub entry then starts right behind
        // it, which keeps L_failed_0 within reach of the short (jccb) branches
        // used by the prologue checks.
1664    { int modulus = CodeEntryAlignment;
1665      int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
1666      int advance = target - (__ offset() % modulus);
1667      if (advance < 0)  advance += modulus;
1668      if (advance > 0)  __ nop(advance);
1669    }
1670    StubCodeMark mark(this, "StubRoutines", name);
1671
1672    // Short-hop target to L_failed.  Makes for denser prologue code.
1673    __ BIND(L_failed_0);
1674    __ jmp(L_failed);
1675    assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");
1676
1677    __ align(CodeEntryAlignment);
1678    address start = __ pc();
1679
1680    __ enter(); // required for proper stackwalking of RuntimeStub frame
1681    __ push(rsi);
1682    __ push(rdi);
1683
1684    // bump this on entry, not on exit:
1685    inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1686
1687    // Input values
        // (enter() and the two pushes above account for the 12-byte bias that is
        // added to the incoming argument offsets.)
1688    Address SRC     (rsp, 12+ 4);
1689    Address SRC_POS (rsp, 12+ 8);
1690    Address DST     (rsp, 12+12);
1691    Address DST_POS (rsp, 12+16);
1692    Address LENGTH  (rsp, 12+20);
1693
1694    //-----------------------------------------------------------------------
1695    // Assembler stub will be used for this call to arraycopy
1696    // if the following conditions are met:
1697    //
1698    // (1) src and dst must not be null.
1699    // (2) src_pos must not be negative.
1700    // (3) dst_pos must not be negative.
1701    // (4) length  must not be negative.
1702    // (5) src klass and dst klass should be the same and not NULL.
1703    // (6) src and dst should be arrays.
1704    // (7) src_pos + length must not exceed length of src.
1705    // (8) dst_pos + length must not exceed length of dst.
1706    //
1707
1708    const Register src     = rax;       // source array oop
1709    const Register src_pos = rsi;
1710    const Register dst     = rdx;       // destination array oop
1711    const Register dst_pos = rdi;
1712    const Register length  = rcx;       // transfer count
1713
1714    //  if (src == NULL) return -1;
1715    __ movptr(src, SRC);      // src oop
1716    __ testptr(src, src);
1717    __ jccb(Assembler::zero, L_failed_0);
1718
1719    //  if (src_pos < 0) return -1;
1720    __ movl2ptr(src_pos, SRC_POS);  // src_pos
1721    __ testl(src_pos, src_pos);
1722    __ jccb(Assembler::negative, L_failed_0);
1723
1724    //  if (dst == NULL) return -1;
1725    __ movptr(dst, DST);      // dst oop
1726    __ testptr(dst, dst);
1727    __ jccb(Assembler::zero, L_failed_0);
1728
1729    //  if (dst_pos < 0) return -1;
1730    __ movl2ptr(dst_pos, DST_POS);  // dst_pos
1731    __ testl(dst_pos, dst_pos);
1732    __ jccb(Assembler::negative, L_failed_0);
1733
1734    //  if (length < 0) return -1;
1735    __ movl2ptr(length, LENGTH);   // length
1736    __ testl(length, length);
1737    __ jccb(Assembler::negative, L_failed_0);
1738
1739    //  Load src->klass().  A NULL klass is not a failure path here; it is
        //  only asserted against in debug builds (see the ASSERT block below).
        //  NOTE: rcx_src_klass aliases 'length' (both are rcx), so the length
        //  loaded above is overwritten; LENGTH is re-read from the stack
        //  wherever the element count is needed later.
1740    Address src_klass_addr(src, oopDesc::klass_offset_in_bytes());
1741    Address dst_klass_addr(dst, oopDesc::klass_offset_in_bytes());
1742    const Register rcx_src_klass = rcx;    // array klass
1743    __ movptr(rcx_src_klass, Address(src, oopDesc::klass_offset_in_bytes()));
1744
1745#ifdef ASSERT
1746    //  assert(src->klass() != NULL);
1747    BLOCK_COMMENT("assert klasses not null");
1748    { Label L1, L2;
1749      __ testptr(rcx_src_klass, rcx_src_klass);
1750      __ jccb(Assembler::notZero, L2);   // it is broken if klass is NULL
1751      __ bind(L1);
1752      __ stop("broken null klass");
1753      __ bind(L2);
1754      __ cmpptr(dst_klass_addr, (int32_t)NULL_WORD);
1755      __ jccb(Assembler::equal, L1);      // this would be broken also
1756      BLOCK_COMMENT("assert done");
1757    }
1758#endif //ASSERT
1759
1760    // Load layout helper (32-bits)
1761    //
1762    //  |array_tag|     | header_size | element_type |     |log2_element_size|
1763    // 32        30    24            16              8     2                 0
1764    //
1765    //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
1766    //
1767
1768    int lh_offset = klassOopDesc::header_size() * HeapWordSize +
1769                    Klass::layout_helper_offset_in_bytes();
1770    Address src_klass_lh_addr(rcx_src_klass, lh_offset);
1771
1772    // Handle objArrays completely differently...
1773    jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
1774    __ cmpl(src_klass_lh_addr, objArray_lh);
1775    __ jcc(Assembler::equal, L_objArray);
1776
1777    //  if (src->klass() != dst->klass()) return -1;
1778    __ cmpptr(rcx_src_klass, dst_klass_addr);
1779    __ jccb(Assembler::notEqual, L_failed_0);
1780
1781    const Register rcx_lh = rcx;  // layout helper
1782    assert(rcx_lh == rcx_src_klass, "known alias");
1783    __ movl(rcx_lh, src_klass_lh_addr);
1784
1785    //  if (!src->is_Array()) return -1;
        //  (Per the diagram above the array tag occupies the top bits, so an
        //  array layout helper is negative when compared as a signed int;
        //  presumably _lh_neutral_value is 0 -- its definition is not in view.)
1786    __ cmpl(rcx_lh, Klass::_lh_neutral_value);
1787    __ jcc(Assembler::greaterEqual, L_failed_0); // signed cmp
1788
1789    // At this point, it is known to be a typeArray (array_tag 0x3).
1790#ifdef ASSERT
1791    { Label L;
1792      __ cmpl(rcx_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
1793      __ jcc(Assembler::greaterEqual, L); // signed cmp
1794      __ stop("must be a primitive array");
1795      __ bind(L);
1796    }
1797#endif
1798
1799    assert_different_registers(src, src_pos, dst, dst_pos, rcx_lh);
1800    arraycopy_range_checks(src, src_pos, dst, dst_pos, LENGTH, L_failed);
1801
1802    // typeArrayKlass
1803    //
1804    // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
1805    // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
1806    //
1807    const Register rsi_offset = rsi; // array offset
1808    const Register src_array  = src; // src array offset
1809    const Register dst_array  = dst; // dst array offset
1810    const Register rdi_elsize = rdi; // log2 element size
1811
        // Extract the header size from the layout helper and advance both array
        // pointers past the header; afterwards rcx is reduced to log2 elsize.
1812    __ mov(rsi_offset, rcx_lh);
1813    __ shrptr(rsi_offset, Klass::_lh_header_size_shift);
1814    __ andptr(rsi_offset, Klass::_lh_header_size_mask);   // array_offset
1815    __ addptr(src_array, rsi_offset);  // src array offset
1816    __ addptr(dst_array, rsi_offset);  // dst array offset
1817    __ andptr(rcx_lh, Klass::_lh_log2_element_size_mask); // log2 elsize
1818
1819    // next registers should be set before the jump to corresponding stub
1820    const Register from       = src; // source array address
1821    const Register to         = dst; // destination array address
1822    const Register count      = rcx; // elements count
1823    // some of them should be duplicated on stack
        // FROM, TO and COUNT overlay the incoming SRC, SRC_POS and DST slots
        // (identical offsets), so the original arguments must be read before
        // these are stored.
        // NOTE(review): the macros are not #undef'd after this function.
1824#define FROM   Address(rsp, 12+ 4)
1825#define TO     Address(rsp, 12+ 8)   // Not used now
1826#define COUNT  Address(rsp, 12+12)   // Only for oop arraycopy
1827
1828    BLOCK_COMMENT("scale indexes to element size");
        // src_pos/dst_pos were smashed by the range checks, so reload them.
1829    __ movl2ptr(rsi, SRC_POS);  // src_pos
1830    __ shlptr(rsi);             // src_pos << rcx (log2 elsize)
1831    assert(src_array == from, "");
1832    __ addptr(from, rsi);       // from = src_array + SRC_POS << log2 elsize
1833    __ movl2ptr(rdi, DST_POS);  // dst_pos
1834    __ shlptr(rdi);             // dst_pos << rcx (log2 elsize)
1835    assert(dst_array == to, "");
1836    __ addptr(to,  rdi);        // to   = dst_array + DST_POS << log2 elsize
1837    __ movptr(FROM, from);      // src_addr
1838    __ mov(rdi_elsize, rcx_lh); // log2 elsize
1839    __ movl2ptr(count, LENGTH); // elements count
1840
1841    BLOCK_COMMENT("choose copy loop based on element size");
1842    __ cmpl(rdi_elsize, 0);
1843
1844    __ jump_cc(Assembler::equal, RuntimeAddress(entry_jbyte_arraycopy));
1845    __ cmpl(rdi_elsize, LogBytesPerShort);
1846    __ jump_cc(Assembler::equal, RuntimeAddress(entry_jshort_arraycopy));
1847    __ cmpl(rdi_elsize, LogBytesPerInt);
1848    __ jump_cc(Assembler::equal, RuntimeAddress(entry_jint_arraycopy));
        // Anything left is treated as a long-sized element; debug builds verify it.
1849#ifdef ASSERT
1850    __ cmpl(rdi_elsize, LogBytesPerLong);
1851    __ jccb(Assembler::notEqual, L_failed);
1852#endif
1853    __ pop(rdi); // Do pops here since jlong_arraycopy stub does not do it.
1854    __ pop(rsi);
1855    __ jump(RuntimeAddress(entry_jlong_arraycopy));
1856
        // Common failure exit: rax = ~0 (-1), restore saved registers and return.
1857  __ BIND(L_failed);
1858    __ xorptr(rax, rax);
1859    __ notptr(rax); // return -1
1860    __ pop(rdi);
1861    __ pop(rsi);
1862    __ leave(); // required for proper stackwalking of RuntimeStub frame
1863    __ ret(0);
1864
1865    // objArrayKlass
1866  __ BIND(L_objArray);
1867    // live at this point:  rcx_src_klass, src[_pos], dst[_pos]
1868
1869    Label L_plain_copy, L_checkcast_copy;
1870    //  test array classes for subtyping
1871    __ cmpptr(rcx_src_klass, dst_klass_addr); // usual case is exact equality
1872    __ jccb(Assembler::notEqual, L_checkcast_copy);
1873
1874    // Identically typed arrays can be copied without element-wise checks.
1875    assert_different_registers(src, src_pos, dst, dst_pos, rcx_src_klass);
1876    arraycopy_range_checks(src, src_pos, dst, dst_pos, LENGTH, L_failed);
1877
        // Also reached from the checkcast path once the array-level type check
        // has passed.  All positions are reloaded from the stack before the
        // FROM/TO/COUNT slots (which overlay SRC/SRC_POS/DST) are written.
1878  __ BIND(L_plain_copy);
1879    __ movl2ptr(count, LENGTH); // elements count
1880    __ movl2ptr(src_pos, SRC_POS);  // reload src_pos
1881    __ lea(from, Address(src, src_pos, Address::times_ptr,
1882                 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
1883    __ movl2ptr(dst_pos, DST_POS);  // reload dst_pos
1884    __ lea(to,   Address(dst, dst_pos, Address::times_ptr,
1885                 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
1886    __ movptr(FROM,  from);   // src_addr
1887    __ movptr(TO,    to);     // dst_addr
1888    __ movl(COUNT, count);  // count
1889    __ jump(RuntimeAddress(entry_oop_arraycopy));
1890
1891  __ BIND(L_checkcast_copy);
1892    // live at this point:  rcx_src_klass, dst[_pos], src[_pos]
1893    {
1894      // Handy offsets:
1895      int  ek_offset = (klassOopDesc::header_size() * HeapWordSize +
1896                        objArrayKlass::element_klass_offset_in_bytes());
1897      int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
1898                        Klass::super_check_offset_offset_in_bytes());
1899
1900      Register rsi_dst_klass = rsi;
1901      Register rdi_temp      = rdi;
1902      assert(rsi_dst_klass == src_pos, "expected alias w/ src_pos");
1903      assert(rdi_temp      == dst_pos, "expected alias w/ dst_pos");
1904      Address dst_klass_lh_addr(rsi_dst_klass, lh_offset);
1905
1906      // Before looking at dst.length, make sure dst is also an objArray.
1907      __ movptr(rsi_dst_klass, dst_klass_addr);
1908      __ cmpl(dst_klass_lh_addr, objArray_lh);
1909      __ jccb(Assembler::notEqual, L_failed);
1910
1911      // It is safe to examine both src.length and dst.length.
1912      __ movl2ptr(src_pos, SRC_POS);        // reload rsi
1913      arraycopy_range_checks(src, src_pos, dst, dst_pos, LENGTH, L_failed);
1914      // (Now src_pos and dst_pos are killed, but not src and dst.)
1915
1916      // We'll need this temp (don't forget to pop it after the type check).
1917      __ push(rbx);
1918      Register rbx_src_klass = rbx;
1919
1920      __ mov(rbx_src_klass, rcx_src_klass); // spill away from rcx
          // rsi was reused as src_pos by the range checks, so fetch dst klass again.
1921      __ movptr(rsi_dst_klass, dst_klass_addr);
1922      Address super_check_offset_addr(rsi_dst_klass, sco_offset);
1923      Label L_fail_array_check;
1924      generate_type_check(rbx_src_klass,
1925                          super_check_offset_addr, dst_klass_addr,
1926                          rdi_temp, NULL, &L_fail_array_check);
1927      // (On fall-through, we have passed the array type check.)
1928      __ pop(rbx);
1929      __ jmp(L_plain_copy);
1930
1931      __ BIND(L_fail_array_check);
1932      // Reshuffle arguments so we can call checkcast_arraycopy:
1933
1934      // match initial saves for checkcast_arraycopy
1935      // push(rsi);    // already done; see above
1936      // push(rdi);    // already done; see above
1937      // push(rbx);    // already done; see above
1938
1939      // Marshal outgoing arguments now, freeing registers.
          // rbx is still on the stack here, hence the 16+ bias (12 + 4).
          // The outgoing slots reuse the incoming ones: to_arg == SRC_POS_arg,
          // ckoff_arg == DST_POS_arg and ckval_arg == LENGTH_arg.  Every incoming
          // value is therefore loaded into a register before its slot is
          // overwritten below.
1940      Address   from_arg(rsp, 16+ 4);   // from
1941      Address     to_arg(rsp, 16+ 8);   // to
1942      Address length_arg(rsp, 16+12);   // elements count
1943      Address  ckoff_arg(rsp, 16+16);   // super_check_offset
1944      Address  ckval_arg(rsp, 16+20);   // super_klass
1945
1946      Address SRC_POS_arg(rsp, 16+ 8);
1947      Address DST_POS_arg(rsp, 16+16);
1948      Address  LENGTH_arg(rsp, 16+20);
1949      // push rbx, changed the incoming offsets (why not just use rbp,??)
1950      // assert(SRC_POS_arg.disp() == SRC_POS.disp() + 4, "");
1951
1952      __ movptr(rbx, Address(rsi_dst_klass, ek_offset));
1953      __ movl2ptr(length, LENGTH_arg);    // reload elements count
1954      __ movl2ptr(src_pos, SRC_POS_arg);  // reload src_pos
1955      __ movl2ptr(dst_pos, DST_POS_arg);  // reload dst_pos
1956
1957      __ movptr(ckval_arg, rbx);          // destination element type
1958      __ movl(rbx, Address(rbx, sco_offset));
1959      __ movl(ckoff_arg, rbx);          // corresponding class check offset
1960
1961      __ movl(length_arg, length);      // outgoing length argument
1962
1963      __ lea(from, Address(src, src_pos, Address::times_ptr,
1964                            arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
1965      __ movptr(from_arg, from);
1966
1967      __ lea(to, Address(dst, dst_pos, Address::times_ptr,
1968                          arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
1969      __ movptr(to_arg, to);
1970      __ jump(RuntimeAddress(entry_checkcast_arraycopy));
1971    }
1972
1973    return start;
1974  }
1975
1976  void generate_arraycopy_stubs() {
        // Generates all arraycopy stubs and publishes their entry points in
        // StubRoutines.
        //
        // 'entry' is handed by address to each disjoint generator and by value
        // to the conjoint generator that follows it -- presumably the disjoint
        // stub's entry point is what the conjoint stub uses for non-overlapping
        // copies (the generators are defined outside this chunk; confirm there).
        // The entry_*_arraycopy addresses filled in by the conjoint/checkcast
        // generators are consumed by generate_unsafe_copy and
        // generate_generic_copy at the end, so those two must be generated last.
        //
        // NOTE(review): the bool passed as second argument to
        // generate_disjoint_copy/generate_conjoint_copy looks like an "aligned"
        // flag (true for the arrayof_* variants) -- confirm against the
        // generators.
1977    address entry;
1978    address entry_jbyte_arraycopy;
1979    address entry_jshort_arraycopy;
1980    address entry_jint_arraycopy;
1981    address entry_oop_arraycopy;
1982    address entry_jlong_arraycopy;
1983    address entry_checkcast_arraycopy;
1984
        // byte copies
1985    StubRoutines::_arrayof_jbyte_disjoint_arraycopy =
1986        generate_disjoint_copy(T_BYTE,  true, Address::times_1, &entry,
1987                               "arrayof_jbyte_disjoint_arraycopy");
1988    StubRoutines::_arrayof_jbyte_arraycopy =
1989        generate_conjoint_copy(T_BYTE,  true, Address::times_1,  entry,
1990                               NULL, "arrayof_jbyte_arraycopy");
1991    StubRoutines::_jbyte_disjoint_arraycopy =
1992        generate_disjoint_copy(T_BYTE, false, Address::times_1, &entry,
1993                               "jbyte_disjoint_arraycopy");
1994    StubRoutines::_jbyte_arraycopy =
1995        generate_conjoint_copy(T_BYTE, false, Address::times_1,  entry,
1996                               &entry_jbyte_arraycopy, "jbyte_arraycopy");
1997
        // short copies
1998    StubRoutines::_arrayof_jshort_disjoint_arraycopy =
1999        generate_disjoint_copy(T_SHORT,  true, Address::times_2, &entry,
2000                               "arrayof_jshort_disjoint_arraycopy");
2001    StubRoutines::_arrayof_jshort_arraycopy =
2002        generate_conjoint_copy(T_SHORT,  true, Address::times_2,  entry,
2003                               NULL, "arrayof_jshort_arraycopy");
2004    StubRoutines::_jshort_disjoint_arraycopy =
2005        generate_disjoint_copy(T_SHORT, false, Address::times_2, &entry,
2006                               "jshort_disjoint_arraycopy");
2007    StubRoutines::_jshort_arraycopy =
2008        generate_conjoint_copy(T_SHORT, false, Address::times_2,  entry,
2009                               &entry_jshort_arraycopy, "jshort_arraycopy");
2010
2011    // Next arrays are always aligned on 4 bytes at least.
2012    StubRoutines::_jint_disjoint_arraycopy =
2013        generate_disjoint_copy(T_INT, true, Address::times_4, &entry,
2014                               "jint_disjoint_arraycopy");
2015    StubRoutines::_jint_arraycopy =
2016        generate_conjoint_copy(T_INT, true, Address::times_4,  entry,
2017                               &entry_jint_arraycopy, "jint_arraycopy");
2018
2019    StubRoutines::_oop_disjoint_arraycopy =
2020        generate_disjoint_copy(T_OBJECT, true, Address::times_ptr, &entry,
2021                               "oop_disjoint_arraycopy");
2022    StubRoutines::_oop_arraycopy =
2023        generate_conjoint_copy(T_OBJECT, true, Address::times_ptr,  entry,
2024                               &entry_oop_arraycopy, "oop_arraycopy");
2025
2026    StubRoutines::_jlong_disjoint_arraycopy =
2027        generate_disjoint_long_copy(&entry, "jlong_disjoint_arraycopy");
2028    StubRoutines::_jlong_arraycopy =
2029        generate_conjoint_long_copy(entry, &entry_jlong_arraycopy,
2030                                    "jlong_arraycopy");
2031
        // For int, oop and long there is no separate arrayof_* code; the
        // arrayof_* entry points simply alias the stubs generated above.
2032    StubRoutines::_arrayof_jint_disjoint_arraycopy  =
2033        StubRoutines::_jint_disjoint_arraycopy;
2034    StubRoutines::_arrayof_oop_disjoint_arraycopy   =
2035        StubRoutines::_oop_disjoint_arraycopy;
2036    StubRoutines::_arrayof_jlong_disjoint_arraycopy =
2037        StubRoutines::_jlong_disjoint_arraycopy;
2038
2039    StubRoutines::_arrayof_jint_arraycopy  = StubRoutines::_jint_arraycopy;
2040    StubRoutines::_arrayof_oop_arraycopy   = StubRoutines::_oop_arraycopy;
2041    StubRoutines::_arrayof_jlong_arraycopy = StubRoutines::_jlong_arraycopy;
2042
2043    StubRoutines::_checkcast_arraycopy =
2044        generate_checkcast_copy("checkcast_arraycopy",
2045                                  &entry_checkcast_arraycopy);
2046
        // The dispatching stubs come last: they jump to the entry points
        // collected above.
2047    StubRoutines::_unsafe_arraycopy =
2048        generate_unsafe_copy("unsafe_arraycopy",
2049                               entry_jbyte_arraycopy,
2050                               entry_jshort_arraycopy,
2051                               entry_jint_arraycopy,
2052                               entry_jlong_arraycopy);
2053
2054    StubRoutines::_generic_arraycopy =
2055        generate_generic_copy("generic_arraycopy",
2056                               entry_jbyte_arraycopy,
2057                               entry_jshort_arraycopy,
2058                               entry_jint_arraycopy,
2059                               entry_oop_arraycopy,
2060                               entry_jlong_arraycopy,
2061                               entry_checkcast_arraycopy);
2062  }
2063
2064 public:
2065  // Information about frame layout at time of blocking runtime call.
2066  // Note that we only have to preserve callee-saved registers since
2067  // the compilers are responsible for supplying a continuation point
2068  // if they expect all registers to be preserved.
      //
      // The enumerators are word indices relative to rsp (they are multiplied
      // by wordSize where used in generate_throw_exception); framesize is the
      // total number of words in the frame.
2069  enum layout {
2070    thread_off,    // last_java_sp; this slot receives the thread pointer (first C argument)
2071    rbp_off,       // callee saved register
2072    ret_pc,        // return address slot
2073    framesize      // number of words, not a slot
2074  };
2075
2076 private:
2077
2078#undef  __
2079#define __ masm->
2080
2081  //------------------------------------------------------------------------------------------------------------------------
2082  // Continuation point for throwing of implicit exceptions that are not handled in
2083  // the current activation. Fabricates an exception oop and initiates normal
2084  // exception dispatching in this frame.
2085  //
2086  // Previously the compiler (c2) allowed for callee save registers on Java calls.
2087  // This is no longer true after adapter frames were removed but could possibly
2088  // be brought back in the future if the interpreter code was reworked and it
2089  // was deemed worthwhile. The comment below was left to describe what must
2090  // happen here if callee saves were resurrected. As it stands now this stub
2091  // could actually be a vanilla BufferBlob and have no oopMap at all.
2092  // Since it doesn't make much difference we've chosen to leave it the
2093  // way it was in the callee save days and keep the comment.
2094
2095  // If we need to preserve callee-saved values we need a callee-saved oop map and
2096  // therefore have to make these stubs into RuntimeStubs rather than BufferBlobs.
2097  // If the compiler needs all registers to be preserved between the fault
2098  // point and the exception handler then it must assume responsibility for that in
2099  // AbstractCompiler::continuation_for_implicit_null_exception or
2100  // continuation_for_implicit_division_by_zero_exception. All other implicit
2101  // exceptions (e.g., NullPointerException or AbstractMethodError on entry) are
2102  // either at call sites or otherwise assume that stack unwinding will be initiated,
2103  // so caller saved registers were assumed volatile in the compiler.
      //
      // Parameters:
      //   name                        - name given to the CodeBuffer and RuntimeStub
      //   runtime_entry               - address of the C function the stub calls; it
      //                                 receives the current thread as its argument
      //   restore_saved_exception_pc  - if true, the thread's saved_exception_pc is
      //                                 pushed first so that it serves as this
      //                                 frame's return pc
      //
      // Returns the entry point of the newly created RuntimeStub.
2104  address generate_throw_exception(const char* name, address runtime_entry,
2105                                   bool restore_saved_exception_pc) {
2106
        // Sizes for the private CodeBuffer this stub is assembled into.
2107    int insts_size = 256;
2108    int locs_size  = 32;
2109
2110    CodeBuffer code(name, insts_size, locs_size);
2111    OopMapSet* oop_maps  = new OopMapSet();
        // Must be called 'masm': the '__' macro expands to 'masm->' from here on.
2112    MacroAssembler* masm = new MacroAssembler(&code);
2113
2114    address start = __ pc();
2115
2116    // This is an inlined and slightly modified version of call_VM
2117    // which has the ability to fetch the return PC out of
2118    // thread-local storage and also sets up last_Java_sp slightly
2119    // differently than the real call_VM
2120    Register java_thread = rbx;
2121    __ get_thread(java_thread);
2122    if (restore_saved_exception_pc) {
2123      __ movptr(rax, Address(java_thread, in_bytes(JavaThread::saved_exception_pc_offset())));
2124      __ push(rax);
2125    }
2126
2127    __ enter(); // required for proper stackwalking of RuntimeStub frame
2128
2129    // pc and rbp, already pushed
2130    __ subptr(rsp, (framesize-2) * wordSize); // prolog
2131
2132    // Frame is now completed as far as size and linkage.
2133
        // Code offset at which the frame is complete; passed to new_runtime_stub.
2134    int frame_complete = __ pc() - start;
2135
2136    // push java thread (becomes first argument of C function)
2137    __ movptr(Address(rsp, thread_off * wordSize), java_thread);
2138
2139    // Set up last_Java_sp and last_Java_fp
2140    __ set_last_Java_frame(java_thread, rsp, rbp, NULL);
2141
2142    // Call runtime
2143    BLOCK_COMMENT("call runtime_entry");
2144    __ call(RuntimeAddress(runtime_entry));
2145    // Generate oop map
        // (registered at the offset of the return address of the call above)
2146    OopMap* map =  new OopMap(framesize, 0);
2147    oop_maps->add_gc_map(__ pc() - start, map);
2148
2149    // restore the thread (cannot use the pushed argument since arguments
2150    // may be overwritten by C code generated by an optimizing compiler);
2151    // however can use the register value directly if it is callee saved.
2152    __ get_thread(java_thread);
2153
2154    __ reset_last_Java_frame(java_thread, true, false);
2155
2156    __ leave(); // required for proper stackwalking of RuntimeStub frame
2157
2158    // check for pending exceptions
        // (debug builds only: the runtime call is expected to have left a
        // pending exception on the thread; stop if it did not)
2159#ifdef ASSERT
2160    Label L;
2161    __ cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
2162    __ jcc(Assembler::notEqual, L);
2163    __ should_not_reach_here();
2164    __ bind(L);
2165#endif /* ASSERT */
2166    __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2167
2168
2169    RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, framesize, oop_maps, false);
2170    return stub->entry_point();
2171  }
2172
2173
2174  void create_control_words() {
        // Initializes the x87 control-word / MXCSR constants and the two 80-bit
        // subnormal bias constants kept in StubRoutines.
2175    // Round to nearest, 53-bit mode, exceptions masked
2176    StubRoutines::_fpu_cntrl_wrd_std   = 0x027F;
2177    // Round to zero, 53-bit mode, exceptions masked
        // NOTE(review): 0x0D7F has precision-control bits 9:8 == 01b, which the
        // Intel SDM lists as reserved; 53-bit precision is 10b (as in 0x027F
        // above).  Confirm the intended encoding before relying on "53-bit".
2178    StubRoutines::_fpu_cntrl_wrd_trunc = 0x0D7F;
2179    // Round to nearest, 24-bit mode, exceptions masked
2180    StubRoutines::_fpu_cntrl_wrd_24    = 0x007F;
2181    // Round to nearest, 64-bit mode, exceptions masked
2182    StubRoutines::_fpu_cntrl_wrd_64    = 0x037F;
2183    // MXCSR: round to nearest, all SSE exceptions masked (no precision-control field)
2184    StubRoutines::_mxcsr_std           = 0x1F80;
2185    // Note: the following two constants are 80-bit values
2186    //       layout is critical for correct loading by FPU.
        //       [0] = low 32 significand bits, [1] = high 32 significand bits,
        //       [2] = sign/exponent word.
2187    // Bias for strict fp multiply/divide
2188    StubRoutines::_fpu_subnormal_bias1[0]= 0x00000000; // 2^(-15360) == 0x03ff 8000 0000 0000 0000
2189    StubRoutines::_fpu_subnormal_bias1[1]= 0x80000000;
2190    StubRoutines::_fpu_subnormal_bias1[2]= 0x03ff;
2191    // Un-Bias for strict fp multiply/divide
2192    StubRoutines::_fpu_subnormal_bias2[0]= 0x00000000; // 2^(+15360) == 0x7bff 8000 0000 0000 0000
2193    StubRoutines::_fpu_subnormal_bias2[1]= 0x80000000;
2194    StubRoutines::_fpu_subnormal_bias2[2]= 0x7bff;
2195  }
2196
2197  //---------------------------------------------------------------------------
2198  // Initialization
2199
2200  void generate_initial() {
2201    // Generates all stubs and initializes the entry points
        // (This is the set generated when the StubGenerator constructor is
        // called with all == false.)
2202
2203    //------------------------------------------------------------------------------------------------------------------------
2204    // entry points that exist in all platforms
2205    // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller than
2206    //       the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp.
2207    StubRoutines::_forward_exception_entry      = generate_forward_exception();
2208
        // generate_call_stub also fills in _call_stub_return_address (passed by reference).
2209    StubRoutines::_call_stub_entry              =
2210      generate_call_stub(StubRoutines::_call_stub_return_address);
2211    // is referenced by megamorphic call
2212    StubRoutines::_catch_exception_entry        = generate_catch_exception();
2213
2214    // These are currently used by Solaris/Intel
2215    StubRoutines::_atomic_xchg_entry            = generate_atomic_xchg();
2216
2217    StubRoutines::_handler_for_unsafe_access_entry =
2218      generate_handler_for_unsafe_access();
2219
2220    // platform dependent
2221    create_control_words();
2222
2223    StubRoutines::x86::_verify_mxcsr_entry                 = generate_verify_mxcsr();
2224    StubRoutines::x86::_verify_fpu_cntrl_wrd_entry         = generate_verify_fpu_cntrl_wrd();
        // Both double->int and double->long wrappers come from the same
        // generator, parameterized by result type and runtime fallback function.
2225    StubRoutines::_d2i_wrapper                              = generate_d2i_wrapper(T_INT,
2226                                                                                   CAST_FROM_FN_PTR(address, SharedRuntime::d2i));
2227    StubRoutines::_d2l_wrapper                              = generate_d2i_wrapper(T_LONG,
2228                                                                                   CAST_FROM_FN_PTR(address, SharedRuntime::d2l));
2229  }
2230
2231
2232  void generate_all() {
2233    // Generates all stubs and initializes the entry points
        // (This is the set generated when the StubGenerator constructor is
        // called with all == true.)
2234
2235    // These entry points require SharedInfo::stack0 to be set up in non-core builds
2236    // and need to be relocatable, so they each fabricate a RuntimeStub internally.
        // The trailing bool is restore_saved_exception_pc: when true the stub
        // first pushes the thread's saved exception pc.  generate_throw_exception
        // reads StubRoutines::forward_exception_entry() while generating.
2237    StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError),  false);
2238    StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError),  false);
2239    StubRoutines::_throw_ArithmeticException_entry         = generate_throw_exception("ArithmeticException throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_ArithmeticException),  true);
2240    StubRoutines::_throw_NullPointerException_entry        = generate_throw_exception("NullPointerException throw_exception",         CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException), true);
2241    StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);
2242    StubRoutines::_throw_StackOverflowError_entry          = generate_throw_exception("StackOverflowError throw_exception",           CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError),   false);
2243
2244    //------------------------------------------------------------------------------------------------------------------------
2245    // entry points that are platform specific
2246
2247    // support for verify_oop (must happen after universe_init)
2248    StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
2249
2250    // arraycopy stubs used by compilers
2251    generate_arraycopy_stubs();
2252  }
2253
2254
2255 public:
2256  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
2257    if (all) {
2258      generate_all();
2259    } else {
2260      generate_initial();
2261    }
2262  }
2263}; // end class declaration
2264
2265
2266void StubGenerator_generate(CodeBuffer* code, bool all) {
2267  StubGenerator g(code, all);
2268}
2269