/*
 * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "assembler_sparc.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_sparc.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/top.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp.

#define __ _masm->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Note:  The register L7 is used as L7_thread_cache, and may not be used
//        any other way within this module.


static const Register& Lstub_temp = L2;

// -------------------------------------------------------------------------------------------------------------------------
// Stub Code definitions

static address handle_unsafe_access() {
  JavaThread* thread = JavaThread::current();
  address pc  = thread->saved_exception_pc();
  address npc = thread->saved_exception_npc();
  // pc is the instruction which we must emulate
  // doing a no-op is fine:  return garbage from the load
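  // (On SPARC the trap state carries both %pc and %npc; returning the saved npc,
  //  rather than pc + 4, resumes at the correct next instruction even when the
  //  faulting access sat in a branch delay slot.)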

  // request an async exception
  thread->set_pending_unsafe_access_error();

  // return address of next instruction to execute
  return npc;
}

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(a,b,c) (0)
#else
#define inc_counter_np(counter, t1, t2) \
  BLOCK_COMMENT("inc_counter " #counter); \
  __ inc_counter(&counter, t1, t2);
#endif

  //----------------------------------------------------------------------------------------------------
  // Call stubs are used to call Java from C

  address generate_call_stub(address& return_pc) {
    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    // Incoming arguments:
    //
    // o0         : call wrapper address
    // o1         : result (address)
    // o2         : result type
    // o3         : method
    // o4         : (interpreter) entry point
    // o5         : parameters (address)
    // [sp + 0x5c]: parameter size (in words)
    // [sp + 0x60]: thread
    //
    // +---------------+ <--- sp + 0
    // |               |
    // . reg save area .
    // |               |
    // +---------------+ <--- sp + 0x40
    // |               |
    // . extra 7 slots .
    // |               |
    // +---------------+ <--- sp + 0x5c
    // |  param. size  |
    // +---------------+ <--- sp + 0x60
    // |    thread     |
    // +---------------+
    // |               |

    // note: if the link argument position changes, adjust
    //       the code in frame::entry_frame_call_wrapper()

    const Argument link           = Argument(0, false); // used only for GC
    const Argument result         = Argument(1, false);
    const Argument result_type    = Argument(2, false);
    const Argument method         = Argument(3, false);
    const Argument entry_point    = Argument(4, false);
    const Argument parameters     = Argument(5, false);
    const Argument parameter_size = Argument(6, false);
    const Argument thread         = Argument(7, false);

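    // The generated stub is reached from C++ (see JavaCalls::call_helper and
    // StubRoutines::call_stub()) through a function pointer whose shape is roughly:
    //
    //   typedef void (*CallStub)(address   link,
    //                            intptr_t* result,
    //                            BasicType result_type,
    //                            Method*   method,
    //                            address   entry_point,
    //                            intptr_t* parameters,
    //                            int       size_of_parameters,
    //                            TRAPS);
    //
    // so the Argument indices above simply mirror that parameter order.
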
    // setup thread register
    __ ld_ptr(thread.as_address(), G2_thread);
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    { const Register t = G3_scratch;
      Label L;
      __ ld_ptr(G2_thread, in_bytes(Thread::pending_exception_offset()), t);
      __ br_null_short(t, Assembler::pt, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ bind(L);
    }
#endif

    // create activation frame & allocate space for parameters
    { const Register t = G3_scratch;
      __ ld_ptr(parameter_size.as_address(), t);                // get parameter size (in words)
      __ add(t, frame::memory_parameter_word_sp_offset, t);     // add space for save area (in words)
      __ round_to(t, WordsPerLong);                             // make sure it is multiple of 2 (in words)
      __ sll(t, Interpreter::logStackElementSize, t);           // compute number of bytes
      __ neg(t);                                                // negate so it can be used with save
      __ save(SP, t, SP);                                       // setup new frame
    }

    // +---------------+ <--- sp + 0
    // |               |
    // . reg save area .
    // |               |
    // +---------------+ <--- sp + 0x40
    // |               |
    // . extra 7 slots .
    // |               |
    // +---------------+ <--- sp + 0x5c
    // |  empty slot   |      (only if parameter size is even)
    // +---------------+
    // |               |
    // .  parameters   .
    // |               |
    // +---------------+ <--- fp + 0
    // |               |
    // . reg save area .
    // |               |
    // +---------------+ <--- fp + 0x40
    // |               |
    // . extra 7 slots .
    // |               |
    // +---------------+ <--- fp + 0x5c
    // |  param. size  |
    // +---------------+ <--- fp + 0x60
    // |    thread     |
    // +---------------+
    // |               |

    // pass parameters if any
    BLOCK_COMMENT("pass parameters if any");
    { const Register src = parameters.as_in().as_register();
      const Register dst = Lentry_args;
      const Register tmp = G3_scratch;
      const Register cnt = G4_scratch;

      // test if any parameters & setup of Lentry_args
      Label exit;
      __ ld_ptr(parameter_size.as_in().as_address(), cnt);      // parameter counter
      __ add( FP, STACK_BIAS, dst );
      __ cmp_zero_and_br(Assembler::zero, cnt, exit);
      __ delayed()->sub(dst, BytesPerWord, dst);                 // setup Lentry_args

      // copy parameters if any
      Label loop;
      __ BIND(loop);
      // Store parameter value
      __ ld_ptr(src, 0, tmp);
      __ add(src, BytesPerWord, src);
      __ st_ptr(tmp, dst, 0);
      __ deccc(cnt);
      __ br(Assembler::greater, false, Assembler::pt, loop);
      __ delayed()->sub(dst, Interpreter::stackElementSize, dst);

      // done
      __ BIND(exit);
    }

    // setup parameters, method & call Java function
#ifdef ASSERT
    // layout_activation_impl checks its notion of saved SP against
    // this register, so if this changes update it as well.
    const Register saved_SP = Lscratch;
    __ mov(SP, saved_SP);                               // keep track of SP before call
#endif

    // setup parameters
    const Register t = G3_scratch;
    __ ld_ptr(parameter_size.as_in().as_address(), t); // get parameter size (in words)
    __ sll(t, Interpreter::logStackElementSize, t);    // compute number of bytes
    __ sub(FP, t, Gargs);                              // setup parameter pointer
#ifdef _LP64
    __ add( Gargs, STACK_BIAS, Gargs );                // Account for LP64 stack bias
#endif
    __ mov(SP, O5_savedSP);


    // do the call
    //
    // the following registers must be set up:
    //
    // G2_thread
    // G5_method
    // Gargs
    BLOCK_COMMENT("call Java function");
    __ jmpl(entry_point.as_in().as_register(), G0, O7);
    __ delayed()->mov(method.as_in().as_register(), G5_method);   // setup method

    BLOCK_COMMENT("call_stub_return_address:");
    return_pc = __ pc();

    // The callee, if it wasn't interpreted, can return with SP changed so
    // we can no longer assert on the change of SP.

    // store result depending on type
    // (everything that is not T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE
    //  is treated as T_INT)
    { const Register addr = result     .as_in().as_register();
      const Register type = result_type.as_in().as_register();
      Label is_long, is_float, is_double, is_object, exit;
      __            cmp(type, T_OBJECT);  __ br(Assembler::equal, false, Assembler::pn, is_object);
      __ delayed()->cmp(type, T_FLOAT);   __ br(Assembler::equal, false, Assembler::pn, is_float);
      __ delayed()->cmp(type, T_DOUBLE);  __ br(Assembler::equal, false, Assembler::pn, is_double);
      __ delayed()->cmp(type, T_LONG);    __ br(Assembler::equal, false, Assembler::pn, is_long);
      __ delayed()->nop();

      // store int result
      __ st(O0, addr, G0);

      __ BIND(exit);
      __ ret();
      __ delayed()->restore();

      __ BIND(is_object);
      __ ba(exit);
      __ delayed()->st_ptr(O0, addr, G0);

      __ BIND(is_float);
      __ ba(exit);
      __ delayed()->stf(FloatRegisterImpl::S, F0, addr, G0);

      __ BIND(is_double);
      __ ba(exit);
      __ delayed()->stf(FloatRegisterImpl::D, F0, addr, G0);

      __ BIND(is_long);
#ifdef _LP64
      __ ba(exit);
      __ delayed()->st_long(O0, addr, G0);      // store entire long
#else
#if defined(COMPILER2)
  // All return values are where we want them, except for Longs.  C2 returns
  // longs in G1 in the 32-bit build whereas the interpreter wants them in O0/O1.
  // Since the interpreter will return longs in both G1 and O0/O1 in the 32-bit
  // build we simply always use G1.
  // Note: I tried to make c2 return longs in O0/O1 and G1 so we wouldn't have to
  // do this here. Unfortunately if we did a rethrow we'd see a MachEpilog node
  // first which would move g1 -> O0/O1 and destroy the exception we were throwing.

      __ ba(exit);
      __ delayed()->stx(G1, addr, G0);  // store entire long
#else
      __ st(O1, addr, BytesPerInt);
      __ ba(exit);
      __ delayed()->st(O0, addr, G0);
#endif /* COMPILER2 */
#endif /* _LP64 */
     }
     return start;
  }


  //----------------------------------------------------------------------------------------------------
  // Return point for a Java call if there's an exception thrown in Java code.
  // The exception is caught and transformed into a pending exception stored in
  // JavaThread that can be tested from within the VM.
  //
  // Oexception: exception oop

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");

    address start = __ pc();
    // verify that thread corresponds
    __ verify_thread();

    const Register& temp_reg = Gtemp;
    Address pending_exception_addr    (G2_thread, Thread::pending_exception_offset());
    Address exception_file_offset_addr(G2_thread, Thread::exception_file_offset   ());
    Address exception_line_offset_addr(G2_thread, Thread::exception_line_offset   ());

    // set pending exception
    __ verify_oop(Oexception);
    __ st_ptr(Oexception, pending_exception_addr);
    __ set((intptr_t)__FILE__, temp_reg);
    __ st_ptr(temp_reg, exception_file_offset_addr);
    __ set((intptr_t)__LINE__, temp_reg);
    __ st(temp_reg, exception_line_offset_addr);

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");

    AddressLiteral stub_ret(StubRoutines::_call_stub_return_address);
    __ jump_to(stub_ret, temp_reg);
    __ delayed()->nop();

    return start;
  }


  //----------------------------------------------------------------------------------------------------
  // Continuation point for runtime calls returning with a pending exception
  // The pending exception check happened in the runtime or native call stub
  // The pending exception in Thread is converted into a Java-level exception
  //
  // Contract with Java-level exception handler: O0 = exception
  //                                             O1 = throwing pc

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward_exception");
    address start = __ pc();

    // Upon entry, O7 has the return address returning into Java
    // (interpreted or compiled) code; i.e. the return address
    // becomes the throwing pc.

    const Register& handler_reg = Gtemp;

    Address exception_addr(G2_thread, Thread::pending_exception_offset());

#ifdef ASSERT
    // make sure that this code is only executed if there is a pending exception
    { Label L;
      __ ld_ptr(exception_addr, Gtemp);
      __ br_notnull_short(Gtemp, Assembler::pt, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into handler_reg
    __ get_thread();
    __ ld_ptr(exception_addr, Oexception);
    __ verify_oop(Oexception);
    __ save_frame(0);             // compensates for compiler weakness
    __ add(O7->after_save(), frame::pc_return_offset, Lscratch); // save the issuing PC
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(L7_thread_cache, CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), G2_thread, Lscratch);
    __ mov(O0, handler_reg);
    __ restore();                 // compensates for compiler weakness

    __ ld_ptr(exception_addr, Oexception);
    __ add(O7, frame::pc_return_offset, Oissuing_pc); // save the issuing PC

#ifdef ASSERT
    // make sure exception is set
    { Label L;
      __ br_notnull_short(Oexception, Assembler::pt, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif
    // jump to exception handler
    __ jmp(handler_reg, 0);
    // clear pending exception
    __ delayed()->st_ptr(G0, exception_addr);

    return start;
  }


  //------------------------------------------------------------------------------------------------------------------------
  // Continuation point for throwing of implicit exceptions that are not handled in
  // the current activation. Fabricates an exception oop and initiates normal
  // exception dispatching in this frame. Only callee-saved registers are preserved
  // (through the normal register window / RegisterMap handling).
  // If the compiler needs all registers to be preserved between the fault
  // point and the exception handler then it must assume responsibility for that in
  // AbstractCompiler::continuation_for_implicit_null_exception or
  // continuation_for_implicit_division_by_zero_exception. All other implicit
  // exceptions (e.g., NullPointerException or AbstractMethodError on entry) are
  // either at call sites or otherwise assume that stack unwinding will be initiated,
  // so caller saved registers were assumed volatile in the compiler.

  // Note that we generate only this stub into a RuntimeStub, because it needs to be
  // properly traversed and ignored during GC, so we change the meaning of the "__"
  // macro within this method.
#undef __
#define __ masm->

  address generate_throw_exception(const char* name, address runtime_entry,
                                   Register arg1 = noreg, Register arg2 = noreg) {
#ifdef ASSERT
    int insts_size = VerifyThread ? 1 * K : 600;
#else
    int insts_size = VerifyThread ? 1 * K : 256;
#endif /* ASSERT */
    int locs_size  = 32;

    CodeBuffer      code(name, insts_size, locs_size);
    MacroAssembler* masm = new MacroAssembler(&code);

    __ verify_thread();

    // This is an inlined and slightly modified version of call_VM
    // which has the ability to fetch the return PC out of thread-local storage
    __ assert_not_delayed();

    // Note that we always push a frame because on the SPARC
    // architecture, for all of our implicit exception kinds at call
    // sites, the implicit exception is taken before the callee frame
    // is pushed.
    __ save_frame(0);

    int frame_complete = __ offset();

    // Note that we always have a runtime stub frame on the top of stack by this point
    Register last_java_sp = SP;
    // 64-bit last_java_sp is biased!
    __ set_last_Java_frame(last_java_sp, G0);
    if (VerifyThread)  __ mov(G2_thread, O0); // about to be smashed; pass early
    __ save_thread(noreg);
    if (arg1 != noreg) {
      assert(arg2 != O1, "clobbered");
      __ mov(arg1, O1);
    }
    if (arg2 != noreg) {
      __ mov(arg2, O2);
    }
    // do the call
    BLOCK_COMMENT("call runtime_entry");
    __ call(runtime_entry, relocInfo::runtime_call_type);
    if (!VerifyThread)
      __ delayed()->mov(G2_thread, O0);  // pass thread as first argument
    else
      __ delayed()->nop();             // (thread already passed)
    __ restore_thread(noreg);
    __ reset_last_Java_frame();

    // check for pending exceptions. use Gtemp as scratch register.
#ifdef ASSERT
    Label L;

    Address exception_addr(G2_thread, Thread::pending_exception_offset());
    Register scratch_reg = Gtemp;
    __ ld_ptr(exception_addr, scratch_reg);
    __ br_notnull_short(scratch_reg, Assembler::pt, L);
    __ should_not_reach_here();
    __ bind(L);
#endif // ASSERT
    BLOCK_COMMENT("call forward_exception_entry");
    __ call(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
    // we use O7 linkage so that forward_exception_entry has the issuing PC
    __ delayed()->restore();

    RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, masm->total_frame_size_in_bytes(0), NULL, false);
    return stub->entry_point();
  }

#undef __
#define __ _masm->


  // Generate a routine that sets all the registers so we
  // can tell if the stop routine prints them correctly.
  address generate_test_stop() {
    StubCodeMark mark(this, "StubRoutines", "test_stop");
    address start = __ pc();

    int i;

    __ save_frame(0);

    static jfloat zero = 0.0, one = 1.0;

    // put addr in L0, then load through L0 to F0
    __ set((intptr_t)&zero, L0);  __ ldf( FloatRegisterImpl::S, L0, 0, F0);
    __ set((intptr_t)&one,  L0);  __ ldf( FloatRegisterImpl::S, L0, 0, F1); // 1.0 to F1

    // use add to put 2..18 in F2..F18
    for ( i = 2;  i <= 18;  ++i ) {
      __ fadd( FloatRegisterImpl::S, F1, as_FloatRegister(i-1),  as_FloatRegister(i));
    }

    // Now put double 2 in F16, double 18 in F18
    __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F2, F16 );
    __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F18, F18 );

    // use add to put 20..32 in F20..F32
    for (i = 20; i < 32; i += 2) {
      __ fadd( FloatRegisterImpl::D, F16, as_FloatRegister(i-2),  as_FloatRegister(i));
    }

    // put 0..7 in i's, 8..15 in l's, 16..23 in o's, 24..31 in g's
    for ( i = 0; i < 8; ++i ) {
      if (i < 6) {
        __ set(     i, as_iRegister(i));
        __ set(16 + i, as_oRegister(i));
        __ set(24 + i, as_gRegister(i));
      }
      __ set( 8 + i, as_lRegister(i));
    }

    __ stop("testing stop");


    __ ret();
    __ delayed()->restore();

    return start;
  }


  address generate_stop_subroutine() {
    StubCodeMark mark(this, "StubRoutines", "stop_subroutine");
    address start = __ pc();

    __ stop_subroutine();

    return start;
  }

  address generate_flush_callers_register_windows() {
    StubCodeMark mark(this, "StubRoutines", "flush_callers_register_windows");
    address start = __ pc();

    __ flush_windows();
    __ retl(false);
    __ delayed()->add( FP, STACK_BIAS, O0 );
    // The returned value must be a stack pointer whose register save area
    // is flushed, and will stay flushed while the caller executes.

    return start;
  }

  // Helper functions for v8 atomic operations.
  //
  void get_v8_oop_lock_ptr(Register lock_ptr_reg, Register mark_oop_reg, Register scratch_reg) {
    if (mark_oop_reg == noreg) {
      address lock_ptr = (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr();
      __ set((intptr_t)lock_ptr, lock_ptr_reg);
    } else {
      assert(scratch_reg != noreg, "just checking");
      address lock_ptr = (address)StubRoutines::Sparc::_v8_oop_lock_cache;
      __ set((intptr_t)lock_ptr, lock_ptr_reg);
      __ and3(mark_oop_reg, StubRoutines::Sparc::v8_oop_lock_mask_in_place, scratch_reg);
      __ add(lock_ptr_reg, scratch_reg, lock_ptr_reg);
    }
  }
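  // (When a mark word is supplied, its low bits selected by v8_oop_lock_mask_in_place
  //  index into the _v8_oop_lock_cache array, so unrelated objects usually hash to
  //  different lock words instead of all contending on one global lock.)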

  void generate_v8_lock_prologue(Register lock_reg, Register lock_ptr_reg, Register yield_reg, Label& retry, Label& dontyield, Register mark_oop_reg = noreg, Register scratch_reg = noreg) {

    get_v8_oop_lock_ptr(lock_ptr_reg, mark_oop_reg, scratch_reg);
    __ set(StubRoutines::Sparc::locked, lock_reg);
    // Initialize yield counter
    __ mov(G0,yield_reg);

    __ BIND(retry);
    __ cmp_and_br_short(yield_reg, V8AtomicOperationUnderLockSpinCount, Assembler::less, Assembler::pt, dontyield);

    // This code can only be called from inside the VM, this
    // stub is only invoked from Atomic::add().  We do not
    // want to use call_VM, because _last_java_sp and such
    // must already be set.
    //
    // Save the regs and make space for a C call
    __ save(SP, -96, SP);
    __ save_all_globals_into_locals();
    BLOCK_COMMENT("call os::naked_sleep");
    __ call(CAST_FROM_FN_PTR(address, os::naked_sleep));
    __ delayed()->nop();
    __ restore_globals_from_locals();
    __ restore();
    // reset the counter
    __ mov(G0,yield_reg);

    __ BIND(dontyield);

    // try to get lock
    __ swap(lock_ptr_reg, 0, lock_reg);

    // did we get the lock?
    __ cmp(lock_reg, StubRoutines::Sparc::unlocked);
    __ br(Assembler::notEqual, true, Assembler::pn, retry);
    __ delayed()->add(yield_reg,1,yield_reg);

    // yes, got lock. do the operation here.
  }

  void generate_v8_lock_epilogue(Register lock_reg, Register lock_ptr_reg, Register yield_reg, Label& retry, Label& dontyield, Register mark_oop_reg = noreg, Register scratch_reg = noreg) {
    __ st(lock_reg, lock_ptr_reg, 0); // unlock
  }

  // Support for jint Atomic::xchg(jint exchange_value, volatile jint* dest).
  //
  // Arguments :
  //
  //      exchange_value: O0
  //      dest:           O1
  //
  // Results:
  //
  //     O0: the value previously stored in dest
  //
  address generate_atomic_xchg() {
    StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
    address start = __ pc();

    if (UseCASForSwap) {
      // Use CAS instead of swap, just in case the MP hardware
      // prefers to work with just one kind of synch. instruction.
      Label retry;
      __ BIND(retry);
      __ mov(O0, O3);       // scratch copy of exchange value
      __ ld(O1, 0, O2);     // observe the previous value
      // try to replace O2 with O3
      __ cas_under_lock(O1, O2, O3,
      (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr(),false);
      __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pn, retry);

      __ retl(false);
      __ delayed()->mov(O2, O0);  // report previous value to caller

    } else {
      if (VM_Version::v9_instructions_work()) {
        __ retl(false);
        __ delayed()->swap(O1, 0, O0);
      } else {
        const Register& lock_reg = O2;
        const Register& lock_ptr_reg = O3;
        const Register& yield_reg = O4;

        Label retry;
        Label dontyield;

        generate_v8_lock_prologue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
        // got the lock, do the swap
        __ swap(O1, 0, O0);

        generate_v8_lock_epilogue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
        __ retl(false);
        __ delayed()->nop();
      }
    }

    return start;
  }


  // Support for jint Atomic::cmpxchg(jint exchange_value, volatile jint* dest, jint compare_value)
  //
  // Arguments :
  //
  //      exchange_value: O0
  //      dest:           O1
  //      compare_value:  O2
  //
  // Results:
  //
  //     O0: the value previously stored in dest
  //
  // Overwrites (v8): O3,O4,O5
  //
  address generate_atomic_cmpxchg() {
    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
    address start = __ pc();

    // cmpxchg(dest, compare_value, exchange_value)
    __ cas_under_lock(O1, O2, O0,
      (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr(),false);
    __ retl(false);
    __ delayed()->nop();

    return start;
  }

  // Support for jlong Atomic::cmpxchg(jlong exchange_value, volatile jlong *dest, jlong compare_value)
  //
  // Arguments :
  //
  //      exchange_value: O1:O0
  //      dest:           O2
  //      compare_value:  O4:O3
  //
  // Results:
  //
  //     O1:O0: the value previously stored in dest
  //
  // This only works on V9, on V8 we don't generate any
  // code and just return NULL.
  //
  // Overwrites: G1,G2,G3
  //
  address generate_atomic_cmpxchg_long() {
    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
    address start = __ pc();

    if (!VM_Version::supports_cx8())
        return NULL;
    __ sllx(O0, 32, O0);
    __ srl(O1, 0, O1);
    __ or3(O0,O1,O0);      // O0 holds 64-bit value from exchange_value
    __ sllx(O3, 32, O3);
    __ srl(O4, 0, O4);
    __ or3(O3,O4,O3);     // O3 holds 64-bit value from compare_value
    __ casx(O2, O3, O0);
    __ srl(O0, 0, O1);    // unpacked return value in O1:O0
    __ retl(false);
    __ delayed()->srlx(O0, 32, O0);

    return start;
  }
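  // (Under the 32-bit ABI each jlong argument arrives as a register pair, so the code
  //  above first assembles exchange_value and compare_value into single 64-bit
  //  registers for casx, then splits the old memory value back into O1:O0 on return.)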


  // Support for jint Atomic::add(jint add_value, volatile jint* dest).
  //
  // Arguments :
  //
  //      add_value: O0   (e.g., +1 or -1)
  //      dest:      O1
  //
  // Results:
  //
  //     O0: the new value stored in dest
  //
  // Overwrites (v9): O3
  // Overwrites (v8): O3,O4,O5
  //
  address generate_atomic_add() {
    StubCodeMark mark(this, "StubRoutines", "atomic_add");
    address start = __ pc();
    __ BIND(_atomic_add_stub);

    if (VM_Version::v9_instructions_work()) {
      Label(retry);
      __ BIND(retry);

      __ lduw(O1, 0, O2);
      __ add(O0, O2, O3);
      __ cas(O1, O2, O3);
      __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pn, retry);
      __ retl(false);
      __ delayed()->add(O0, O2, O0); // note that cas made O2==O3
    } else {
      const Register& lock_reg = O2;
      const Register& lock_ptr_reg = O3;
      const Register& value_reg = O4;
      const Register& yield_reg = O5;

      Label(retry);
      Label(dontyield);

      generate_v8_lock_prologue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
      // got lock, do the increment
      __ ld(O1, 0, value_reg);
      __ add(O0, value_reg, value_reg);
      __ st(value_reg, O1, 0);

      // %%% only for RMO and PSO
      __ membar(Assembler::StoreStore);

      generate_v8_lock_epilogue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);

      __ retl(false);
      __ delayed()->mov(value_reg, O0);
    }

    return start;
  }
  Label _atomic_add_stub;  // called from other stubs


  //------------------------------------------------------------------------------------------------------------------------
  // The following routine generates a subroutine to throw an asynchronous
  // UnknownError when an unsafe access gets a fault that could not be
  // reasonably prevented by the programmer.  (Example: SIGBUS/OBJERR.)
  //
  // Arguments :
  //
  //      trapping PC:    O7
  //
  // Results:
  //     posts an asynchronous exception, skips the trapping instruction
  //

  address generate_handler_for_unsafe_access() {
    StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
    address start = __ pc();

    const int preserve_register_words = (64 * 2);
    Address preserve_addr(FP, (-preserve_register_words * wordSize) + STACK_BIAS);

    Register Lthread = L7_thread_cache;
    int i;

    __ save_frame(0);
    __ mov(G1, L1);
    __ mov(G2, L2);
    __ mov(G3, L3);
    __ mov(G4, L4);
    __ mov(G5, L5);
    for (i = 0; i < (VM_Version::v9_instructions_work() ? 64 : 32); i += 2) {
      __ stf(FloatRegisterImpl::D, as_FloatRegister(i), preserve_addr, i * wordSize);
    }

    address entry_point = CAST_FROM_FN_PTR(address, handle_unsafe_access);
    BLOCK_COMMENT("call handle_unsafe_access");
    __ call(entry_point, relocInfo::runtime_call_type);
    __ delayed()->nop();

    __ mov(L1, G1);
    __ mov(L2, G2);
    __ mov(L3, G3);
    __ mov(L4, G4);
    __ mov(L5, G5);
    for (i = 0; i < (VM_Version::v9_instructions_work() ? 64 : 32); i += 2) {
      __ ldf(FloatRegisterImpl::D, preserve_addr, as_FloatRegister(i), i * wordSize);
    }

    __ verify_thread();

    __ jmp(O0, 0);
    __ delayed()->restore();

    return start;
  }


  // Support for uint StubRoutines::Sparc::partial_subtype_check( Klass sub, Klass super );
  // Arguments :
  //
  //      ret  : O0, returned
  //      icc/xcc: set as O0 (depending on wordSize)
  //      sub  : O1, argument, not changed
  //      super: O2, argument, not changed
  //      raddr: O7, blown by call
  address generate_partial_subtype_check() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "partial_subtype_check");
    address start = __ pc();
    Label miss;

#if defined(COMPILER2) && !defined(_LP64)
    // Do not use a 'save' because it blows the 64-bit O registers.
    __ add(SP,-4*wordSize,SP);  // Make space for 4 temps (stack must be 2 words aligned)
    __ st_ptr(L0,SP,(frame::register_save_words+0)*wordSize);
    __ st_ptr(L1,SP,(frame::register_save_words+1)*wordSize);
    __ st_ptr(L2,SP,(frame::register_save_words+2)*wordSize);
    __ st_ptr(L3,SP,(frame::register_save_words+3)*wordSize);
    Register Rret   = O0;
    Register Rsub   = O1;
    Register Rsuper = O2;
#else
    __ save_frame(0);
    Register Rret   = I0;
    Register Rsub   = I1;
    Register Rsuper = I2;
#endif

    Register L0_ary_len = L0;
    Register L1_ary_ptr = L1;
    Register L2_super   = L2;
    Register L3_index   = L3;

    __ check_klass_subtype_slow_path(Rsub, Rsuper,
                                     L0, L1, L2, L3,
                                     NULL, &miss);

    // Match falls through here.
    __ addcc(G0,0,Rret);        // set Z flags, Z result

#if defined(COMPILER2) && !defined(_LP64)
    __ ld_ptr(SP,(frame::register_save_words+0)*wordSize,L0);
    __ ld_ptr(SP,(frame::register_save_words+1)*wordSize,L1);
    __ ld_ptr(SP,(frame::register_save_words+2)*wordSize,L2);
    __ ld_ptr(SP,(frame::register_save_words+3)*wordSize,L3);
    __ retl();                  // Result in Rret is zero; flags set to Z
    __ delayed()->add(SP,4*wordSize,SP);
#else
    __ ret();                   // Result in Rret is zero; flags set to Z
    __ delayed()->restore();
#endif

    __ BIND(miss);
    __ addcc(G0,1,Rret);        // set NZ flags, NZ result

#if defined(COMPILER2) && !defined(_LP64)
    __ ld_ptr(SP,(frame::register_save_words+0)*wordSize,L0);
    __ ld_ptr(SP,(frame::register_save_words+1)*wordSize,L1);
    __ ld_ptr(SP,(frame::register_save_words+2)*wordSize,L2);
    __ ld_ptr(SP,(frame::register_save_words+3)*wordSize,L3);
    __ retl();                  // Result in Rret is != 0; flags set to NZ
    __ delayed()->add(SP,4*wordSize,SP);
#else
    __ ret();                   // Result in Rret is != 0; flags set to NZ
    __ delayed()->restore();
#endif

    return start;
  }


  // Called from MacroAssembler::verify_oop
  //
  address generate_verify_oop_subroutine() {
    StubCodeMark mark(this, "StubRoutines", "verify_oop_stub");

    address start = __ pc();

    __ verify_oop_subroutine();

    return start;
  }


  //
  // Verify that a register contains a clean 32-bit positive value
  // (high 32 bits are 0) so it can be used in 64-bit shifts (sllx, srax).
  //
  //  Input:
  //    Rint  -  32-bit value
  //    Rtmp  -  scratch
  //
  void assert_clean_int(Register Rint, Register Rtmp) {
#if defined(ASSERT) && defined(_LP64)
    __ signx(Rint, Rtmp);
    __ cmp(Rint, Rtmp);
    __ breakpoint_trap(Assembler::notEqual, Assembler::xcc);
#endif
  }

  //
  //  Generate overlap test for array copy stubs
  //
  //  Input:
  //    O0    -  array1
  //    O1    -  array2
  //    O2    -  element count
  //
  //  Kills temps:  O3, O4
  //
  void array_overlap_test(address no_overlap_target, int log2_elem_size) {
    assert(no_overlap_target != NULL, "must be generated");
    array_overlap_test(no_overlap_target, NULL, log2_elem_size);
  }
  void array_overlap_test(Label& L_no_overlap, int log2_elem_size) {
    array_overlap_test(NULL, &L_no_overlap, log2_elem_size);
  }
  void array_overlap_test(address no_overlap_target, Label* NOLp, int log2_elem_size) {
    const Register from       = O0;
    const Register to         = O1;
    const Register count      = O2;
    const Register to_from    = O3; // to - from
    const Register byte_count = O4; // count << log2_elem_size

      __ subcc(to, from, to_from);
      __ sll_ptr(count, log2_elem_size, byte_count);
      if (NOLp == NULL)
        __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, no_overlap_target);
      else
        __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, (*NOLp));
      __ delayed()->cmp(to_from, byte_count);
      if (NOLp == NULL)
        __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, no_overlap_target);
      else
        __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, (*NOLp));
      __ delayed()->nop();
  }
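  //  The test above falls through to the conjoint (backward) copy only when
  //  from < to < from + byte_count, i.e. when a forward copy would overwrite
  //  source elements that have not been copied yet; otherwise it branches to
  //  the disjoint (forward) copy.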

  //
  //  Generate pre-write barrier for array.
  //
  //  Input:
  //     addr     - register containing starting address
  //     count    - register containing element count
  //     tmp      - scratch register
  //
  //  The input registers are overwritten.
  //
  void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
      case BarrierSet::G1SATBCT:
      case BarrierSet::G1SATBCTLogging:
        // With G1, don't generate the call if we statically know that the target is uninitialized
        if (!dest_uninitialized) {
          __ save_frame(0);
          // Save the necessary global regs... will be used after.
          if (addr->is_global()) {
            __ mov(addr, L0);
          }
          if (count->is_global()) {
            __ mov(count, L1);
          }
          __ mov(addr->after_save(), O0);
          // Get the count into O1
          __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre));
          __ delayed()->mov(count->after_save(), O1);
          if (addr->is_global()) {
            __ mov(L0, addr);
          }
          if (count->is_global()) {
            __ mov(L1, count);
          }
          __ restore();
        }
        break;
      case BarrierSet::CardTableModRef:
      case BarrierSet::CardTableExtension:
      case BarrierSet::ModRef:
        break;
      default:
        ShouldNotReachHere();
    }
  }
  //
  //  Generate post-write barrier for array.
  //
  //  Input:
  //     addr     - register containing starting address
  //     count    - register containing element count
  //     tmp      - scratch register
  //
  //  The input registers are overwritten.
  //
  void gen_write_ref_array_post_barrier(Register addr, Register count,
                                        Register tmp) {
    BarrierSet* bs = Universe::heap()->barrier_set();

    switch (bs->kind()) {
      case BarrierSet::G1SATBCT:
      case BarrierSet::G1SATBCTLogging:
        {
          // Get some new fresh output registers.
          __ save_frame(0);
          __ mov(addr->after_save(), O0);
          __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post));
          __ delayed()->mov(count->after_save(), O1);
          __ restore();
        }
        break;
      case BarrierSet::CardTableModRef:
      case BarrierSet::CardTableExtension:
        {
          CardTableModRefBS* ct = (CardTableModRefBS*)bs;
          assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
          assert_different_registers(addr, count, tmp);

          Label L_loop;

          __ sll_ptr(count, LogBytesPerHeapOop, count);
          __ sub(count, BytesPerHeapOop, count);
          __ add(count, addr, count);
          // Use two shifts to clear out those low order two bits! (Cannot opt. into 1.)
          __ srl_ptr(addr, CardTableModRefBS::card_shift, addr);
          __ srl_ptr(count, CardTableModRefBS::card_shift, count);
          __ sub(count, addr, count);
          AddressLiteral rs(ct->byte_map_base);
          __ set(rs, tmp);
        __ BIND(L_loop);
          __ stb(G0, tmp, addr);
          __ subcc(count, 1, count);
          __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
          __ delayed()->add(addr, 1, addr);
        }
        break;
      case BarrierSet::ModRef:
        break;
      default:
        ShouldNotReachHere();
    }
  }
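  // (In the card-table case the loop above dirties one card byte per iteration:
  //  the first and last written destination addresses are converted to card
  //  indices, and G0 (i.e. zero, the dirty-card value) is stored at
  //  byte_map_base + index for every card spanned by the copied range.)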

  //
  // Generate main code for disjoint arraycopy
  //
  typedef void (StubGenerator::*CopyLoopFunc)(Register from, Register to, Register count, int count_dec,
                                              Label& L_loop, bool use_prefetch, bool use_bis);

  void disjoint_copy_core(Register from, Register to, Register count, int log2_elem_size,
                          int iter_size, CopyLoopFunc copy_loop_func) {
    Label L_copy;

    assert(log2_elem_size <= 3, "the following code should be changed");
    int count_dec = 16>>log2_elem_size;

    int prefetch_dist = MAX2(ArraycopySrcPrefetchDistance, ArraycopyDstPrefetchDistance);
    assert(prefetch_dist < 4096, "invalid value");
    prefetch_dist = (prefetch_dist + (iter_size-1)) & (-iter_size); // round up to one iteration copy size
    int prefetch_count = (prefetch_dist >> log2_elem_size); // elements count

    if (UseBlockCopy) {
      Label L_block_copy, L_block_copy_prefetch, L_skip_block_copy;

      // 64-byte tail + bytes copied in one loop iteration
      int tail_size = 64 + iter_size;
      int block_copy_count = (MAX2(tail_size, (int)BlockCopyLowLimit)) >> log2_elem_size;
      // Use BIS copy only for big arrays since it requires a membar.
      __ set(block_copy_count, O4);
      __ cmp_and_br_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_skip_block_copy);
      // This code is for disjoint source and destination:
      //   to <= from || to >= from+count
      // but BIS will stomp over 'from' if (to > from-tail_size && to <= from)
      __ sub(from, to, O4);
      __ srax(O4, 4, O4); // divide by 16 since the following short branch has only 5 bits for the immediate.
      __ cmp_and_br_short(O4, (tail_size>>4), Assembler::lessEqualUnsigned, Assembler::pn, L_skip_block_copy);

      __ wrasi(G0, Assembler::ASI_ST_BLKINIT_PRIMARY);
      // BIS should not be used to copy the tail (64 bytes + iter_size)
      // to avoid zeroing of the following values.
      __ sub(count, (tail_size>>log2_elem_size), count); // count is still positive >= 0

      if (prefetch_count > 0) { // rounded up to one iteration count
        // Do prefetching only if copy size is bigger
        // than prefetch distance.
        __ set(prefetch_count, O4);
        __ cmp_and_brx_short(count, O4, Assembler::less, Assembler::pt, L_block_copy);
        __ sub(count, prefetch_count, count);

        (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy_prefetch, true, true);
        __ add(count, prefetch_count, count); // restore count

      } // prefetch_count > 0

      (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy, false, true);
      __ add(count, (tail_size>>log2_elem_size), count); // restore count

      __ wrasi(G0, Assembler::ASI_PRIMARY_NOFAULT);
      // BIS needs membar.
      __ membar(Assembler::StoreLoad);
      // Copy tail
      __ ba_short(L_copy);

      __ BIND(L_skip_block_copy);
    } // UseBlockCopy

    if (prefetch_count > 0) { // rounded up to one iteration count
      // Do prefetching only if copy size is bigger
      // than prefetch distance.
      __ set(prefetch_count, O4);
      __ cmp_and_brx_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_copy);
      __ sub(count, prefetch_count, count);

      Label L_copy_prefetch;
      (this->*copy_loop_func)(from, to, count, count_dec, L_copy_prefetch, true, false);
      __ add(count, prefetch_count, count); // restore count

    } // prefetch_count > 0

    (this->*copy_loop_func)(from, to, count, count_dec, L_copy, false, false);
  }
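  // (Block-init stores (BIS) allocate destination cache lines without reading them
  //  first, which is why the code above restricts them to large copies, leaves a
  //  64-byte-plus-one-iteration tail untouched, and switches the ASI back and
  //  issues a membar(StoreLoad) before the tail is copied with ordinary stores.)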



  //
  // Helper methods for copy_16_bytes_forward_with_shift()
  //
  void copy_16_bytes_shift_loop(Register from, Register to, Register count, int count_dec,
                                Label& L_loop, bool use_prefetch, bool use_bis) {

    const Register left_shift  = G1; // left  shift bit counter
    const Register right_shift = G5; // right shift bit counter

    __ align(OptoLoopAlignment);
    __ BIND(L_loop);
    if (use_prefetch) {
      if (ArraycopySrcPrefetchDistance > 0) {
        __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
      }
      if (ArraycopyDstPrefetchDistance > 0) {
        __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
      }
    }
    __ ldx(from, 0, O4);
    __ ldx(from, 8, G4);
    __ inc(to, 16);
    __ inc(from, 16);
    __ deccc(count, count_dec); // Can we do next iteration after this one?
    __ srlx(O4, right_shift, G3);
    __ bset(G3, O3);
    __ sllx(O4, left_shift,  O4);
    __ srlx(G4, right_shift, G3);
    __ bset(G3, O4);
    if (use_bis) {
      __ stxa(O3, to, -16);
      __ stxa(O4, to, -8);
    } else {
      __ stx(O3, to, -16);
      __ stx(O4, to, -8);
    }
    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
    __ delayed()->sllx(G4, left_shift,  O3);
  }
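  //  The loop above is the classic unaligned-copy idiom: the source is read in
  //  aligned 8-byte chunks and each destination double-word is assembled from the
  //  tail of one chunk and the head of the next with srlx/sllx/bset, so every
  //  memory access stays aligned even though 'from' and 'to' differ in alignment
  //  mod 8.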
1237
1238  // Copy big chunks forward with shift
1239  //
1240  // Inputs:
1241  //   from      - source arrays
1242  //   to        - destination array aligned to 8-bytes
1243  //   count     - elements count to copy >= the count equivalent to 16 bytes
1244  //   count_dec - elements count's decrement equivalent to 16 bytes
1245  //   L_copy_bytes - copy exit label
1246  //
1247  void copy_16_bytes_forward_with_shift(Register from, Register to,
1248                     Register count, int log2_elem_size, Label& L_copy_bytes) {
1249    Label L_aligned_copy, L_copy_last_bytes;
1250    assert(log2_elem_size <= 3, "the following code should be changed");
1251    int count_dec = 16>>log2_elem_size;
1252
1253    // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
1254    __ andcc(from, 7, G1); // misaligned bytes
1255    __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
1256    __ delayed()->nop();
1257
1258    const Register left_shift  = G1; // left  shift bit counter
1259    const Register right_shift = G5; // right shift bit counter
1260
1261    __ sll(G1, LogBitsPerByte, left_shift);
1262    __ mov(64, right_shift);
1263    __ sub(right_shift, left_shift, right_shift);
1264
1265    //
1266    // Load 2 aligned 8-bytes chunks and use one from previous iteration
1267    // to form 2 aligned 8-bytes chunks to store.
1268    //
1269    __ dec(count, count_dec);   // Pre-decrement 'count'
1270    __ andn(from, 7, from);     // Align address
1271    __ ldx(from, 0, O3);
1272    __ inc(from, 8);
1273    __ sllx(O3, left_shift,  O3);
1274
1275    disjoint_copy_core(from, to, count, log2_elem_size, 16, copy_16_bytes_shift_loop);
1276
1277    __ inccc(count, count_dec>>1 ); // + 8 bytes
1278    __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
1279    __ delayed()->inc(count, count_dec>>1); // restore 'count'
1280
1281    // copy 8 bytes, part of them already loaded in O3
1282    __ ldx(from, 0, O4);
1283    __ inc(to, 8);
1284    __ inc(from, 8);
1285    __ srlx(O4, right_shift, G3);
1286    __ bset(O3, G3);
1287    __ stx(G3, to, -8);
1288
1289    __ BIND(L_copy_last_bytes);
1290    __ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes
1291    __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
1292    __ delayed()->sub(from, right_shift, from);       // restore address
1293
1294    __ BIND(L_aligned_copy);
1295  }
1296
1297  // Copy big chunks backward with shift
1298  //
1299  // Inputs:
1300  //   end_from  - source arrays end address
1301  //   end_to    - destination array end address aligned to 8-bytes
1302  //   count     - elements count to copy >= the count equivalent to 16 bytes
1303  //   count_dec - elements count's decrement equivalent to 16 bytes
1304  //   L_aligned_copy - aligned copy exit label
1305  //   L_copy_bytes   - copy exit label
1306  //
1307  void copy_16_bytes_backward_with_shift(Register end_from, Register end_to,
1308                     Register count, int count_dec,
1309                     Label& L_aligned_copy, Label& L_copy_bytes) {
1310    Label L_loop, L_copy_last_bytes;
1311
1312    // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
1313      __ andcc(end_from, 7, G1); // misaligned bytes
1314      __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
1315      __ delayed()->deccc(count, count_dec); // Pre-decrement 'count'
1316
1317    const Register left_shift  = G1; // left  shift bit counter
1318    const Register right_shift = G5; // right shift bit counter
1319
1320      __ sll(G1, LogBitsPerByte, left_shift);
1321      __ mov(64, right_shift);
1322      __ sub(right_shift, left_shift, right_shift);
1323
1324    //
1325    // Load 2 aligned 8-bytes chunks and use one from previous iteration
1326    // to form 2 aligned 8-bytes chunks to store.
1327    //
1328      __ andn(end_from, 7, end_from);     // Align address
1329      __ ldx(end_from, 0, O3);
1330      __ align(OptoLoopAlignment);
1331    __ BIND(L_loop);
1332      __ ldx(end_from, -8, O4);
1333      __ deccc(count, count_dec); // Can we do next iteration after this one?
1334      __ ldx(end_from, -16, G4);
1335      __ dec(end_to, 16);
1336      __ dec(end_from, 16);
1337      __ srlx(O3, right_shift, O3);
1338      __ sllx(O4, left_shift,  G3);
1339      __ bset(G3, O3);
1340      __ stx(O3, end_to, 8);
1341      __ srlx(O4, right_shift, O4);
1342      __ sllx(G4, left_shift,  G3);
1343      __ bset(G3, O4);
1344      __ stx(O4, end_to, 0);
1345      __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
1346      __ delayed()->mov(G4, O3);
1347
1348      __ inccc(count, count_dec>>1 ); // + 8 bytes
1349      __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
1350      __ delayed()->inc(count, count_dec>>1); // restore 'count'
1351
1352      // copy 8 bytes, part of them already loaded in O3
1353      __ ldx(end_from, -8, O4);
1354      __ dec(end_to, 8);
1355      __ dec(end_from, 8);
1356      __ srlx(O3, right_shift, O3);
1357      __ sllx(O4, left_shift,  G3);
1358      __ bset(O3, G3);
1359      __ stx(G3, end_to, 0);
1360
1361    __ BIND(L_copy_last_bytes);
1362      __ srl(left_shift, LogBitsPerByte, left_shift);    // misaligned bytes
1363      __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
1364      __ delayed()->add(end_from, left_shift, end_from); // restore address
1365  }
1366
1367  //
1368  //  Generate stub for disjoint byte copy.  If "aligned" is true, the
1369  //  "from" and "to" addresses are assumed to be heapword aligned.
1370  //
1371  // Arguments for generated stub:
1372  //      from:  O0
1373  //      to:    O1
1374  //      count: O2 treated as signed
1375  //
1376  address generate_disjoint_byte_copy(bool aligned, address *entry, const char *name) {
1377    __ align(CodeEntryAlignment);
1378    StubCodeMark mark(this, "StubRoutines", name);
1379    address start = __ pc();
1380
1381    Label L_skip_alignment, L_align;
1382    Label L_copy_byte, L_copy_byte_loop, L_exit;
1383
1384    const Register from      = O0;   // source array address
1385    const Register to        = O1;   // destination array address
1386    const Register count     = O2;   // elements count
1387    const Register offset    = O5;   // offset from start of arrays
1388    // O3, O4, G3, G4 are used as temp registers
1389
1390    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1391
1392    if (entry != NULL) {
1393      *entry = __ pc();
1394      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1395      BLOCK_COMMENT("Entry:");
1396    }
1397
1398    // for short arrays, just do single element copy
1399    __ cmp(count, 23); // 16 + 7
1400    __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
1401    __ delayed()->mov(G0, offset);
1402
1403    if (aligned) {
1404      // 'aligned' == true when it is known statically during compilation
1405      // of this arraycopy call site that both 'from' and 'to' addresses
1406      // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
1407      //
1408      // Aligned arrays have 4 bytes alignment in 32-bits VM
1409      // and 8 bytes - in 64-bits VM. So we do it only for 32-bits VM
1410      //
1411#ifndef _LP64
1412      // copy a 4-bytes word if necessary to align 'to' to 8 bytes
1413      __ andcc(to, 7, G0);
1414      __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment);
1415      __ delayed()->ld(from, 0, O3);
1416      __ inc(from, 4);
1417      __ inc(to, 4);
1418      __ dec(count, 4);
1419      __ st(O3, to, -4);
1420    __ BIND(L_skip_alignment);
1421#endif
1422    } else {
1423      // copy bytes to align 'to' on 8 byte boundary
1424      __ andcc(to, 7, G1); // misaligned bytes
1425      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1426      __ delayed()->neg(G1);
1427      __ inc(G1, 8);       // bytes need to copy to next 8-bytes alignment
1428      __ sub(count, G1, count);
1429    __ BIND(L_align);
1430      __ ldub(from, 0, O3);
1431      __ deccc(G1);
1432      __ inc(from);
1433      __ stb(O3, to, 0);
1434      __ br(Assembler::notZero, false, Assembler::pt, L_align);
1435      __ delayed()->inc(to);
1436    __ BIND(L_skip_alignment);
1437    }
1438#ifdef _LP64
1439    if (!aligned)
1440#endif
1441    {
1442      // Copy with shift 16 bytes per iteration if arrays do not have
1443      // the same alignment mod 8, otherwise fall through to the next
1444      // code for aligned copy.
1445      // The compare above (count >= 23) guarantes 'count' >= 16 bytes.
1446      // Also jump over aligned copy after the copy with shift completed.
1447
1448      copy_16_bytes_forward_with_shift(from, to, count, 0, L_copy_byte);
1449    }
1450
1451    // Both array are 8 bytes aligned, copy 16 bytes at a time
1452      __ and3(count, 7, G4); // Save count
1453      __ srl(count, 3, count);
1454     generate_disjoint_long_copy_core(aligned);
1455      __ mov(G4, count);     // Restore count
1456
1457    // copy trailing bytes
1458    __ BIND(L_copy_byte);
1459      __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1460      __ align(OptoLoopAlignment);
1461    __ BIND(L_copy_byte_loop);
1462      __ ldub(from, offset, O3);
1463      __ deccc(count);
1464      __ stb(O3, to, offset);
1465      __ brx(Assembler::notZero, false, Assembler::pt, L_copy_byte_loop);
1466      __ delayed()->inc(offset);
1467
1468    __ BIND(L_exit);
1469      // O3, O4 are used as temp registers
1470      inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
1471      __ retl();
1472      __ delayed()->mov(G0, O0); // return 0
1473    return start;
1474  }
1475
1476  //
1477  //  Generate stub for conjoint byte copy.  If "aligned" is true, the
1478  //  "from" and "to" addresses are assumed to be heapword aligned.
1479  //
1480  // Arguments for generated stub:
1481  //      from:  O0
1482  //      to:    O1
1483  //      count: O2 treated as signed
1484  //
1485  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1486                                      address *entry, const char *name) {
1487    // Do reverse copy.
1488
1489    __ align(CodeEntryAlignment);
1490    StubCodeMark mark(this, "StubRoutines", name);
1491    address start = __ pc();
1492
1493    Label L_skip_alignment, L_align, L_aligned_copy;
1494    Label L_copy_byte, L_copy_byte_loop, L_exit;
1495
1496    const Register from      = O0;   // source array address
1497    const Register to        = O1;   // destination array address
1498    const Register count     = O2;   // elements count
1499    const Register end_from  = from; // source array end address
1500    const Register end_to    = to;   // destination array end address
1501
1502    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1503
1504    if (entry != NULL) {
1505      *entry = __ pc();
1506      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1507      BLOCK_COMMENT("Entry:");
1508    }
1509
1510    array_overlap_test(nooverlap_target, 0);
1511
1512    __ add(to, count, end_to);       // offset after last copied element
1513
1514    // for short arrays, just do single element copy
1515    __ cmp(count, 23); // 16 + 7
1516    __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
1517    __ delayed()->add(from, count, end_from);
1518
1519    {
1520      // Align the ends of the arrays, since they may be unaligned even
1521      // when the arrays themselves are aligned.
1522
1523      // copy bytes to align 'end_to' on 8 byte boundary
1524      __ andcc(end_to, 7, G1); // misaligned bytes
1525      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1526      __ delayed()->nop();
1527      __ sub(count, G1, count);
1528    __ BIND(L_align);
1529      __ dec(end_from);
1530      __ dec(end_to);
1531      __ ldub(end_from, 0, O3);
1532      __ deccc(G1);
1533      __ brx(Assembler::notZero, false, Assembler::pt, L_align);
1534      __ delayed()->stb(O3, end_to, 0);
1535    __ BIND(L_skip_alignment);
1536    }
1537#ifdef _LP64
1538    if (aligned) {
1539      // Both arrays are aligned to 8-bytes in 64-bits VM.
1540      // The 'count' is decremented in copy_16_bytes_backward_with_shift()
1541      // in unaligned case.
1542      __ dec(count, 16);
1543    } else
1544#endif
1545    {
1546      // Copy with shift 16 bytes per iteration if arrays do not have
1547      // the same alignment mod 8, otherwise jump to the next
1548      // code for aligned copy (subtracting 16 from 'count' before the jump).
1549      // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
1550      // The copy with shift also jumps over the aligned copy when it completes.
1551
1552      copy_16_bytes_backward_with_shift(end_from, end_to, count, 16,
1553                                        L_aligned_copy, L_copy_byte);
1554    }
1555    // copy 16 bytes (16 elements) at a time
1556      __ align(OptoLoopAlignment);
1557    __ BIND(L_aligned_copy);
1558      __ dec(end_from, 16);
1559      __ ldx(end_from, 8, O3);
1560      __ ldx(end_from, 0, O4);
1561      __ dec(end_to, 16);
1562      __ deccc(count, 16);
1563      __ stx(O3, end_to, 8);
1564      __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
1565      __ delayed()->stx(O4, end_to, 0);
1566      __ inc(count, 16);
1567
1568    // copy 1 element (1 byte) at a time
1569    __ BIND(L_copy_byte);
1570      __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1571      __ align(OptoLoopAlignment);
1572    __ BIND(L_copy_byte_loop);
1573      __ dec(end_from);
1574      __ dec(end_to);
1575      __ ldub(end_from, 0, O4);
1576      __ deccc(count);
1577      __ brx(Assembler::greater, false, Assembler::pt, L_copy_byte_loop);
1578      __ delayed()->stb(O4, end_to, 0);
1579
1580    __ BIND(L_exit);
1581    // O3, O4 are used as temp registers
1582    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
1583    __ retl();
1584    __ delayed()->mov(G0, O0); // return 0
1585    return start;
1586  }
1587
1588  //
1589  //  Generate stub for disjoint short copy.  If "aligned" is true, the
1590  //  "from" and "to" addresses are assumed to be heapword aligned.
1591  //
1592  // Arguments for generated stub:
1593  //      from:  O0
1594  //      to:    O1
1595  //      count: O2 treated as signed
1596  //
1597  address generate_disjoint_short_copy(bool aligned, address *entry, const char * name) {
1598    __ align(CodeEntryAlignment);
1599    StubCodeMark mark(this, "StubRoutines", name);
1600    address start = __ pc();
1601
1602    Label L_skip_alignment, L_skip_alignment2;
1603    Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit;
1604
1605    const Register from      = O0;   // source array address
1606    const Register to        = O1;   // destination array address
1607    const Register count     = O2;   // elements count
1608    const Register offset    = O5;   // offset from start of arrays
1609    // O3, O4, G3, G4 are used as temp registers
1610
1611    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1612
1613    if (entry != NULL) {
1614      *entry = __ pc();
1615      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1616      BLOCK_COMMENT("Entry:");
1617    }
1618
1619    // for short arrays, just do single element copy
1620    __ cmp(count, 11); // 8 + 3  (22 bytes)
1621    __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes);
1622    __ delayed()->mov(G0, offset);
1623
1624    if (aligned) {
1625      // 'aligned' == true when it is known statically during compilation
1626      // of this arraycopy call site that both 'from' and 'to' addresses
1627      // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
1628      //
1629      // Aligned arrays have 4-byte alignment in the 32-bit VM
1630      // and 8-byte alignment in the 64-bit VM.
1631      //
1632#ifndef _LP64
1633      // copy one 4-byte word (2 elements) if necessary to align 'to' to 8 bytes
1634      __ andcc(to, 7, G0);
1635      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1636      __ delayed()->ld(from, 0, O3);
1637      __ inc(from, 4);
1638      __ inc(to, 4);
1639      __ dec(count, 2);
1640      __ st(O3, to, -4);
1641    __ BIND(L_skip_alignment);
1642#endif
1643    } else {
1644      // copy 1 element if necessary to align 'to' on a 4-byte boundary
1645      __ andcc(to, 3, G0);
1646      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1647      __ delayed()->lduh(from, 0, O3);
1648      __ inc(from, 2);
1649      __ inc(to, 2);
1650      __ dec(count);
1651      __ sth(O3, to, -2);
1652    __ BIND(L_skip_alignment);
1653
1654      // copy 2 elements to align 'to' on an 8 byte boundary
1655      __ andcc(to, 7, G0);
1656      __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
1657      __ delayed()->lduh(from, 0, O3);
1658      __ dec(count, 2);
1659      __ lduh(from, 2, O4);
1660      __ inc(from, 4);
1661      __ inc(to, 4);
1662      __ sth(O3, to, -4);
1663      __ sth(O4, to, -2);
1664    __ BIND(L_skip_alignment2);
1665    }
1666#ifdef _LP64
1667    if (!aligned)
1668#endif
1669    {
1670      // Copy with shift 16 bytes per iteration if arrays do not have
1671      // the same alignment mod 8, otherwise fall through to the next
1672      // code for aligned copy.
1673      // The compare above (count >= 11) guarantees 'count' >= 16 bytes.
1674      // The copy with shift also jumps over the aligned copy when it completes.
1675
1676      copy_16_bytes_forward_with_shift(from, to, count, 1, L_copy_2_bytes);
1677    }
1678
1679    // Both arrays are 8-byte aligned; copy 16 bytes at a time
1680      __ and3(count, 3, G4); // Save
1681      __ srl(count, 2, count);
1682     generate_disjoint_long_copy_core(aligned);
1683      __ mov(G4, count); // restore
1684
1685    // copy 1 element at a time
1686    __ BIND(L_copy_2_bytes);
1687      __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1688      __ align(OptoLoopAlignment);
1689    __ BIND(L_copy_2_bytes_loop);
1690      __ lduh(from, offset, O3);
1691      __ deccc(count);
1692      __ sth(O3, to, offset);
1693      __ brx(Assembler::notZero, false, Assembler::pt, L_copy_2_bytes_loop);
1694      __ delayed()->inc(offset, 2);
1695
1696    __ BIND(L_exit);
1697      // O3, O4 are used as temp registers
1698      inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4);
1699      __ retl();
1700      __ delayed()->mov(G0, O0); // return 0
1701    return start;
1702  }
1703
1704  //
1705  //  Generate stub for array fill (byte, short or int elements).  If "aligned"
1706  //  is true, the "to" address is assumed to be heapword aligned.
1707  //
1708  // Arguments for generated stub:
1709  //      to:    O0
1710  //      value: O1
1711  //      count: O2 treated as signed
1712  //
1713  address generate_fill(BasicType t, bool aligned, const char* name) {
1714    __ align(CodeEntryAlignment);
1715    StubCodeMark mark(this, "StubRoutines", name);
1716    address start = __ pc();
1717
1718    const Register to        = O0;   // destination array address
1719    const Register value     = O1;   // fill value
1720    const Register count     = O2;   // elements count
1721    // O3 is used as a temp register
1722
1723    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1724
1725    Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
1726    Label L_fill_2_bytes, L_fill_elements, L_fill_32_bytes;
1727
1728    int shift = -1;
1729    switch (t) {
1730      case T_BYTE:
1731        shift = 2;
1732        break;
1733      case T_SHORT:
1734        shift = 1;
1735        break;
1736      case T_INT:
1737        shift = 0;
1738        break;
1739      default: ShouldNotReachHere();
1740    }
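    // 'shift' converts 4-byte units into an element count for type t:
    // (n << shift) elements of t occupy 4*n bytes, so for example
    // (8 << shift) elements is always 32 bytes and (2 << shift) is 8 bytes.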
1741
1742    BLOCK_COMMENT("Entry:");
1743
1744    if (t == T_BYTE) {
1745      // Zero extend value
1746      __ and3(value, 0xff, value);
1747      __ sllx(value, 8, O3);
1748      __ or3(value, O3, value);
1749    }
1750    if (t == T_SHORT) {
1751      // Zero extend value
1752      __ sllx(value, 48, value);
1753      __ srlx(value, 48, value);
1754    }
1755    if (t == T_BYTE || t == T_SHORT) {
1756      __ sllx(value, 16, O3);
1757      __ or3(value, O3, value);
1758    }
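    // At this point, for T_BYTE and T_SHORT, 'value' holds the fill pattern
    // replicated across its low 32 bits (e.g. byte 0xAB becomes 0xABABABAB);
    // it is widened to 64 bits further below before the 8-byte stores.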
1759
1760    __ cmp(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
1761    __ brx(Assembler::lessUnsigned, false, Assembler::pn, L_fill_elements); // use unsigned cmp
1762    __ delayed()->andcc(count, 1, G0);
1763
1764    if (!aligned && (t == T_BYTE || t == T_SHORT)) {
1765      // align the destination address on a 4-byte boundary
1766      if (t == T_BYTE) {
1767        // One byte misalignment happens only for byte arrays
1768        __ andcc(to, 1, G0);
1769        __ br(Assembler::zero, false, Assembler::pt, L_skip_align1);
1770        __ delayed()->nop();
1771        __ stb(value, to, 0);
1772        __ inc(to, 1);
1773        __ dec(count, 1);
1774        __ BIND(L_skip_align1);
1775      }
1776      // Two bytes misalignment happens only for byte and short (char) arrays
1777      __ andcc(to, 2, G0);
1778      __ br(Assembler::zero, false, Assembler::pt, L_skip_align2);
1779      __ delayed()->nop();
1780      __ sth(value, to, 0);
1781      __ inc(to, 2);
1782      __ dec(count, 1 << (shift - 1));
1783      __ BIND(L_skip_align2);
1784    }
1785#ifdef _LP64
1786    if (!aligned) {
1787#endif
1788    // align to 8 bytes, we know we are 4 byte aligned to start
1789    __ andcc(to, 7, G0);
1790    __ br(Assembler::zero, false, Assembler::pt, L_fill_32_bytes);
1791    __ delayed()->nop();
1792    __ stw(value, to, 0);
1793    __ inc(to, 4);
1794    __ dec(count, 1 << shift);
1795    __ BIND(L_fill_32_bytes);
1796#ifdef _LP64
1797    }
1798#endif
1799
1800    if (t == T_INT) {
1801      // Zero extend value
1802      __ srl(value, 0, value);
1803    }
1804    if (t == T_BYTE || t == T_SHORT || t == T_INT) {
1805      __ sllx(value, 32, O3);
1806      __ or3(value, O3, value);
1807    }
1808
1809    Label L_check_fill_8_bytes;
1810    // Fill 32-byte chunks
1811    __ subcc(count, 8 << shift, count);
1812    __ brx(Assembler::less, false, Assembler::pt, L_check_fill_8_bytes);
1813    __ delayed()->nop();
1814
1815    Label L_fill_32_bytes_loop, L_fill_4_bytes;
1816    __ align(16);
1817    __ BIND(L_fill_32_bytes_loop);
1818
1819    __ stx(value, to, 0);
1820    __ stx(value, to, 8);
1821    __ stx(value, to, 16);
1822    __ stx(value, to, 24);
1823
1824    __ subcc(count, 8 << shift, count);
1825    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_fill_32_bytes_loop);
1826    __ delayed()->add(to, 32, to);
1827
1828    __ BIND(L_check_fill_8_bytes);
1829    __ addcc(count, 8 << shift, count);
1830    __ brx(Assembler::zero, false, Assembler::pn, L_exit);
1831    __ delayed()->subcc(count, 1 << (shift + 1), count);
1832    __ brx(Assembler::less, false, Assembler::pn, L_fill_4_bytes);
1833    __ delayed()->andcc(count, 1<<shift, G0);
1834
1835    //
1836    // length is too short, just fill 8 bytes at a time
1837    //
1838    Label L_fill_8_bytes_loop;
1839    __ BIND(L_fill_8_bytes_loop);
1840    __ stx(value, to, 0);
1841    __ subcc(count, 1 << (shift + 1), count);
1842    __ brx(Assembler::greaterEqual, false, Assembler::pn, L_fill_8_bytes_loop);
1843    __ delayed()->add(to, 8, to);
1844
1845    // fill trailing 4 bytes
1846    __ andcc(count, 1<<shift, G0);  // in delay slot of branches
1847    if (t == T_INT) {
1848      __ BIND(L_fill_elements);
1849    }
1850    __ BIND(L_fill_4_bytes);
1851    __ brx(Assembler::zero, false, Assembler::pt, L_fill_2_bytes);
1852    if (t == T_BYTE || t == T_SHORT) {
1853      __ delayed()->andcc(count, 1<<(shift-1), G0);
1854    } else {
1855      __ delayed()->nop();
1856    }
1857    __ stw(value, to, 0);
1858    if (t == T_BYTE || t == T_SHORT) {
1859      __ inc(to, 4);
1860      // fill trailing 2 bytes
1861      __ andcc(count, 1<<(shift-1), G0); // in delay slot of branches
1862      __ BIND(L_fill_2_bytes);
1863      __ brx(Assembler::zero, false, Assembler::pt, L_fill_byte);
1864      __ delayed()->andcc(count, 1, count);
1865      __ sth(value, to, 0);
1866      if (t == T_BYTE) {
1867        __ inc(to, 2);
1868        // fill trailing byte
1869        __ andcc(count, 1, count);  // in delay slot of branches
1870        __ BIND(L_fill_byte);
1871        __ brx(Assembler::zero, false, Assembler::pt, L_exit);
1872        __ delayed()->nop();
1873        __ stb(value, to, 0);
1874      } else {
1875        __ BIND(L_fill_byte);
1876      }
1877    } else {
1878      __ BIND(L_fill_2_bytes);
1879    }
1880    __ BIND(L_exit);
1881    __ retl();
1882    __ delayed()->nop();
1883
1884    // Handle fills of less than 8 bytes.  Int is handled elsewhere.
1885    if (t == T_BYTE) {
1886      __ BIND(L_fill_elements);
1887      Label L_fill_2, L_fill_4;
1888      // in delay slot __ andcc(count, 1, G0);
1889      __ brx(Assembler::zero, false, Assembler::pt, L_fill_2);
1890      __ delayed()->andcc(count, 2, G0);
1891      __ stb(value, to, 0);
1892      __ inc(to, 1);
1893      __ BIND(L_fill_2);
1894      __ brx(Assembler::zero, false, Assembler::pt, L_fill_4);
1895      __ delayed()->andcc(count, 4, G0);
1896      __ stb(value, to, 0);
1897      __ stb(value, to, 1);
1898      __ inc(to, 2);
1899      __ BIND(L_fill_4);
1900      __ brx(Assembler::zero, false, Assembler::pt, L_exit);
1901      __ delayed()->nop();
1902      __ stb(value, to, 0);
1903      __ stb(value, to, 1);
1904      __ stb(value, to, 2);
1905      __ retl();
1906      __ delayed()->stb(value, to, 3);
1907    }
1908
1909    if (t == T_SHORT) {
1910      Label L_fill_2;
1911      __ BIND(L_fill_elements);
1912      // in delay slot __ andcc(count, 1, G0);
1913      __ brx(Assembler::zero, false, Assembler::pt, L_fill_2);
1914      __ delayed()->andcc(count, 2, G0);
1915      __ sth(value, to, 0);
1916      __ inc(to, 2);
1917      __ BIND(L_fill_2);
1918      __ brx(Assembler::zero, false, Assembler::pt, L_exit);
1919      __ delayed()->nop();
1920      __ sth(value, to, 0);
1921      __ retl();
1922      __ delayed()->sth(value, to, 2);
1923    }
1924    return start;
1925  }
1926
1927  //
1928  //  Generate stub for conjoint short copy.  If "aligned" is true, the
1929  //  "from" and "to" addresses are assumed to be heapword aligned.
1930  //
1931  // Arguments for generated stub:
1932  //      from:  O0
1933  //      to:    O1
1934  //      count: O2 treated as signed
1935  //
1936  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1937                                       address *entry, const char *name) {
1938    // Do reverse copy.
1939
1940    __ align(CodeEntryAlignment);
1941    StubCodeMark mark(this, "StubRoutines", name);
1942    address start = __ pc();
1943
1944    Label L_skip_alignment, L_skip_alignment2, L_aligned_copy;
1945    Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit;
1946
1947    const Register from      = O0;   // source array address
1948    const Register to        = O1;   // destination array address
1949    const Register count     = O2;   // elements count
1950    const Register end_from  = from; // source array end address
1951    const Register end_to    = to;   // destination array end address
1952
1953    const Register byte_count = O3;  // bytes count to copy
1954
1955    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1956
1957    if (entry != NULL) {
1958      *entry = __ pc();
1959      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1960      BLOCK_COMMENT("Entry:");
1961    }
1962
1963    array_overlap_test(nooverlap_target, 1);
1964
1965    __ sllx(count, LogBytesPerShort, byte_count);
1966    __ add(to, byte_count, end_to);  // offset after last copied element
1967
1968    // for short arrays, just do single element copy
1969    __ cmp(count, 11); // 8 + 3  (22 bytes)
1970    __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes);
1971    __ delayed()->add(from, byte_count, end_from);
1972
1973    {
1974      // Align the ends of the arrays, since they may be unaligned even
1975      // when the arrays themselves are aligned.
1976
1977      // copy 1 element if necessary to align 'end_to' on a 4-byte boundary
1978      __ andcc(end_to, 3, G0);
1979      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1980      __ delayed()->lduh(end_from, -2, O3);
1981      __ dec(end_from, 2);
1982      __ dec(end_to, 2);
1983      __ dec(count);
1984      __ sth(O3, end_to, 0);
1985    __ BIND(L_skip_alignment);
1986
1987      // copy 2 elements to align 'end_to' on an 8 byte boundary
1988      __ andcc(end_to, 7, G0);
1989      __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
1990      __ delayed()->lduh(end_from, -2, O3);
1991      __ dec(count, 2);
1992      __ lduh(end_from, -4, O4);
1993      __ dec(end_from, 4);
1994      __ dec(end_to, 4);
1995      __ sth(O3, end_to, 2);
1996      __ sth(O4, end_to, 0);
1997    __ BIND(L_skip_alignment2);
1998    }
1999#ifdef _LP64
2000    if (aligned) {
2001      // Both arrays are aligned to 8-bytes in 64-bits VM.
2002      // The 'count' is decremented in copy_16_bytes_backward_with_shift()
2003      // in unaligned case.
2004      __ dec(count, 8);
2005    } else
2006#endif
2007    {
2008      // Copy with shift 16 bytes per iteration if arrays do not have
2009      // the same alignment mod 8, otherwise jump to the next
2010      // code for aligned copy (subtracting 8 from 'count' before the jump).
2011      // The compare above (count >= 11) guarantees 'count' >= 16 bytes.
2012      // The copy with shift also jumps over the aligned copy when it completes.
2013
2014      copy_16_bytes_backward_with_shift(end_from, end_to, count, 8,
2015                                        L_aligned_copy, L_copy_2_bytes);
2016    }
2017    // copy 8 elements (16 bytes) at a time
2018      __ align(OptoLoopAlignment);
2019    __ BIND(L_aligned_copy);
2020      __ dec(end_from, 16);
2021      __ ldx(end_from, 8, O3);
2022      __ ldx(end_from, 0, O4);
2023      __ dec(end_to, 16);
2024      __ deccc(count, 8);
2025      __ stx(O3, end_to, 8);
2026      __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
2027      __ delayed()->stx(O4, end_to, 0);
2028      __ inc(count, 8);
2029
2030    // copy 1 element (2 bytes) at a time
2031    __ BIND(L_copy_2_bytes);
2032      __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
2033    __ BIND(L_copy_2_bytes_loop);
2034      __ dec(end_from, 2);
2035      __ dec(end_to, 2);
2036      __ lduh(end_from, 0, O4);
2037      __ deccc(count);
2038      __ brx(Assembler::greater, false, Assembler::pt, L_copy_2_bytes_loop);
2039      __ delayed()->sth(O4, end_to, 0);
2040
2041    __ BIND(L_exit);
2042    // O3, O4 are used as temp registers
2043    inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4);
2044    __ retl();
2045    __ delayed()->mov(G0, O0); // return 0
2046    return start;
2047  }
2048
2049  //
2050  // Helper methods for generate_disjoint_int_copy_core()
2051  //
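  // copy_16_bytes_loop copies 16 bytes (4 ints) per iteration for the case
  // where 'to' is 8-byte aligned but 'from' is only 4-byte aligned: the two
  // ldx loads at offsets 4 and 12 are then 8-byte aligned, and their 32-bit
  // halves are spliced (srlx/sllx/bset) with the half carried over in O3 from
  // the previous iteration to form two aligned 8-byte stores (stxa when
  // 'use_bis' is set).  Prefetching is issued when use_prefetch is set and
  // ArraycopySrcPrefetchDistance / ArraycopyDstPrefetchDistance are non-zero.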
2052  void copy_16_bytes_loop(Register from, Register to, Register count, int count_dec,
2053                          Label& L_loop, bool use_prefetch, bool use_bis) {
2054
2055    __ align(OptoLoopAlignment);
2056    __ BIND(L_loop);
2057    if (use_prefetch) {
2058      if (ArraycopySrcPrefetchDistance > 0) {
2059        __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
2060      }
2061      if (ArraycopyDstPrefetchDistance > 0) {
2062        __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
2063      }
2064    }
2065    __ ldx(from, 4, O4);
2066    __ ldx(from, 12, G4);
2067    __ inc(to, 16);
2068    __ inc(from, 16);
2069    __ deccc(count, 4); // Can we do next iteration after this one?
2070
2071    __ srlx(O4, 32, G3);
2072    __ bset(G3, O3);
2073    __ sllx(O4, 32, O4);
2074    __ srlx(G4, 32, G3);
2075    __ bset(G3, O4);
2076    if (use_bis) {
2077      __ stxa(O3, to, -16);
2078      __ stxa(O4, to, -8);
2079    } else {
2080      __ stx(O3, to, -16);
2081      __ stx(O4, to, -8);
2082    }
2083    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
2084    __ delayed()->sllx(G4, 32,  O3);
2085
2086  }
2087
2088  //
2089  //  Generate core code for disjoint int copy (and oop copy on 32-bit).
2090  //  If "aligned" is true, the "from" and "to" addresses are assumed
2091  //  to be heapword aligned.
2092  //
2093  // Arguments:
2094  //      from:  O0
2095  //      to:    O1
2096  //      count: O2 treated as signed
2097  //
2098  void generate_disjoint_int_copy_core(bool aligned) {
2099
2100    Label L_skip_alignment, L_aligned_copy;
2101    Label L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
2102
2103    const Register from      = O0;   // source array address
2104    const Register to        = O1;   // destination array address
2105    const Register count     = O2;   // elements count
2106    const Register offset    = O5;   // offset from start of arrays
2107    // O3, O4, G3, G4 are used as temp registers
2108
2109    // 'aligned' == true when it is known statically during compilation
2110    // of this arraycopy call site that both 'from' and 'to' addresses
2111    // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
2112    //
2113    // Aligned arrays have 4 bytes alignment in 32-bits VM
2114    // and 8 bytes - in 64-bits VM.
2115    //
2116#ifdef _LP64
2117    if (!aligned)
2118#endif
2119    {
2120      // The next check could be put under 'ifndef' since the code in
2121      // generate_disjoint_long_copy_core() has its own checks and sets 'offset'.
2122
2123      // for short arrays, just do single element copy
2124      __ cmp(count, 5); // 4 + 1 (20 bytes)
2125      __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes);
2126      __ delayed()->mov(G0, offset);
2127
2128      // copy 1 element to align 'to' on an 8 byte boundary
2129      __ andcc(to, 7, G0);
2130      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
2131      __ delayed()->ld(from, 0, O3);
2132      __ inc(from, 4);
2133      __ inc(to, 4);
2134      __ dec(count);
2135      __ st(O3, to, -4);
2136    __ BIND(L_skip_alignment);
2137
2138    // if arrays have the same alignment mod 8, do the 4-element copy
2139      __ andcc(from, 7, G0);
2140      __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
2141      __ delayed()->ld(from, 0, O3);
2142
2143    //
2144    // Load two aligned 8-byte chunks and use one from the previous iteration
2145    // to form two aligned 8-byte chunks to store.
2146    //
2147    // copy_16_bytes_forward_with_shift() is not used here since this
2148    // code is better tuned for this case.
2149
2150    // copy with shift 4 elements (16 bytes) at a time
2151      __ dec(count, 4);   // The cmp at the beginning guarantees count >= 4
2152      __ sllx(O3, 32,  O3);
2153
2154      disjoint_copy_core(from, to, count, 2, 16, copy_16_bytes_loop);
2155
2156      __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
2157      __ delayed()->inc(count, 4); // restore 'count'
2158
2159    __ BIND(L_aligned_copy);
2160    } // !aligned
2161
2162    // copy 4 elements (16 bytes) at a time
2163      __ and3(count, 1, G4); // Save
2164      __ srl(count, 1, count);
2165     generate_disjoint_long_copy_core(aligned);
2166      __ mov(G4, count);     // Restore
2167
2168    // copy 1 element at a time
2169    __ BIND(L_copy_4_bytes);
2170      __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
2171    __ BIND(L_copy_4_bytes_loop);
2172      __ ld(from, offset, O3);
2173      __ deccc(count);
2174      __ st(O3, to, offset);
2175      __ brx(Assembler::notZero, false, Assembler::pt, L_copy_4_bytes_loop);
2176      __ delayed()->inc(offset, 4);
2177    __ BIND(L_exit);
2178  }
2179
2180  //
2181  //  Generate stub for disjoint int copy.  If "aligned" is true, the
2182  //  "from" and "to" addresses are assumed to be heapword aligned.
2183  //
2184  // Arguments for generated stub:
2185  //      from:  O0
2186  //      to:    O1
2187  //      count: O2 treated as signed
2188  //
2189  address generate_disjoint_int_copy(bool aligned, address *entry, const char *name) {
2190    __ align(CodeEntryAlignment);
2191    StubCodeMark mark(this, "StubRoutines", name);
2192    address start = __ pc();
2193
2194    const Register count = O2;
2195    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
2196
2197    if (entry != NULL) {
2198      *entry = __ pc();
2199      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2200      BLOCK_COMMENT("Entry:");
2201    }
2202
2203    generate_disjoint_int_copy_core(aligned);
2204
2205    // O3, O4 are used as temp registers
2206    inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4);
2207    __ retl();
2208    __ delayed()->mov(G0, O0); // return 0
2209    return start;
2210  }
2211
2212  //
2213  //  Generate core code for conjoint int copy (and oop copy on 32-bit).
2214  //  If "aligned" is true, the "from" and "to" addresses are assumed
2215  //  to be heapword aligned.
2216  //
2217  // Arguments:
2218  //      from:  O0
2219  //      to:    O1
2220  //      count: O2 treated as signed
2221  //
2222  void generate_conjoint_int_copy_core(bool aligned) {
2223    // Do reverse copy.
2224
2225    Label L_skip_alignment, L_aligned_copy;
2226    Label L_copy_16_bytes,  L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
2227
2228    const Register from      = O0;   // source array address
2229    const Register to        = O1;   // destination array address
2230    const Register count     = O2;   // elements count
2231    const Register end_from  = from; // source array end address
2232    const Register end_to    = to;   // destination array end address
2233    // O3, O4, O5, G3 are used as temp registers
2234
2235    const Register byte_count = O3;  // bytes count to copy
2236
2237      __ sllx(count, LogBytesPerInt, byte_count);
2238      __ add(to, byte_count, end_to); // offset after last copied element
2239
2240      __ cmp(count, 5); // for short arrays, just do single element copy
2241      __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes);
2242      __ delayed()->add(from, byte_count, end_from);
2243
2244    // copy 1 element to align 'to' on an 8 byte boundary
2245      __ andcc(end_to, 7, G0);
2246      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
2247      __ delayed()->nop();
2248      __ dec(count);
2249      __ dec(end_from, 4);
2250      __ dec(end_to,   4);
2251      __ ld(end_from, 0, O4);
2252      __ st(O4, end_to, 0);
2253    __ BIND(L_skip_alignment);
2254
2255    // Check if 'end_from' and 'end_to' have the same alignment.
2256      __ andcc(end_from, 7, G0);
2257      __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
2258      __ delayed()->dec(count, 4); // The cmp at the start guarantees count >= 4
2259
2260    // copy with shift 4 elements (16 bytes) at a time
2261    //
2262    // Load two aligned 8-byte chunks and use one from the previous iteration
2263    // to form two aligned 8-byte chunks to store.
2264    //
2265      __ ldx(end_from, -4, O3);
2266      __ align(OptoLoopAlignment);
2267    __ BIND(L_copy_16_bytes);
2268      __ ldx(end_from, -12, O4);
2269      __ deccc(count, 4);
2270      __ ldx(end_from, -20, O5);
2271      __ dec(end_to, 16);
2272      __ dec(end_from, 16);
2273      __ srlx(O3, 32, O3);
2274      __ sllx(O4, 32, G3);
2275      __ bset(G3, O3);
2276      __ stx(O3, end_to, 8);
2277      __ srlx(O4, 32, O4);
2278      __ sllx(O5, 32, G3);
2279      __ bset(O4, G3);
2280      __ stx(G3, end_to, 0);
2281      __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
2282      __ delayed()->mov(O5, O3);
2283
2284      __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
2285      __ delayed()->inc(count, 4);
2286
2287    // copy 4 elements (16 bytes) at a time
2288      __ align(OptoLoopAlignment);
2289    __ BIND(L_aligned_copy);
2290      __ dec(end_from, 16);
2291      __ ldx(end_from, 8, O3);
2292      __ ldx(end_from, 0, O4);
2293      __ dec(end_to, 16);
2294      __ deccc(count, 4);
2295      __ stx(O3, end_to, 8);
2296      __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
2297      __ delayed()->stx(O4, end_to, 0);
2298      __ inc(count, 4);
2299
2300    // copy 1 element (4 bytes) at a time
2301    __ BIND(L_copy_4_bytes);
2302      __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
2303    __ BIND(L_copy_4_bytes_loop);
2304      __ dec(end_from, 4);
2305      __ dec(end_to, 4);
2306      __ ld(end_from, 0, O4);
2307      __ deccc(count);
2308      __ brx(Assembler::greater, false, Assembler::pt, L_copy_4_bytes_loop);
2309      __ delayed()->st(O4, end_to, 0);
2310    __ BIND(L_exit);
2311  }
2312
2313  //
2314  //  Generate stub for conjoint int copy.  If "aligned" is true, the
2315  //  "from" and "to" addresses are assumed to be heapword aligned.
2316  //
2317  // Arguments for generated stub:
2318  //      from:  O0
2319  //      to:    O1
2320  //      count: O2 treated as signed
2321  //
2322  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
2323                                     address *entry, const char *name) {
2324    __ align(CodeEntryAlignment);
2325    StubCodeMark mark(this, "StubRoutines", name);
2326    address start = __ pc();
2327
2328    assert_clean_int(O2, O3);     // Make sure 'count' is clean int.
2329
2330    if (entry != NULL) {
2331      *entry = __ pc();
2332      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2333      BLOCK_COMMENT("Entry:");
2334    }
2335
2336    array_overlap_test(nooverlap_target, 2);
2337
2338    generate_conjoint_int_copy_core(aligned);
2339
2340    // O3, O4 are used as temp registers
2341    inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4);
2342    __ retl();
2343    __ delayed()->mov(G0, O0); // return 0
2344    return start;
2345  }
2346
2347  //
2348  // Helper methods for generate_disjoint_long_copy_core()
2349  //
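  // copy_64_bytes_loop is a fully unrolled 64-byte loop: four ldx/stx pairs
  // per iteration (stxa when 'use_bis' is set), with optional prefetching
  // every 32 bytes controlled by ArraycopySrcPrefetchDistance and
  // ArraycopyDstPrefetchDistance.  'count' is in 8-byte elements and is
  // decremented by 8 per iteration.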
2350  void copy_64_bytes_loop(Register from, Register to, Register count, int count_dec,
2351                          Label& L_loop, bool use_prefetch, bool use_bis) {
2352    __ align(OptoLoopAlignment);
2353    __ BIND(L_loop);
2354    for (int off = 0; off < 64; off += 16) {
2355      if (use_prefetch && (off & 31) == 0) {
2356        if (ArraycopySrcPrefetchDistance > 0) {
2357          __ prefetch(from, ArraycopySrcPrefetchDistance+off, Assembler::severalReads);
2358        }
2359        if (ArraycopyDstPrefetchDistance > 0) {
2360          __ prefetch(to, ArraycopyDstPrefetchDistance+off, Assembler::severalWritesAndPossiblyReads);
2361        }
2362      }
2363      __ ldx(from,  off+0, O4);
2364      __ ldx(from,  off+8, O5);
2365      if (use_bis) {
2366        __ stxa(O4, to,  off+0);
2367        __ stxa(O5, to,  off+8);
2368      } else {
2369        __ stx(O4, to,  off+0);
2370        __ stx(O5, to,  off+8);
2371      }
2372    }
2373    __ deccc(count, 8);
2374    __ inc(from, 64);
2375    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
2376    __ delayed()->inc(to, 64);
2377  }
2378
2379  //
2380  //  Generate core code for disjoint long copy (and oop copy on 64-bit).
2381  //  "aligned" is ignored, because we must make the stronger
2382  //  assumption that both addresses are always 64-bit aligned.
2383  //
2384  // Arguments:
2385  //      from:  O0
2386  //      to:    O1
2387  //      count: O2 treated as signed
2388  //
2389  // count -= 2;
2390  // if ( count >= 0 ) { // >= 2 elements
2391  //   if ( count > 6) { // >= 8 elements
2392  //     count -= 6; // original count - 8
2393  //     do {
2394  //       copy_8_elements;
2395  //       count -= 8;
2396  //     } while ( count >= 0 );
2397  //     count += 6;
2398  //   }
2399  //   if ( count >= 0 ) { // >= 2 elements
2400  //     do {
2401  //       copy_2_elements;
2402  //     } while ( (count=count-2) >= 0 );
2403  //   }
2404  // }
2405  // count += 2;
2406  // if ( count != 0 ) { // 1 element left
2407  //   copy_1_element;
2408  // }
2409  //
2410  void generate_disjoint_long_copy_core(bool aligned) {
2411    Label L_copy_8_bytes, L_copy_16_bytes, L_exit;
2412    const Register from    = O0;  // source array address
2413    const Register to      = O1;  // destination array address
2414    const Register count   = O2;  // elements count
2415    const Register offset0 = O4;  // element offset
2416    const Register offset8 = O5;  // next element offset
2417
2418    __ deccc(count, 2);
2419    __ mov(G0, offset0);   // offset from start of arrays (0)
2420    __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
2421    __ delayed()->add(offset0, 8, offset8);
2422
2423    // Copy in 64-byte chunks
2424
2425    const Register from64 = O3;  // source address
2426    const Register to64   = G3;  // destination address
2427    __ subcc(count, 6, O3);
2428    __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes );
2429    __ delayed()->mov(to,   to64);
2430    // Now we can use O4(offset0), O5(offset8) as temps
2431    __ mov(O3, count);
2432    // count >= 0 (original count - 8)
2433    __ mov(from, from64);
2434
2435    disjoint_copy_core(from64, to64, count, 3, 64, copy_64_bytes_loop);
2436
2437      // Restore O4(offset0), O5(offset8)
2438      __ sub(from64, from, offset0);
2439      __ inccc(count, 6); // restore count
2440      __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
2441      __ delayed()->add(offset0, 8, offset8);
2442
2443      // Copy in 16-byte chunks
2444      __ align(OptoLoopAlignment);
2445    __ BIND(L_copy_16_bytes);
2446      __ ldx(from, offset0, O3);
2447      __ ldx(from, offset8, G3);
2448      __ deccc(count, 2);
2449      __ stx(O3, to, offset0);
2450      __ inc(offset0, 16);
2451      __ stx(G3, to, offset8);
2452      __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
2453      __ delayed()->inc(offset8, 16);
2454
2455      // Copy last 8 bytes
2456    __ BIND(L_copy_8_bytes);
2457      __ inccc(count, 2);
2458      __ brx(Assembler::zero, true, Assembler::pn, L_exit );
2459      __ delayed()->mov(offset0, offset8); // Set O5 used by other stubs
2460      __ ldx(from, offset0, O3);
2461      __ stx(O3, to, offset0);
2462    __ BIND(L_exit);
2463  }
2464
2465  //
2466  //  Generate stub for disjoint long copy.
2467  //  "aligned" is ignored, because we must make the stronger
2468  //  assumption that both addresses are always 64-bit aligned.
2469  //
2470  // Arguments for generated stub:
2471  //      from:  O0
2472  //      to:    O1
2473  //      count: O2 treated as signed
2474  //
2475  address generate_disjoint_long_copy(bool aligned, address *entry, const char *name) {
2476    __ align(CodeEntryAlignment);
2477    StubCodeMark mark(this, "StubRoutines", name);
2478    address start = __ pc();
2479
2480    assert_clean_int(O2, O3);     // Make sure 'count' is clean int.
2481
2482    if (entry != NULL) {
2483      *entry = __ pc();
2484      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2485      BLOCK_COMMENT("Entry:");
2486    }
2487
2488    generate_disjoint_long_copy_core(aligned);
2489
2490    // O3, O4 are used as temp registers
2491    inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4);
2492    __ retl();
2493    __ delayed()->mov(G0, O0); // return 0
2494    return start;
2495  }
2496
2497  //
2498  //  Generate core code for conjoint long copy (and oop copy on 64-bit).
2499  //  "aligned" is ignored, because we must make the stronger
2500  //  assumption that both addresses are always 64-bit aligned.
2501  //
2502  // Arguments:
2503  //      from:  O0
2504  //      to:    O1
2505  //      count: O2 treated as signed
2506  //
2507  void generate_conjoint_long_copy_core(bool aligned) {
2508    // Do reverse copy.
2509    Label L_copy_8_bytes, L_copy_16_bytes, L_exit;
2510    const Register from    = O0;  // source array address
2511    const Register to      = O1;  // destination array address
2512    const Register count   = O2;  // elements count
2513    const Register offset8 = O4;  // element offset
2514    const Register offset0 = O5;  // previous element offset
2515
2516      __ subcc(count, 1, count);
2517      __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_8_bytes );
2518      __ delayed()->sllx(count, LogBytesPerLong, offset8);
2519      __ sub(offset8, 8, offset0);
2520      __ align(OptoLoopAlignment);
2521    __ BIND(L_copy_16_bytes);
2522      __ ldx(from, offset8, O2);
2523      __ ldx(from, offset0, O3);
2524      __ stx(O2, to, offset8);
2525      __ deccc(offset8, 16);      // use offset8 as counter
2526      __ stx(O3, to, offset0);
2527      __ brx(Assembler::greater, false, Assembler::pt, L_copy_16_bytes);
2528      __ delayed()->dec(offset0, 16);
2529
2530    __ BIND(L_copy_8_bytes);
2531      __ brx(Assembler::negative, false, Assembler::pn, L_exit );
2532      __ delayed()->nop();
2533      __ ldx(from, 0, O3);
2534      __ stx(O3, to, 0);
2535    __ BIND(L_exit);
2536  }
2537
2538  //  Generate stub for conjoint long copy.
2539  //  "aligned" is ignored, because we must make the stronger
2540  //  assumption that both addresses are always 64-bit aligned.
2541  //
2542  // Arguments for generated stub:
2543  //      from:  O0
2544  //      to:    O1
2545  //      count: O2 treated as signed
2546  //
2547  address generate_conjoint_long_copy(bool aligned, address nooverlap_target,
2548                                      address *entry, const char *name) {
2549    __ align(CodeEntryAlignment);
2550    StubCodeMark mark(this, "StubRoutines", name);
2551    address start = __ pc();
2552
2553    assert(aligned, "Should always be aligned");
2554
2555    assert_clean_int(O2, O3);     // Make sure 'count' is clean int.
2556
2557    if (entry != NULL) {
2558      *entry = __ pc();
2559      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2560      BLOCK_COMMENT("Entry:");
2561    }
2562
2563    array_overlap_test(nooverlap_target, 3);
2564
2565    generate_conjoint_long_copy_core(aligned);
2566
2567    // O3, O4 are used as temp registers
2568    inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4);
2569    __ retl();
2570    __ delayed()->mov(G0, O0); // return 0
2571    return start;
2572  }
2573
2574  //  Generate stub for disjoint oop copy.  If "aligned" is true, the
2575  //  "from" and "to" addresses are assumed to be heapword aligned.
2576  //
2577  // Arguments for generated stub:
2578  //      from:  O0
2579  //      to:    O1
2580  //      count: O2 treated as signed
2581  //
2582  address generate_disjoint_oop_copy(bool aligned, address *entry, const char *name,
2583                                     bool dest_uninitialized = false) {
2584
2585    const Register from  = O0;  // source array address
2586    const Register to    = O1;  // destination array address
2587    const Register count = O2;  // elements count
2588
2589    __ align(CodeEntryAlignment);
2590    StubCodeMark mark(this, "StubRoutines", name);
2591    address start = __ pc();
2592
2593    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
2594
2595    if (entry != NULL) {
2596      *entry = __ pc();
2597      // caller can pass a 64-bit byte count here
2598      BLOCK_COMMENT("Entry:");
2599    }
2600
2601    // save arguments for barrier generation
2602    __ mov(to, G1);
2603    __ mov(count, G5);
2604    gen_write_ref_array_pre_barrier(G1, G5, dest_uninitialized);
2605  #ifdef _LP64
2606    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
2607    if (UseCompressedOops) {
2608      generate_disjoint_int_copy_core(aligned);
2609    } else {
2610      generate_disjoint_long_copy_core(aligned);
2611    }
2612  #else
2613    generate_disjoint_int_copy_core(aligned);
2614  #endif
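    // Element size drives the choice above: narrow oops (UseCompressedOops)
    // and 32-bit oops are 4 bytes and reuse the int copy core, while full
    // 64-bit oops use the long copy core.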
2615    // O0 is used as temp register
2616    gen_write_ref_array_post_barrier(G1, G5, O0);
2617
2618    // O3, O4 are used as temp registers
2619    inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4);
2620    __ retl();
2621    __ delayed()->mov(G0, O0); // return 0
2622    return start;
2623  }
2624
2625  //  Generate stub for conjoint oop copy.  If "aligned" is true, the
2626  //  "from" and "to" addresses are assumed to be heapword aligned.
2627  //
2628  // Arguments for generated stub:
2629  //      from:  O0
2630  //      to:    O1
2631  //      count: O2 treated as signed
2632  //
2633  address generate_conjoint_oop_copy(bool aligned, address nooverlap_target,
2634                                     address *entry, const char *name,
2635                                     bool dest_uninitialized = false) {
2636
2637    const Register from  = O0;  // source array address
2638    const Register to    = O1;  // destination array address
2639    const Register count = O2;  // elements count
2640
2641    __ align(CodeEntryAlignment);
2642    StubCodeMark mark(this, "StubRoutines", name);
2643    address start = __ pc();
2644
2645    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
2646
2647    if (entry != NULL) {
2648      *entry = __ pc();
2649      // caller can pass a 64-bit byte count here
2650      BLOCK_COMMENT("Entry:");
2651    }
2652
2653    array_overlap_test(nooverlap_target, LogBytesPerHeapOop);
2654
2655    // save arguments for barrier generation
2656    __ mov(to, G1);
2657    __ mov(count, G5);
2658    gen_write_ref_array_pre_barrier(G1, G5, dest_uninitialized);
2659
2660  #ifdef _LP64
2661    if (UseCompressedOops) {
2662      generate_conjoint_int_copy_core(aligned);
2663    } else {
2664      generate_conjoint_long_copy_core(aligned);
2665    }
2666  #else
2667    generate_conjoint_int_copy_core(aligned);
2668  #endif
2669
2670    // O0 is used as temp register
2671    gen_write_ref_array_post_barrier(G1, G5, O0);
2672
2673    // O3, O4 are used as temp registers
2674    inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4);
2675    __ retl();
2676    __ delayed()->mov(G0, O0); // return 0
2677    return start;
2678  }
2679
2680
2681  // Helper for generating a dynamic type check.
2682  // Smashes only the given temp registers.
2683  void generate_type_check(Register sub_klass,
2684                           Register super_check_offset,
2685                           Register super_klass,
2686                           Register temp,
2687                           Label& L_success) {
2688    assert_different_registers(sub_klass, super_check_offset, super_klass, temp);
2689
2690    BLOCK_COMMENT("type_check:");
2691
2692    Label L_miss, L_pop_to_miss;
2693
2694    assert_clean_int(super_check_offset, temp);
2695
2696    __ check_klass_subtype_fast_path(sub_klass, super_klass, temp, noreg,
2697                                     &L_success, &L_miss, NULL,
2698                                     super_check_offset);
2699
2700    BLOCK_COMMENT("type_check_slow_path:");
2701    __ save_frame(0);
2702    __ check_klass_subtype_slow_path(sub_klass->after_save(),
2703                                     super_klass->after_save(),
2704                                     L0, L1, L2, L4,
2705                                     NULL, &L_pop_to_miss);
2706    __ ba(L_success);
2707    __ delayed()->restore();
2708
2709    __ bind(L_pop_to_miss);
2710    __ restore();
2711
2712    // Fall through on failure!
2713    __ BIND(L_miss);
2714  }
2715
2716
2717  //  Generate stub for checked oop copy.
2718  //
2719  // Arguments for generated stub:
2720  //      from:  O0
2721  //      to:    O1
2722  //      count: O2 treated as signed
2723  //      ckoff: O3 (super_check_offset)
2724  //      ckval: O4 (super_klass)
2725  //      ret:   O0 zero for success; (-1^K) where K is partial transfer count
2726  //
2727  address generate_checkcast_copy(const char *name, address *entry, bool dest_uninitialized = false) {
2728
2729    const Register O0_from   = O0;      // source array address
2730    const Register O1_to     = O1;      // destination array address
2731    const Register O2_count  = O2;      // elements count
2732    const Register O3_ckoff  = O3;      // super_check_offset
2733    const Register O4_ckval  = O4;      // super_klass
2734
2735    const Register O5_offset = O5;      // loop var, with stride wordSize
2736    const Register G1_remain = G1;      // loop var, with stride -1
2737    const Register G3_oop    = G3;      // actual oop copied
2738    const Register G4_klass  = G4;      // oop._klass
2739    const Register G5_super  = G5;      // oop._klass._primary_supers[ckval]
2740
2741    __ align(CodeEntryAlignment);
2742    StubCodeMark mark(this, "StubRoutines", name);
2743    address start = __ pc();
2744
2745#ifdef ASSERT
2746    // We sometimes save a frame (see generate_type_check below).
2747    // If this will cause trouble, let's fail now instead of later.
2748    __ save_frame(0);
2749    __ restore();
2750#endif
2751
2752    assert_clean_int(O2_count, G1);     // Make sure 'count' is clean int.
2753
2754#ifdef ASSERT
2755    // caller guarantees that the arrays really are different;
2756    // otherwise, we would have to make conjoint checks
2757    { Label L;
2758      __ mov(O3, G1);           // spill: overlap test smashes O3
2759      __ mov(O4, G4);           // spill: overlap test smashes O4
2760      array_overlap_test(L, LogBytesPerHeapOop);
2761      __ stop("checkcast_copy within a single array");
2762      __ bind(L);
2763      __ mov(G1, O3);
2764      __ mov(G4, O4);
2765    }
2766#endif //ASSERT
2767
2768    if (entry != NULL) {
2769      *entry = __ pc();
2770      // caller can pass a 64-bit byte count here (from generic stub)
2771      BLOCK_COMMENT("Entry:");
2772    }
2773    gen_write_ref_array_pre_barrier(O1_to, O2_count, dest_uninitialized);
2774
2775    Label load_element, store_element, do_card_marks, fail, done;
2776    __ addcc(O2_count, 0, G1_remain);   // initialize loop index, and test it
2777    __ brx(Assembler::notZero, false, Assembler::pt, load_element);
2778    __ delayed()->mov(G0, O5_offset);   // offset from start of arrays
2779
2780    // Empty array:  Nothing to do.
2781    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4);
2782    __ retl();
2783    __ delayed()->set(0, O0);           // return 0 on (trivial) success
2784
2785    // ======== begin loop ========
2786    // (Loop is rotated; its entry is load_element.)
2787    // Loop variables:
2788    //   (O5 = 0; ; O5 += wordSize) --- offset from src, dest arrays
2789    //   (G1 = len; G1 != 0; G1--) --- number of oops *remaining*
2790    //   G3, G4, G5 --- current oop, oop.klass, oop.klass.super
2791    __ align(OptoLoopAlignment);
2792
2793    __ BIND(store_element);
2794    __ deccc(G1_remain);                // decrement the count
2795    __ store_heap_oop(G3_oop, O1_to, O5_offset); // store the oop
2796    __ inc(O5_offset, heapOopSize);     // step to next offset
2797    __ brx(Assembler::zero, true, Assembler::pt, do_card_marks);
2798    __ delayed()->set(0, O0);           // return 0 on success
2799
2800    // ======== loop entry is here ========
2801    __ BIND(load_element);
2802    __ load_heap_oop(O0_from, O5_offset, G3_oop);  // load the oop
2803    __ br_null_short(G3_oop, Assembler::pt, store_element);
2804
2805    __ load_klass(G3_oop, G4_klass); // query the object klass
2806
2807    generate_type_check(G4_klass, O3_ckoff, O4_ckval, G5_super,
2808                        // branch to this on success:
2809                        store_element);
2810    // ======== end loop ========
2811
2812    // It was a real error; we must depend on the caller to finish the job.
2813    // Register G1 has number of *remaining* oops, O2 number of *total* oops.
2814    // Emit GC store barriers for the oops we have copied (O2 minus G1),
2815    // and report their number to the caller.
2816    __ BIND(fail);
2817    __ subcc(O2_count, G1_remain, O2_count);
2818    __ brx(Assembler::zero, false, Assembler::pt, done);
2819    __ delayed()->not1(O2_count, O0);   // report (-1^K) to caller
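    // (not1 is a bitwise complement, so O0 = ~K = -1 - K = (-1 ^ K) for the
    // K = O2_count elements actually copied, matching the return convention
    // documented in the stub header above.)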
2820
2821    __ BIND(do_card_marks);
2822    gen_write_ref_array_post_barrier(O1_to, O2_count, O3);   // store check on O1[0..O2]
2823
2824    __ BIND(done);
2825    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4);
2826    __ retl();
2827    __ delayed()->nop();             // return value in O0
2828
2829    return start;
2830  }
2831
2832
2833  //  Generate 'unsafe' array copy stub
2834  //  Though just as safe as the other stubs, it takes an unscaled
2835  //  size_t argument instead of an element count.
2836  //
2837  // Arguments for generated stub:
2838  //      from:  O0
2839  //      to:    O1
2840  //      count: O2 byte count, treated as ssize_t, can be zero
2841  //
2842  // Examines the alignment of the operands and dispatches
2843  // to a long, int, short, or byte copy loop.
2844  //
2845  address generate_unsafe_copy(const char* name,
2846                               address byte_copy_entry,
2847                               address short_copy_entry,
2848                               address int_copy_entry,
2849                               address long_copy_entry) {
2850
2851    const Register O0_from   = O0;      // source array address
2852    const Register O1_to     = O1;      // destination array address
2853    const Register O2_count  = O2;      // elements count
2854
2855    const Register G1_bits   = G1;      // test copy of low bits
2856
2857    __ align(CodeEntryAlignment);
2858    StubCodeMark mark(this, "StubRoutines", name);
2859    address start = __ pc();
2860
2861    // bump this on entry, not on exit:
2862    inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, G1, G3);
2863
2864    __ or3(O0_from, O1_to, G1_bits);
2865    __ or3(O2_count,       G1_bits, G1_bits);
2866
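    // The or3 above folds the low bits of 'from', 'to' and the byte count
    // together: if G1_bits has no bits set below BytesPerLong, all three are
    // multiples of 8 and the long copy applies; the same test is repeated
    // below at int and short granularity before falling back to byte copy.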
2867    __ btst(BytesPerLong-1, G1_bits);
2868    __ br(Assembler::zero, true, Assembler::pt,
2869          long_copy_entry, relocInfo::runtime_call_type);
2870    // scale the count on the way out:
2871    __ delayed()->srax(O2_count, LogBytesPerLong, O2_count);
2872
2873    __ btst(BytesPerInt-1, G1_bits);
2874    __ br(Assembler::zero, true, Assembler::pt,
2875          int_copy_entry, relocInfo::runtime_call_type);
2876    // scale the count on the way out:
2877    __ delayed()->srax(O2_count, LogBytesPerInt, O2_count);
2878
2879    __ btst(BytesPerShort-1, G1_bits);
2880    __ br(Assembler::zero, true, Assembler::pt,
2881          short_copy_entry, relocInfo::runtime_call_type);
2882    // scale the count on the way out:
2883    __ delayed()->srax(O2_count, LogBytesPerShort, O2_count);
2884
2885    __ br(Assembler::always, false, Assembler::pt,
2886          byte_copy_entry, relocInfo::runtime_call_type);
2887    __ delayed()->nop();
2888
2889    return start;
2890  }
2891
2892
2893  // Perform range checks on the proposed arraycopy.
2894  // Kills the two temps, but nothing else.
2895  // Also, clean the sign bits of src_pos and dst_pos.
2896  void arraycopy_range_checks(Register src,     // source array oop (O0)
2897                              Register src_pos, // source position (O1)
2898                              Register dst,     // destination array oop (O2)
2899                              Register dst_pos, // destination position (O3)
2900                              Register length,  // length of copy (O4)
2901                              Register temp1, Register temp2,
2902                              Label& L_failed) {
2903    BLOCK_COMMENT("arraycopy_range_checks:");
2904
2905    //  if (src_pos + length > arrayOop(src)->length() ) FAIL;
2906
2907    const Register array_length = temp1;  // scratch
2908    const Register end_pos      = temp2;  // scratch
2909
2910    // Note:  This next instruction may be in the delay slot of a branch:
2911    __ add(length, src_pos, end_pos);  // src_pos + length
2912    __ lduw(src, arrayOopDesc::length_offset_in_bytes(), array_length);
2913    __ cmp(end_pos, array_length);
2914    __ br(Assembler::greater, false, Assembler::pn, L_failed);
2915
2916    //  if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
2917    __ delayed()->add(length, dst_pos, end_pos); // dst_pos + length
2918    __ lduw(dst, arrayOopDesc::length_offset_in_bytes(), array_length);
2919    __ cmp(end_pos, array_length);
2920    __ br(Assembler::greater, false, Assembler::pn, L_failed);
2921
2922    // Have to clean up the high 32 bits of 'src_pos' and 'dst_pos'.
2923    // A move with sign extension can be used since they are positive.
2924    __ delayed()->signx(src_pos, src_pos);
2925    __ signx(dst_pos, dst_pos);
2926
2927    BLOCK_COMMENT("arraycopy_range_checks done");
2928  }
2929
2930
2931  //
2932  //  Generate generic array copy stubs
2933  //
2934  //  Input:
2935  //    O0    -  src oop
2936  //    O1    -  src_pos
2937  //    O2    -  dst oop
2938  //    O3    -  dst_pos
2939  //    O4    -  element count
2940  //
2941  //  Output:
2942  //    O0 ==  0  -  success
2943  //    O0 == -1  -  need to call System.arraycopy
2944  //
2945  address generate_generic_copy(const char *name,
2946                                address entry_jbyte_arraycopy,
2947                                address entry_jshort_arraycopy,
2948                                address entry_jint_arraycopy,
2949                                address entry_oop_arraycopy,
2950                                address entry_jlong_arraycopy,
2951                                address entry_checkcast_arraycopy) {
2952    Label L_failed, L_objArray;
2953
2954    // Input registers
2955    const Register src      = O0;  // source array oop
2956    const Register src_pos  = O1;  // source position
2957    const Register dst      = O2;  // destination array oop
2958    const Register dst_pos  = O3;  // destination position
2959    const Register length   = O4;  // elements count
2960
2961    // registers used as temp
2962    const Register G3_src_klass = G3; // source array klass
2963    const Register G4_dst_klass = G4; // destination array klass
2964    const Register G5_lh        = G5; // layout helper
2965    const Register O5_temp      = O5;
2966
2967    __ align(CodeEntryAlignment);
2968    StubCodeMark mark(this, "StubRoutines", name);
2969    address start = __ pc();
2970
2971    // bump this on entry, not on exit:
2972    inc_counter_np(SharedRuntime::_generic_array_copy_ctr, G1, G3);
2973
2974    // In principle, the int arguments could be dirty.
2975    //assert_clean_int(src_pos, G1);
2976    //assert_clean_int(dst_pos, G1);
2977    //assert_clean_int(length, G1);
2978
2979    //-----------------------------------------------------------------------
2980    // Assembler stubs will be used for this call to arraycopy
2981    // if the following conditions are met:
2982    //
2983    // (1) src and dst must not be null.
2984    // (2) src_pos must not be negative.
2985    // (3) dst_pos must not be negative.
2986    // (4) length  must not be negative.
2987    // (5) src klass and dst klass should be the same and not NULL.
2988    // (6) src and dst should be arrays.
2989    // (7) src_pos + length must not exceed length of src.
2990    // (8) dst_pos + length must not exceed length of dst.
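    //
    // Restated as a rough C-level sketch (illustrative only; objArray inputs
    // take the L_objArray path below, where mismatched klasses fall back to a
    // checkcast copy instead of failing):
    //
    //   if (src == NULL || dst == NULL)                   return -1;
    //   if (src_pos < 0 || dst_pos < 0 || length < 0)     return -1;
    //   if (src->klass() != dst->klass())                 return -1;
    //   if (!src->is_Array())                             return -1;
    //   if (src_pos + length > arrayOop(src)->length())   return -1;
    //   if (dst_pos + length > arrayOop(dst)->length())   return -1;
    //   tail-call the element-size-specific copy stub;    // returns 0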
2991    BLOCK_COMMENT("arraycopy initial argument checks");
2992
2993    //  if (src == NULL) return -1;
2994    __ br_null(src, false, Assembler::pn, L_failed);
2995
2996    //  if (src_pos < 0) return -1;
2997    __ delayed()->tst(src_pos);
2998    __ br(Assembler::negative, false, Assembler::pn, L_failed);
2999    __ delayed()->nop();
3000
3001    //  if (dst == NULL) return -1;
3002    __ br_null(dst, false, Assembler::pn, L_failed);
3003
3004    //  if (dst_pos < 0) return -1;
3005    __ delayed()->tst(dst_pos);
3006    __ br(Assembler::negative, false, Assembler::pn, L_failed);
3007
3008    //  if (length < 0) return -1;
3009    __ delayed()->tst(length);
3010    __ br(Assembler::negative, false, Assembler::pn, L_failed);
3011
3012    BLOCK_COMMENT("arraycopy argument klass checks");
3013    //  get src->klass()
3014    if (UseCompressedKlassPointers) {
3015      __ delayed()->nop(); // load_klass is a multi-instruction sequence and cannot fill the delay slot
3016      __ load_klass(src, G3_src_klass);
3017    } else {
3018      __ delayed()->ld_ptr(src, oopDesc::klass_offset_in_bytes(), G3_src_klass);
3019    }
3020
3021#ifdef ASSERT
3022    //  assert(src->klass() != NULL);
3023    BLOCK_COMMENT("assert klasses not null");
3024    { Label L_a, L_b;
3025      __ br_notnull_short(G3_src_klass, Assembler::pt, L_b); // it is broken if klass is NULL
3026      __ bind(L_a);
3027      __ stop("broken null klass");
3028      __ bind(L_b);
3029      __ load_klass(dst, G4_dst_klass);
3030      __ br_null(G4_dst_klass, false, Assembler::pn, L_a); // this would be broken also
3031      __ delayed()->mov(G0, G4_dst_klass);      // scribble the temp
3032      BLOCK_COMMENT("assert done");
3033    }
3034#endif
3035
3036    // Load layout helper
3037    //
3038    //  |array_tag|     | header_size | element_type |     |log2_element_size|
3039    // 32        30    24            16              8     2                 0
3040    //
3041    //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
3042    //
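    //   Worked example (illustrative; the header size depends on the oop
    //   layout of the particular build): a typeArray of jints with a 16-byte
    //   header encodes as
    //     (0x3 << 30) | (16 << 16) | (T_INT << 8) | 2  ==  0xC0100A02
    //   since T_INT == 10 and log2(sizeof(jint)) == 2.
    //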
3043
3044    int lh_offset = in_bytes(Klass::layout_helper_offset());
3045
3046    // Load the 32-bit signed value; br() (which tests icc) can then be used with it.
3047    __ lduw(G3_src_klass, lh_offset, G5_lh);
3048
3049    if (UseCompressedKlassPointers) {
3050      __ load_klass(dst, G4_dst_klass);
3051    }
3052    // Handle objArrays completely differently...
3053    juint objArray_lh = Klass::array_layout_helper(T_OBJECT);
3054    __ set(objArray_lh, O5_temp);
3055    __ cmp(G5_lh,       O5_temp);
3056    __ br(Assembler::equal, false, Assembler::pt, L_objArray);
3057    if (UseCompressedKlassPointers) {
3058      __ delayed()->nop();
3059    } else {
3060      __ delayed()->ld_ptr(dst, oopDesc::klass_offset_in_bytes(), G4_dst_klass);
3061    }
3062
3063    //  if (src->klass() != dst->klass()) return -1;
3064    __ cmp_and_brx_short(G3_src_klass, G4_dst_klass, Assembler::notEqual, Assembler::pn, L_failed);
3065
3066    //  if (!src->is_Array()) return -1;
3067    __ cmp(G5_lh, Klass::_lh_neutral_value); // < 0
3068    __ br(Assembler::greaterEqual, false, Assembler::pn, L_failed);
3069
3070    // At this point, it is known to be a typeArray (array_tag 0x3).
3071#ifdef ASSERT
3072    __ delayed()->nop();
3073    { Label L;
3074      jint lh_prim_tag_in_place = (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
3075      __ set(lh_prim_tag_in_place, O5_temp);
3076      __ cmp(G5_lh,                O5_temp);
3077      __ br(Assembler::greaterEqual, false, Assembler::pt, L);
3078      __ delayed()->nop();
3079      __ stop("must be a primitive array");
3080      __ bind(L);
3081    }
3082#else
3083    __ delayed();                               // match next insn to prev branch
3084#endif
3085
3086    arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
3087                           O5_temp, G4_dst_klass, L_failed);
3088
3089    // TypeArrayKlass
3090    //
3091    // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
3092    // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
3093    //
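    // For example (illustrative; the header size depends on the build):
    // copying jints starting at src_pos == 5 with a 16-byte array header gives
    //   src_addr = src + 16 + (5 << 2) = src + 36.
    //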
3094
3095    const Register G4_offset = G4_dst_klass;    // array offset
3096    const Register G3_elsize = G3_src_klass;    // log2 element size
3097
3098    __ srl(G5_lh, Klass::_lh_header_size_shift, G4_offset);
3099    __ and3(G4_offset, Klass::_lh_header_size_mask, G4_offset); // array_offset
3100    __ add(src, G4_offset, src);       // src array offset
3101    __ add(dst, G4_offset, dst);       // dst array offset
3102    __ and3(G5_lh, Klass::_lh_log2_element_size_mask, G3_elsize); // log2 element size
3103
3104    // The following registers must be set up before jumping to the corresponding stub.
3105    const Register from     = O0;  // source array address
3106    const Register to       = O1;  // destination array address
3107    const Register count    = O2;  // elements count
3108
3109    // 'from', 'to' and 'count' must be written in this order, since they
3110    // alias 'src', 'src_pos' and 'dst' respectively.
3111
3112    BLOCK_COMMENT("scale indexes to element size");
3113    __ sll_ptr(src_pos, G3_elsize, src_pos);
3114    __ sll_ptr(dst_pos, G3_elsize, dst_pos);
3115    __ add(src, src_pos, from);       // src_addr
3116    __ add(dst, dst_pos, to);         // dst_addr
3117
3118    BLOCK_COMMENT("choose copy loop based on element size");
3119    __ cmp(G3_elsize, 0);
3120    __ br(Assembler::equal, true, Assembler::pt, entry_jbyte_arraycopy);
3121    __ delayed()->signx(length, count); // length
3122
3123    __ cmp(G3_elsize, LogBytesPerShort);
3124    __ br(Assembler::equal, true, Assembler::pt, entry_jshort_arraycopy);
3125    __ delayed()->signx(length, count); // length
3126
3127    __ cmp(G3_elsize, LogBytesPerInt);
3128    __ br(Assembler::equal, true, Assembler::pt, entry_jint_arraycopy);
3129    __ delayed()->signx(length, count); // length
3130#ifdef ASSERT
3131    { Label L;
3132      __ cmp_and_br_short(G3_elsize, LogBytesPerLong, Assembler::equal, Assembler::pt, L);
3133      __ stop("must be long copy, but elsize is wrong");
3134      __ bind(L);
3135    }
3136#endif
3137    __ br(Assembler::always, false, Assembler::pt, entry_jlong_arraycopy);
3138    __ delayed()->signx(length, count); // length
3139
3140    // ObjArrayKlass
3141  __ BIND(L_objArray);
3142    // live at this point:  G3_src_klass, G4_dst_klass, src[_pos], dst[_pos], length
3143
3144    Label L_plain_copy, L_checkcast_copy;
3145    //  test array classes for subtyping
3146    __ cmp(G3_src_klass, G4_dst_klass);         // usual case is exact equality
3147    __ brx(Assembler::notEqual, true, Assembler::pn, L_checkcast_copy);
3148    __ delayed()->lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted from below
3149
3150    // Identically typed arrays can be copied without element-wise checks.
3151    arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
3152                           O5_temp, G5_lh, L_failed);
3153
3154    __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset
3155    __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset
3156    __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos);
3157    __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos);
3158    __ add(src, src_pos, from);       // src_addr
3159    __ add(dst, dst_pos, to);         // dst_addr
3160  __ BIND(L_plain_copy);
3161    __ br(Assembler::always, false, Assembler::pt, entry_oop_arraycopy);
3162    __ delayed()->signx(length, count); // length
3163
3164  __ BIND(L_checkcast_copy);
3165    // live at this point:  G3_src_klass, G4_dst_klass
3166    {
3167      // Before looking at dst.length, make sure dst is also an objArray.
3168      // lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted to delay slot
3169      __ cmp(G5_lh,                    O5_temp);
3170      __ br(Assembler::notEqual, false, Assembler::pn, L_failed);
3171
3172      // It is safe to examine both src.length and dst.length.
3173      __ delayed();                             // match next insn to prev branch
3174      arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
3175                             O5_temp, G5_lh, L_failed);
3176
3177      // Marshal the base address arguments now, freeing registers.
3178      __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset
3179      __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset
3180      __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos);
3181      __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos);
3182      __ add(src, src_pos, from);               // src_addr
3183      __ add(dst, dst_pos, to);                 // dst_addr
3184      __ signx(length, count);                  // length (reloaded)
3185
3186      Register sco_temp = O3;                   // this register is free now
3187      assert_different_registers(from, to, count, sco_temp,
3188                                 G4_dst_klass, G3_src_klass);
3189
3190      // Generate the type check.
3191      int sco_offset = in_bytes(Klass::super_check_offset_offset());
3192      __ lduw(G4_dst_klass, sco_offset, sco_temp);
3193      generate_type_check(G3_src_klass, sco_temp, G4_dst_klass,
3194                          O5_temp, L_plain_copy);
3195
3196      // Fetch destination element klass from the ObjArrayKlass header.
3197      int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
3198
3199      // the checkcast_copy loop needs two extra arguments:
3200      __ ld_ptr(G4_dst_klass, ek_offset, O4);   // dest elem klass
3201      // lduw(O4, sco_offset, O3);              // sco of elem klass
3202
3203      __ br(Assembler::always, false, Assembler::pt, entry_checkcast_arraycopy);
3204      __ delayed()->lduw(O4, sco_offset, O3);
3205    }
3206
3207  __ BIND(L_failed);
3208    __ retl();
3209    __ delayed()->sub(G0, 1, O0); // return -1
3210    return start;
3211  }
3212
3213  //
3214  //  Generate stub for heap zeroing.
3215  //  "to" address is aligned to jlong (8 bytes).
3216  //
3217  // Arguments for generated stub:
3218  //      to:    O0
3219  //      count: O1 treated as signed (count of HeapWords)
3220  //             count could be 0
3221  //
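  // Functionally the stub is equivalent to the C sketch below; the real code
  // lets MacroAssembler::bis_zeroing deal with BIS and cache-line details.
  //
  //   void zero_aligned_words(HeapWord* to, intptr_t count) {
  //     memset(to, 0, count * HeapWordSize);
  //   }
  //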
3222  address generate_zero_aligned_words(const char* name) {
3223    __ align(CodeEntryAlignment);
3224    StubCodeMark mark(this, "StubRoutines", name);
3225    address start = __ pc();
3226
3227    const Register to    = O0;   // address of the area to zero
3228    const Register count = O1;   // HeapWords count
3229    const Register temp  = O2;   // scratch
3230
3231    Label Ldone;
3232    __ sllx(count, LogHeapWordSize, count); // to bytes count
3233    // Use BIS for zeroing
3234    __ bis_zeroing(to, count, temp, Ldone);
3235    __ bind(Ldone);
3236    __ retl();
3237    __ delayed()->nop();
3238    return start;
3239  }
3240
3241  void generate_arraycopy_stubs() {
3242    address entry;
3243    address entry_jbyte_arraycopy;
3244    address entry_jshort_arraycopy;
3245    address entry_jint_arraycopy;
3246    address entry_oop_arraycopy;
3247    address entry_jlong_arraycopy;
3248    address entry_checkcast_arraycopy;
3249
3250    //*** jbyte
3251    // Always need aligned and unaligned versions
3252    StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
3253                                                                                  "jbyte_disjoint_arraycopy");
3254    StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
3255                                                                                  &entry_jbyte_arraycopy,
3256                                                                                  "jbyte_arraycopy");
3257    StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
3258                                                                                  "arrayof_jbyte_disjoint_arraycopy");
3259    StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
3260                                                                                  "arrayof_jbyte_arraycopy");
3261
3262    //*** jshort
3263    // Always need aligned and unaligned versions
3264    StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
3265                                                                                    "jshort_disjoint_arraycopy");
3266    StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
3267                                                                                    &entry_jshort_arraycopy,
3268                                                                                    "jshort_arraycopy");
3269    StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
3270                                                                                    "arrayof_jshort_disjoint_arraycopy");
3271    StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
3272                                                                                    "arrayof_jshort_arraycopy");
3273
3274    //*** jint
3275    // Aligned versions
3276    StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
3277                                                                                "arrayof_jint_disjoint_arraycopy");
3278    StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
3279                                                                                "arrayof_jint_arraycopy");
3280#ifdef _LP64
3281    // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
3282    // entry_jint_arraycopy always points to the unaligned version (notice that we overwrite it).
3283    StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
3284                                                                                "jint_disjoint_arraycopy");
3285    StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
3286                                                                                &entry_jint_arraycopy,
3287                                                                                "jint_arraycopy");
3288#else
3289    // In 32 bit jints are always HeapWordSize aligned, so always use the aligned version
3290    // (in fact in 32bit we always have a pre-loop part even in the aligned version,
3291    //  because it uses 64-bit loads/stores, so the aligned flag is actually ignored).
3292    StubRoutines::_jint_disjoint_arraycopy = StubRoutines::_arrayof_jint_disjoint_arraycopy;
3293    StubRoutines::_jint_arraycopy          = StubRoutines::_arrayof_jint_arraycopy;
3294#endif
3295
3296
3297    //*** jlong
3298    // It is always aligned
3299    StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
3300                                                                                  "arrayof_jlong_disjoint_arraycopy");
3301    StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
3302                                                                                  "arrayof_jlong_arraycopy");
3303    StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
3304    StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
3305
3306
3307    //*** oops
3308    // Aligned versions
3309    StubRoutines::_arrayof_oop_disjoint_arraycopy        = generate_disjoint_oop_copy(true, &entry,
3310                                                                                      "arrayof_oop_disjoint_arraycopy");
3311    StubRoutines::_arrayof_oop_arraycopy                 = generate_conjoint_oop_copy(true, entry, &entry_oop_arraycopy,
3312                                                                                      "arrayof_oop_arraycopy");
3313    // Aligned versions without pre-barriers
3314    StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(true, &entry,
3315                                                                                      "arrayof_oop_disjoint_arraycopy_uninit",
3316                                                                                      /*dest_uninitialized*/true);
3317    StubRoutines::_arrayof_oop_arraycopy_uninit          = generate_conjoint_oop_copy(true, entry, NULL,
3318                                                                                      "arrayof_oop_arraycopy_uninit",
3319                                                                                      /*dest_uninitialized*/true);
3320#ifdef _LP64
3321    if (UseCompressedOops) {
3322      // With compressed oops we need unaligned versions, notice that we overwrite entry_oop_arraycopy.
3323      StubRoutines::_oop_disjoint_arraycopy            = generate_disjoint_oop_copy(false, &entry,
3324                                                                                    "oop_disjoint_arraycopy");
3325      StubRoutines::_oop_arraycopy                     = generate_conjoint_oop_copy(false, entry, &entry_oop_arraycopy,
3326                                                                                    "oop_arraycopy");
3327      // Unaligned versions without pre-barriers
3328      StubRoutines::_oop_disjoint_arraycopy_uninit     = generate_disjoint_oop_copy(false, &entry,
3329                                                                                    "oop_disjoint_arraycopy_uninit",
3330                                                                                    /*dest_uninitialized*/true);
3331      StubRoutines::_oop_arraycopy_uninit              = generate_conjoint_oop_copy(false, entry, NULL,
3332                                                                                    "oop_arraycopy_uninit",
3333                                                                                    /*dest_uninitialized*/true);
3334    } else
3335#endif
3336    {
3337      // oop arraycopy is always aligned on 32bit and 64bit without compressed oops
3338      StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
3339      StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
3340      StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
3341      StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
3342    }
3343
3344    StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
3345    StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
3346                                                                        /*dest_uninitialized*/true);
3347
3348    StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
3349                                                              entry_jbyte_arraycopy,
3350                                                              entry_jshort_arraycopy,
3351                                                              entry_jint_arraycopy,
3352                                                              entry_jlong_arraycopy);
3353    StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
3354                                                               entry_jbyte_arraycopy,
3355                                                               entry_jshort_arraycopy,
3356                                                               entry_jint_arraycopy,
3357                                                               entry_oop_arraycopy,
3358                                                               entry_jlong_arraycopy,
3359                                                               entry_checkcast_arraycopy);
3360
3361    StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
3362    StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
3363    StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
3364    StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
3365    StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
3366    StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
3367
3368    if (UseBlockZeroing) {
3369      StubRoutines::_zero_aligned_words = generate_zero_aligned_words("zero_aligned_words");
3370    }
3371  }
3372
3373  void generate_initial() {
3374    // Generates all stubs and initializes the entry points
3375
3376    //------------------------------------------------------------------------------------------------------------------------
3377    // entry points that exist in all platforms
3378    // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller than
3379    //       the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp.
3380    StubRoutines::_forward_exception_entry                 = generate_forward_exception();
3381
3382    StubRoutines::_call_stub_entry                         = generate_call_stub(StubRoutines::_call_stub_return_address);
3383    StubRoutines::_catch_exception_entry                   = generate_catch_exception();
3384
3385    //------------------------------------------------------------------------------------------------------------------------
3386    // entry points that are platform specific
3387    StubRoutines::Sparc::_test_stop_entry                  = generate_test_stop();
3388
3389    StubRoutines::Sparc::_stop_subroutine_entry            = generate_stop_subroutine();
3390    StubRoutines::Sparc::_flush_callers_register_windows_entry = generate_flush_callers_register_windows();
3391
3392#if !defined(COMPILER2) && !defined(_LP64)
3393    StubRoutines::_atomic_xchg_entry         = generate_atomic_xchg();
3394    StubRoutines::_atomic_cmpxchg_entry      = generate_atomic_cmpxchg();
3395    StubRoutines::_atomic_add_entry          = generate_atomic_add();
3396    StubRoutines::_atomic_xchg_ptr_entry     = StubRoutines::_atomic_xchg_entry;
3397    StubRoutines::_atomic_cmpxchg_ptr_entry  = StubRoutines::_atomic_cmpxchg_entry;
3398    StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long();
3399    StubRoutines::_atomic_add_ptr_entry      = StubRoutines::_atomic_add_entry;
3400#endif  // !COMPILER2 && !_LP64
3401
3402    // Build this early so it's available for the interpreter.
3403    StubRoutines::_throw_StackOverflowError_entry          = generate_throw_exception("StackOverflowError throw_exception",           CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
3404  }
3405
3406
3407  void generate_all() {
3408    // Generates all stubs and initializes the entry points
3409
3410    // Generate partial_subtype_check first here since its code depends on
3411    // UseZeroBaseCompressedOops which is defined after heap initialization.
3412    StubRoutines::Sparc::_partial_subtype_check                = generate_partial_subtype_check();
3413    // These entry points require SharedInfo::stack0 to be set up in non-core builds
3414    StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError));
3415    StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError));
3416    StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call));
3417
3418    StubRoutines::_handler_for_unsafe_access_entry =
3419      generate_handler_for_unsafe_access();
3420
3421    // support for verify_oop (must happen after universe_init)
3422    StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop_subroutine();
3423
3424    // arraycopy stubs used by compilers
3425    generate_arraycopy_stubs();
3426
3427    // Don't initialize the platform math functions since sparc
3428    // doesn't have intrinsics for these operations.
3429  }
3430
3431
3432 public:
3433  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
3434    // replace the standard masm with a special one:
3435    _masm = new MacroAssembler(code);
3436
3437    _stub_count = !all ? 0x100 : 0x200;
3438    if (all) {
3439      generate_all();
3440    } else {
3441      generate_initial();
3442    }
3443
3444    // make sure this stub is available for all local calls
3445    if (_atomic_add_stub.is_unbound()) {
3446      // generate a second time, if necessary
3447      (void) generate_atomic_add();
3448    }
3449  }
3450
3451
3452 private:
3453  int _stub_count;
3454  void stub_prolog(StubCodeDesc* cdesc) {
3455    # ifdef ASSERT
3456      // put extra information in the stub code, to make it more readable
3457#ifdef _LP64
3458// Write the high part of the address
3459// [RGV] Check if there is a dependency on the size of this prolog
3460      __ emit_data((intptr_t)cdesc >> 32,    relocInfo::none);
3461#endif
3462      __ emit_data((intptr_t)cdesc,    relocInfo::none);
3463      __ emit_data(++_stub_count, relocInfo::none);
3464    # endif
3465    align(true);
3466  }
3467
3468  void align(bool at_header = false) {
3469    // %%%%% move this constant somewhere else
3470    // UltraSPARC cache line size is 8 instructions:
3471    const unsigned int icache_line_size = 32;
3472    const unsigned int icache_half_line_size = 16;
3473
3474    if (at_header) {
3475      while ((intptr_t)(__ pc()) % icache_line_size != 0) {
3476        __ emit_data(0, relocInfo::none);
3477      }
3478    } else {
3479      while ((intptr_t)(__ pc()) % icache_half_line_size != 0) {
3480        __ nop();
3481      }
3482    }
3483  }
3484
3485}; // end class declaration
3486
3487void StubGenerator_generate(CodeBuffer* code, bool all) {
3488  StubGenerator g(code, all);
3489}
3490