stubGenerator_sparc.cpp revision 0:a61af66fc99e
1/*
2 * Copyright 1997-2007 Sun Microsystems, Inc.  All Rights Reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
20 * CA 95054 USA or visit www.sun.com if you need additional information or
21 * have any questions.
22 *
23 */
24
25#include "incls/_precompiled.incl"
26#include "incls/_stubGenerator_sparc.cpp.incl"
27
28// Declaration and definition of StubGenerator (no .hpp file).
29// For a more detailed description of the stub routine structure
30// see the comment in stubRoutines.hpp.
31
32#define __ _masm->
33
34#ifdef PRODUCT
35#define BLOCK_COMMENT(str) /* nothing */
36#else
37#define BLOCK_COMMENT(str) __ block_comment(str)
38#endif
39
40#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
41
42// Note:  The register L7 is used as L7_thread_cache, and may not be used
43//        any other way within this module.
44
45
46static const Register& Lstub_temp = L2;
47
48// -------------------------------------------------------------------------------------------------------------------------
49// Stub Code definitions
50
51static address handle_unsafe_access() {
52  JavaThread* thread = JavaThread::current();
53  address pc  = thread->saved_exception_pc();
54  address npc = thread->saved_exception_npc();
55  // pc is the instruction which we must emulate
56  // doing a no-op is fine:  return garbage from the load
57
58  // request an async exception
59  thread->set_pending_unsafe_access_error();
60
61  // return address of next instruction to execute
62  return npc;
63}
64
65class StubGenerator: public StubCodeGenerator {
66 private:
67
68#ifdef PRODUCT
69#define inc_counter_np(a,b,c) (0)
70#else
71  void inc_counter_np_(int& counter, Register t1, Register t2) {
72    Address counter_addr(t2, (address) &counter);
73    __ sethi(counter_addr);
74    __ ld(counter_addr, t1);
75    __ inc(t1);
76    __ st(t1, counter_addr);
77  }
78#define inc_counter_np(counter, t1, t2) \
79  BLOCK_COMMENT("inc_counter " #counter); \
80  inc_counter_np_(counter, t1, t2);
81#endif
82
83  //----------------------------------------------------------------------------------------------------
84  // Call stubs are used to call Java from C
85
86  address generate_call_stub(address& return_pc) {
87    StubCodeMark mark(this, "StubRoutines", "call_stub");
88    address start = __ pc();
89
90    // Incoming arguments:
91    //
92    // o0         : call wrapper address
93    // o1         : result (address)
94    // o2         : result type
95    // o3         : method
96    // o4         : (interpreter) entry point
97    // o5         : parameters (address)
98    // [sp + 0x5c]: parameter size (in words)
99    // [sp + 0x60]: thread
100    //
101    // +---------------+ <--- sp + 0
102    // |               |
103    // . reg save area .
104    // |               |
105    // +---------------+ <--- sp + 0x40
106    // |               |
107    // . extra 7 slots .
108    // |               |
109    // +---------------+ <--- sp + 0x5c
110    // |  param. size  |
111    // +---------------+ <--- sp + 0x60
112    // |    thread     |
113    // +---------------+
114    // |               |
115
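    // For orientation, the C++ caller reaches this stub through a function
    // pointer shaped roughly like the sketch below (the authoritative
    // declaration is the CallStub typedef in stubRoutines.hpp; parameter
    // names here are illustrative only):
    //
    //   typedef void (*CallStub)(address   link,            // o0
    //                            intptr_t* result,          // o1
    //                            BasicType result_type,     // o2
    //                            methodOopDesc* method,     // o3
    //                            address   entry_point,     // o4
    //                            intptr_t* parameters,      // o5
    //                            int       parameter_size,  // [sp + 0x5c]
    //                            Thread*   thread);         // [sp + 0x60]
    //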
116    // note: if the link argument position changes, adjust
117    //       the code in frame::entry_frame_call_wrapper()
118
119    const Argument link           = Argument(0, false); // used only for GC
120    const Argument result         = Argument(1, false);
121    const Argument result_type    = Argument(2, false);
122    const Argument method         = Argument(3, false);
123    const Argument entry_point    = Argument(4, false);
124    const Argument parameters     = Argument(5, false);
125    const Argument parameter_size = Argument(6, false);
126    const Argument thread         = Argument(7, false);
127
128    // setup thread register
129    __ ld_ptr(thread.as_address(), G2_thread);
130
131#ifdef ASSERT
132    // make sure we have no pending exceptions
133    { const Register t = G3_scratch;
134      Label L;
135      __ ld_ptr(G2_thread, in_bytes(Thread::pending_exception_offset()), t);
136      __ br_null(t, false, Assembler::pt, L);
137      __ delayed()->nop();
138      __ stop("StubRoutines::call_stub: entered with pending exception");
139      __ bind(L);
140    }
141#endif
142
143    // create activation frame & allocate space for parameters
144    { const Register t = G3_scratch;
145      __ ld_ptr(parameter_size.as_address(), t);                // get parameter size (in words)
146      __ add(t, frame::memory_parameter_word_sp_offset, t);     // add space for save area (in words)
147      __ round_to(t, WordsPerLong);                             // make sure it is multiple of 2 (in words)
148      __ sll(t, Interpreter::logStackElementSize(), t);                    // compute number of bytes
149      __ neg(t);                                                // negate so it can be used with save
150      __ save(SP, t, SP);                                       // setup new frame
151    }
152
153    // +---------------+ <--- sp + 0
154    // |               |
155    // . reg save area .
156    // |               |
157    // +---------------+ <--- sp + 0x40
158    // |               |
159    // . extra 7 slots .
160    // |               |
161    // +---------------+ <--- sp + 0x5c
162    // |  empty slot   |      (only if parameter size is even)
163    // +---------------+
164    // |               |
165    // .  parameters   .
166    // |               |
167    // +---------------+ <--- fp + 0
168    // |               |
169    // . reg save area .
170    // |               |
171    // +---------------+ <--- fp + 0x40
172    // |               |
173    // . extra 7 slots .
174    // |               |
175    // +---------------+ <--- fp + 0x5c
176    // |  param. size  |
177    // +---------------+ <--- fp + 0x60
178    // |    thread     |
179    // +---------------+
180    // |               |
181
182    // pass parameters if any
183    BLOCK_COMMENT("pass parameters if any");
184    { const Register src = parameters.as_in().as_register();
185      const Register dst = Lentry_args;
186      const Register tmp = G3_scratch;
187      const Register cnt = G4_scratch;
188
189      // test if any parameters & setup of Lentry_args
190      Label exit;
191      __ ld_ptr(parameter_size.as_in().as_address(), cnt);      // parameter counter
192      __ add( FP, STACK_BIAS, dst );
193      __ tst(cnt);
194      __ br(Assembler::zero, false, Assembler::pn, exit);
195      __ delayed()->sub(dst, BytesPerWord, dst);                 // setup Lentry_args
196
197      // copy parameters if any
198      Label loop;
199      __ BIND(loop);
200      // Store tag first.
201      if (TaggedStackInterpreter) {
202        __ ld_ptr(src, 0, tmp);
203        __ add(src, BytesPerWord, src);  // get next
204        __ st_ptr(tmp, dst, Interpreter::tag_offset_in_bytes());
205      }
206      // Store parameter value
207      __ ld_ptr(src, 0, tmp);
208      __ add(src, BytesPerWord, src);
209      __ st_ptr(tmp, dst, Interpreter::value_offset_in_bytes());
210      __ deccc(cnt);
211      __ br(Assembler::greater, false, Assembler::pt, loop);
212      __ delayed()->sub(dst, Interpreter::stackElementSize(), dst);
213
214      // done
215      __ BIND(exit);
216    }
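    // In C terms the copy loop above does roughly the following (a sketch;
    // the tagged-stack case also stores a tag word per element):
    //
    //   char* dst = (char*)FP + STACK_BIAS - BytesPerWord;
    //   for (int i = 0; i < parameter_count; i++) {
    //     *(intptr_t*)dst = parameters[i];
    //     dst -= Interpreter::stackElementSize();
    //   }
    //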
217
218    // setup parameters, method & call Java function
219#ifdef ASSERT
220    // layout_activation_impl checks its notion of saved SP against
221    // this register, so if this changes update it as well.
222    const Register saved_SP = Lscratch;
223    __ mov(SP, saved_SP);                               // keep track of SP before call
224#endif
225
226    // setup parameters
227    const Register t = G3_scratch;
228    __ ld_ptr(parameter_size.as_in().as_address(), t); // get parameter size (in words)
229    __ sll(t, Interpreter::logStackElementSize(), t);            // compute number of bytes
230    __ sub(FP, t, Gargs);                              // setup parameter pointer
231#ifdef _LP64
232    __ add( Gargs, STACK_BIAS, Gargs );                // Account for LP64 stack bias
233#endif
234    __ mov(SP, O5_savedSP);
235
236
237    // do the call
238    //
239    // the following registers must be set up:
240    //
241    // G2_thread
242    // G5_method
243    // Gargs
244    BLOCK_COMMENT("call Java function");
245    __ jmpl(entry_point.as_in().as_register(), G0, O7);
246    __ delayed()->mov(method.as_in().as_register(), G5_method);   // setup method
247
248    BLOCK_COMMENT("call_stub_return_address:");
249    return_pc = __ pc();
250
251    // The callee, if it wasn't interpreted, can return with SP changed so
252    // we can no longer assert that SP is unchanged.
253
254    // store result depending on type
255    // (everything that is not T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE
256    //  is treated as T_INT)
257    { const Register addr = result     .as_in().as_register();
258      const Register type = result_type.as_in().as_register();
259      Label is_long, is_float, is_double, is_object, exit;
260      __            cmp(type, T_OBJECT);  __ br(Assembler::equal, false, Assembler::pn, is_object);
261      __ delayed()->cmp(type, T_FLOAT);   __ br(Assembler::equal, false, Assembler::pn, is_float);
262      __ delayed()->cmp(type, T_DOUBLE);  __ br(Assembler::equal, false, Assembler::pn, is_double);
263      __ delayed()->cmp(type, T_LONG);    __ br(Assembler::equal, false, Assembler::pn, is_long);
264      __ delayed()->nop();
265
266      // store int result
267      __ st(O0, addr, G0);
268
269      __ BIND(exit);
270      __ ret();
271      __ delayed()->restore();
272
273      __ BIND(is_object);
274      __ ba(false, exit);
275      __ delayed()->st_ptr(O0, addr, G0);
276
277      __ BIND(is_float);
278      __ ba(false, exit);
279      __ delayed()->stf(FloatRegisterImpl::S, F0, addr, G0);
280
281      __ BIND(is_double);
282      __ ba(false, exit);
283      __ delayed()->stf(FloatRegisterImpl::D, F0, addr, G0);
284
285      __ BIND(is_long);
286#ifdef _LP64
287      __ ba(false, exit);
288      __ delayed()->st_long(O0, addr, G0);      // store entire long
289#else
290#if defined(COMPILER2)
291  // All return values are where we want them, except for Longs.  C2 returns
292  // longs in G1 in the 32-bit build whereas the interpreter wants them in O0/O1.
293  // Since in the 32-bit build the interpreter returns longs in both G1 and
294  // O0/O1, we simply always use G1.
295  // Note: I tried to make C2 return longs in O0/O1 and G1 so we wouldn't have to
296  // do this here. Unfortunately if we did a rethrow we'd see a MachEpilog node
297  // first, which would move G1 -> O0/O1 and destroy the exception we were throwing.
298
299      __ ba(false, exit);
300      __ delayed()->stx(G1, addr, G0);  // store entire long
301#else
302      __ st(O1, addr, BytesPerInt);
303      __ ba(false, exit);
304      __ delayed()->st(O0, addr, G0);
305#endif /* COMPILER2 */
306#endif /* _LP64 */
307     }
308     return start;
309  }
310
311
312  //----------------------------------------------------------------------------------------------------
313  // Return point for a Java call if there's an exception thrown in Java code.
314  // The exception is caught and transformed into a pending exception stored in
315  // JavaThread that can be tested from within the VM.
316  //
317  // Oexception: exception oop
318
319  address generate_catch_exception() {
320    StubCodeMark mark(this, "StubRoutines", "catch_exception");
321
322    address start = __ pc();
323    // verify that thread corresponds
324    __ verify_thread();
325
326    const Register& temp_reg = Gtemp;
327    Address pending_exception_addr    (G2_thread, 0, in_bytes(Thread::pending_exception_offset()));
328    Address exception_file_offset_addr(G2_thread, 0, in_bytes(Thread::exception_file_offset   ()));
329    Address exception_line_offset_addr(G2_thread, 0, in_bytes(Thread::exception_line_offset   ()));
330
331    // set pending exception
332    __ verify_oop(Oexception);
333    __ st_ptr(Oexception, pending_exception_addr);
334    __ set((intptr_t)__FILE__, temp_reg);
335    __ st_ptr(temp_reg, exception_file_offset_addr);
336    __ set((intptr_t)__LINE__, temp_reg);
337    __ st(temp_reg, exception_line_offset_addr);
338
339    // complete return to VM
340    assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");
341
342    Address stub_ret(temp_reg, StubRoutines::_call_stub_return_address);
343    __ jump_to(stub_ret);
344    __ delayed()->nop();
345
346    return start;
347  }
348
349
350  //----------------------------------------------------------------------------------------------------
351  // Continuation point for runtime calls returning with a pending exception
352  // The pending exception check happened in the runtime or native call stub
353  // The pending exception in Thread is converted into a Java-level exception
354  //
355  // Contract with Java-level exception handler: O0 = exception
356  //                                             O1 = throwing pc
357
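  // In pseudo-code the stub below does roughly (sketch):
  //
  //   address handler = SharedRuntime::exception_handler_for_return_address(O7 + frame::pc_return_offset);
  //   Oexception  = thread->pending_exception();      // O0
  //   Oissuing_pc = O7 + frame::pc_return_offset;     // O1
  //   thread->clear_pending_exception();
  //   jump to handler;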
358  address generate_forward_exception() {
359    StubCodeMark mark(this, "StubRoutines", "forward_exception");
360    address start = __ pc();
361
362    // Upon entry, O7 has the return address returning into Java
363    // (interpreted or compiled) code; i.e. the return address
364    // becomes the throwing pc.
365
366    const Register& handler_reg = Gtemp;
367
368    Address exception_addr (G2_thread, 0, in_bytes(Thread::pending_exception_offset()));
369
370#ifdef ASSERT
371    // make sure that this code is only executed if there is a pending exception
372    { Label L;
373      __ ld_ptr(exception_addr, Gtemp);
374      __ br_notnull(Gtemp, false, Assembler::pt, L);
375      __ delayed()->nop();
376      __ stop("StubRoutines::forward exception: no pending exception (1)");
377      __ bind(L);
378    }
379#endif
380
381    // compute exception handler into handler_reg
382    __ get_thread();
383    __ ld_ptr(exception_addr, Oexception);
384    __ verify_oop(Oexception);
385    __ save_frame(0);             // compensates for compiler weakness
386    __ add(O7->after_save(), frame::pc_return_offset, Lscratch); // save the issuing PC
387    BLOCK_COMMENT("call exception_handler_for_return_address");
388    __ call_VM_leaf(L7_thread_cache, CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), Lscratch);
389    __ mov(O0, handler_reg);
390    __ restore();                 // compensates for compiler weakness
391
392    __ ld_ptr(exception_addr, Oexception);
393    __ add(O7, frame::pc_return_offset, Oissuing_pc); // save the issuing PC
394
395#ifdef ASSERT
396    // make sure exception is set
397    { Label L;
398      __ br_notnull(Oexception, false, Assembler::pt, L);
399      __ delayed()->nop();
400      __ stop("StubRoutines::forward exception: no pending exception (2)");
401      __ bind(L);
402    }
403#endif
404    // jump to exception handler
405    __ jmp(handler_reg, 0);
406    // clear pending exception
407    __ delayed()->st_ptr(G0, exception_addr);
408
409    return start;
410  }
411
412
413  //------------------------------------------------------------------------------------------------------------------------
414  // Continuation point for throwing of implicit exceptions that are not handled in
415  // the current activation. Fabricates an exception oop and initiates normal
416  // exception dispatching in this frame. Only callee-saved registers are preserved
417  // (through the normal register window / RegisterMap handling).
418  // If the compiler needs all registers to be preserved between the fault
419  // point and the exception handler then it must assume responsibility for that in
420  // AbstractCompiler::continuation_for_implicit_null_exception or
421  // continuation_for_implicit_division_by_zero_exception. All other implicit
422  // exceptions (e.g., NullPointerException or AbstractMethodError on entry) are
423  // either at call sites or otherwise assume that stack unwinding will be initiated,
424  // so caller saved registers were assumed volatile in the compiler.
425
426  // Note that we generate only this stub into a RuntimeStub, because it needs to be
427  // properly traversed and ignored during GC, so we change the meaning of the "__"
428  // macro within this method.
429#undef __
430#define __ masm->
431
432  address generate_throw_exception(const char* name, address runtime_entry, bool restore_saved_exception_pc) {
433#ifdef ASSERT
434    int insts_size = VerifyThread ? 1 * K : 600;
435#else
436    int insts_size = VerifyThread ? 1 * K : 256;
437#endif /* ASSERT */
438    int locs_size  = 32;
439
440    CodeBuffer      code(name, insts_size, locs_size);
441    MacroAssembler* masm = new MacroAssembler(&code);
442
443    __ verify_thread();
444
445    // This is an inlined and slightly modified version of call_VM
446    // which has the ability to fetch the return PC out of thread-local storage
447    __ assert_not_delayed();
448
449    // Note that we always push a frame because on the SPARC
450    // architecture, for all of our implicit exception kinds at call
451    // sites, the implicit exception is taken before the callee frame
452    // is pushed.
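    // In outline the generated stub does roughly (a sketch of the code below):
    //   save_frame();                        // new register window
    //   runtime_entry(thread);               // creates and installs the pending exception
    //   (debug builds: assert an exception is now pending)
    //   tail-call StubRoutines::forward_exception_entry() via O7 linkage,
    //   restoring the window in the delay slot.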
453    __ save_frame(0);
454
455    int frame_complete = __ offset();
456
457    if (restore_saved_exception_pc) {
458      Address saved_exception_pc(G2_thread, 0, in_bytes(JavaThread::saved_exception_pc_offset()));
459      __ ld_ptr(saved_exception_pc, I7);
460      __ sub(I7, frame::pc_return_offset, I7);
461    }
462
463    // Note that we always have a runtime stub frame on the top of stack by this point
464    Register last_java_sp = SP;
465    // 64-bit last_java_sp is biased!
466    __ set_last_Java_frame(last_java_sp, G0);
467    if (VerifyThread)  __ mov(G2_thread, O0); // about to be smashed; pass early
468    __ save_thread(noreg);
469    // do the call
470    BLOCK_COMMENT("call runtime_entry");
471    __ call(runtime_entry, relocInfo::runtime_call_type);
472    if (!VerifyThread)
473      __ delayed()->mov(G2_thread, O0);  // pass thread as first argument
474    else
475      __ delayed()->nop();             // (thread already passed)
476    __ restore_thread(noreg);
477    __ reset_last_Java_frame();
478
479    // check for pending exceptions. use Gtemp as scratch register.
480#ifdef ASSERT
481    Label L;
482
483    Address exception_addr(G2_thread, 0, in_bytes(Thread::pending_exception_offset()));
484    Register scratch_reg = Gtemp;
485    __ ld_ptr(exception_addr, scratch_reg);
486    __ br_notnull(scratch_reg, false, Assembler::pt, L);
487    __ delayed()->nop();
488    __ should_not_reach_here();
489    __ bind(L);
490#endif // ASSERT
491    BLOCK_COMMENT("call forward_exception_entry");
492    __ call(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
493    // we use O7 linkage so that forward_exception_entry has the issuing PC
494    __ delayed()->restore();
495
496    RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, masm->total_frame_size_in_bytes(0), NULL, false);
497    return stub->entry_point();
498  }
499
500#undef __
501#define __ _masm->
502
503
504  // Generate a routine that sets all the registers so we
505  // can tell if the stop routine prints them correctly.
506  address generate_test_stop() {
507    StubCodeMark mark(this, "StubRoutines", "test_stop");
508    address start = __ pc();
509
510    int i;
511
512    __ save_frame(0);
513
514    static jfloat zero = 0.0, one = 1.0;
515
516    // put addr in L0, then load through L0 to F0
517    __ set((intptr_t)&zero, L0);  __ ldf( FloatRegisterImpl::S, L0, 0, F0);
518    __ set((intptr_t)&one,  L0);  __ ldf( FloatRegisterImpl::S, L0, 0, F1); // 1.0 to F1
519
520    // use add to put 2..18 in F2..F18
521    for ( i = 2;  i <= 18;  ++i ) {
522      __ fadd( FloatRegisterImpl::S, F1, as_FloatRegister(i-1),  as_FloatRegister(i));
523    }
524
525    // Now put double 2 in F16, double 18 in F18
526    __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F2, F16 );
527    __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F18, F18 );
528
529    // use add to put 20..32 in F20..F32
530    for (i = 20; i < 32; i += 2) {
531      __ fadd( FloatRegisterImpl::D, F16, as_FloatRegister(i-2),  as_FloatRegister(i));
532    }
533
534    // put 0..7 in i's, 8..15 in l's, 16..23 in o's, 24..31 in g's
535    for ( i = 0; i < 8; ++i ) {
536      if (i < 6) {
537        __ set(     i, as_iRegister(i));
538        __ set(16 + i, as_oRegister(i));
539        __ set(24 + i, as_gRegister(i));
540      }
541      __ set( 8 + i, as_lRegister(i));
542    }
543
544    __ stop("testing stop");
545
546
547    __ ret();
548    __ delayed()->restore();
549
550    return start;
551  }
552
553
554  address generate_stop_subroutine() {
555    StubCodeMark mark(this, "StubRoutines", "stop_subroutine");
556    address start = __ pc();
557
558    __ stop_subroutine();
559
560    return start;
561  }
562
563  address generate_flush_callers_register_windows() {
564    StubCodeMark mark(this, "StubRoutines", "flush_callers_register_windows");
565    address start = __ pc();
566
567    __ flush_windows();
568    __ retl(false);
569    __ delayed()->add( FP, STACK_BIAS, O0 );
570    // The returned value must be a stack pointer whose register save area
571    // is flushed, and will stay flushed while the caller executes.
572
573    return start;
574  }
575
576  // Helper functions for v8 atomic operations.
577  //
578  void get_v8_oop_lock_ptr(Register lock_ptr_reg, Register mark_oop_reg, Register scratch_reg) {
579    if (mark_oop_reg == noreg) {
580      address lock_ptr = (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr();
581      __ set((intptr_t)lock_ptr, lock_ptr_reg);
582    } else {
583      assert(scratch_reg != noreg, "just checking");
584      address lock_ptr = (address)StubRoutines::Sparc::_v8_oop_lock_cache;
585      __ set((intptr_t)lock_ptr, lock_ptr_reg);
586      __ and3(mark_oop_reg, StubRoutines::Sparc::v8_oop_lock_mask_in_place, scratch_reg);
587      __ add(lock_ptr_reg, scratch_reg, lock_ptr_reg);
588    }
589  }
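  // Roughly (sketch): when a mark oop is supplied, the lock pointer is hashed
  // into a small cache of locks so that unrelated objects rarely contend:
  //
  //   lock_ptr = (address)_v8_oop_lock_cache + (mark_oop & v8_oop_lock_mask_in_place);
  //
  // otherwise the single global atomic_memory_operation_lock is used.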
590
591  void generate_v8_lock_prologue(Register lock_reg, Register lock_ptr_reg, Register yield_reg, Label& retry, Label& dontyield, Register mark_oop_reg = noreg, Register scratch_reg = noreg) {
592
593    get_v8_oop_lock_ptr(lock_ptr_reg, mark_oop_reg, scratch_reg);
594    __ set(StubRoutines::Sparc::locked, lock_reg);
595    // Initialize yield counter
596    __ mov(G0,yield_reg);
597
598    __ BIND(retry);
599    __ cmp(yield_reg, V8AtomicOperationUnderLockSpinCount);
600    __ br(Assembler::less, false, Assembler::pt, dontyield);
601    __ delayed()->nop();
602
603    // This code can only be called from inside the VM; this
604    // stub is only invoked from Atomic::add().  We do not
605    // want to use call_VM, because _last_java_sp and such
606    // must already be set.
607    //
608    // Save the regs and make space for a C call
609    __ save(SP, -96, SP);
610    __ save_all_globals_into_locals();
611    BLOCK_COMMENT("call os::naked_sleep");
612    __ call(CAST_FROM_FN_PTR(address, os::naked_sleep));
613    __ delayed()->nop();
614    __ restore_globals_from_locals();
615    __ restore();
616    // reset the counter
617    __ mov(G0,yield_reg);
618
619    __ BIND(dontyield);
620
621    // try to get lock
622    __ swap(lock_ptr_reg, 0, lock_reg);
623
624    // did we get the lock?
625    __ cmp(lock_reg, StubRoutines::Sparc::unlocked);
626    __ br(Assembler::notEqual, true, Assembler::pn, retry);
627    __ delayed()->add(yield_reg,1,yield_reg);
628
629    // yes, got lock. do the operation here.
630  }
631
632  void generate_v8_lock_epilogue(Register lock_reg, Register lock_ptr_reg, Register yield_reg, Label& retry, Label& dontyield, Register mark_oop_reg = noreg, Register scratch_reg = noreg) {
633    __ st(lock_reg, lock_ptr_reg, 0); // unlock
634  }
635
636  // Support for jint Atomic::xchg(jint exchange_value, volatile jint* dest).
637  //
638  // Arguments :
639  //
640  //      exchange_value: O0
641  //      dest:           O1
642  //
643  // Results:
644  //
645  //     O0: the value previously stored in dest
646  //
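  // With UseCASForSwap the stub below is essentially (sketch):
  //
  //   jint old;
  //   do { old = *dest; } while (!cas(dest, old, exchange_value));
  //   return old;
  //
  // otherwise it is a single 'swap' instruction on v9, or the same swap done
  // under the v8 spin lock.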
647  address generate_atomic_xchg() {
648    StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
649    address start = __ pc();
650
651    if (UseCASForSwap) {
652      // Use CAS instead of swap, just in case the MP hardware
653      // prefers to work with just one kind of synch. instruction.
654      Label retry;
655      __ BIND(retry);
656      __ mov(O0, O3);       // scratch copy of exchange value
657      __ ld(O1, 0, O2);     // observe the previous value
658      // try to replace O2 with O3
659      __ cas_under_lock(O1, O2, O3,
660      (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr(),false);
661      __ cmp(O2, O3);
662      __ br(Assembler::notEqual, false, Assembler::pn, retry);
663      __ delayed()->nop();
664
665      __ retl(false);
666      __ delayed()->mov(O2, O0);  // report previous value to caller
667
668    } else {
669      if (VM_Version::v9_instructions_work()) {
670        __ retl(false);
671        __ delayed()->swap(O1, 0, O0);
672      } else {
673        const Register& lock_reg = O2;
674        const Register& lock_ptr_reg = O3;
675        const Register& yield_reg = O4;
676
677        Label retry;
678        Label dontyield;
679
680        generate_v8_lock_prologue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
681        // got the lock, do the swap
682        __ swap(O1, 0, O0);
683
684        generate_v8_lock_epilogue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
685        __ retl(false);
686        __ delayed()->nop();
687      }
688    }
689
690    return start;
691  }
692
693
694  // Support for jint Atomic::cmpxchg(jint exchange_value, volatile jint* dest, jint compare_value)
695  //
696  // Arguments :
697  //
698  //      exchange_value: O0
699  //      dest:           O1
700  //      compare_value:  O2
701  //
702  // Results:
703  //
704  //     O0: the value previously stored in dest
705  //
706  // Overwrites (v8): O3,O4,O5
707  //
708  address generate_atomic_cmpxchg() {
709    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
710    address start = __ pc();
711
712    // cmpxchg(dest, compare_value, exchange_value)
713    __ cas_under_lock(O1, O2, O0,
714      (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr(),false);
715    __ retl(false);
716    __ delayed()->nop();
717
718    return start;
719  }
720
721  // Support for jlong Atomic::cmpxchg(jlong exchange_value, volatile jlong *dest, jlong compare_value)
722  //
723  // Arguments :
724  //
725  //      exchange_value: O1:O0
726  //      dest:           O2
727  //      compare_value:  O4:O3
728  //
729  // Results:
730  //
731  //     O1:O0: the value previously stored in dest
732  //
733  // This only works on V9; on V8 we don't generate any
734  // code and just return NULL.
735  //
736  // Overwrites: G1,G2,G3
737  //
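  // The 64-bit operands arrive as 32-bit register pairs (high word in the
  // even-numbered register).  The packing done below amounts to (sketch):
  //
  //   jlong exchange = ((jlong)O0 << 32) | (O1 & 0xFFFFFFFFL);
  //   jlong compare  = ((jlong)O3 << 32) | (O4 & 0xFFFFFFFFL);
  //   jlong old      = casx(dest /* O2 */, compare, exchange);
  //   // 'old' is split back into O0 (high word) and O1 (low word) on return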
738  address generate_atomic_cmpxchg_long() {
739    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
740    address start = __ pc();
741
742    if (!VM_Version::supports_cx8())
743        return NULL;
744    __ sllx(O0, 32, O0);
745    __ srl(O1, 0, O1);
746    __ or3(O0,O1,O0);      // O0 holds the 64-bit exchange_value
747    __ sllx(O3, 32, O3);
748    __ srl(O4, 0, O4);
749    __ or3(O3,O4,O3);     // O3 holds the 64-bit compare_value
750    __ casx(O2, O3, O0);
751    __ srl(O0, 0, O1);    // unpacked return value in O1:O0
752    __ retl(false);
753    __ delayed()->srlx(O0, 32, O0);
754
755    return start;
756  }
757
758
759  // Support for jint Atomic::add(jint add_value, volatile jint* dest).
760  //
761  // Arguments :
762  //
763  //      add_value: O0   (e.g., +1 or -1)
764  //      dest:      O1
765  //
766  // Results:
767  //
768  //     O0: the new value stored in dest
769  //
770  // Overwrites (v9): O3
771  // Overwrites (v8): O3,O4,O5
772  //
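  // On v9 the stub below is essentially (sketch):
  //
  //   jint old, sum;
  //   do { old = *dest; sum = old + add_value; } while (!cas(dest, old, sum));
  //   return sum;
  //
  // on v8 the same read-modify-write is done under the spin lock acquired by
  // generate_v8_lock_prologue().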
773  address generate_atomic_add() {
774    StubCodeMark mark(this, "StubRoutines", "atomic_add");
775    address start = __ pc();
776    __ BIND(_atomic_add_stub);
777
778    if (VM_Version::v9_instructions_work()) {
779      Label retry;
780      __ BIND(retry);
781
782      __ lduw(O1, 0, O2);
783      __ add(O0,   O2, O3);
784      __ cas(O1,   O2, O3);
785      __ cmp(      O2, O3);
786      __ br(Assembler::notEqual, false, Assembler::pn, retry);
787      __ delayed()->nop();
788      __ retl(false);
789      __ delayed()->add(O0, O2, O0); // note that cas made O2==O3
790    } else {
791      const Register& lock_reg = O2;
792      const Register& lock_ptr_reg = O3;
793      const Register& value_reg = O4;
794      const Register& yield_reg = O5;
795
796      Label retry;
797      Label dontyield;
798
799      generate_v8_lock_prologue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
800      // got lock, do the increment
801      __ ld(O1, 0, value_reg);
802      __ add(O0, value_reg, value_reg);
803      __ st(value_reg, O1, 0);
804
805      // %%% only for RMO and PSO
806      __ membar(Assembler::StoreStore);
807
808      generate_v8_lock_epilogue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
809
810      __ retl(false);
811      __ delayed()->mov(value_reg, O0);
812    }
813
814    return start;
815  }
816  Label _atomic_add_stub;  // called from other stubs
817
818
819  // Support for void OrderAccess::fence().
820  //
821  address generate_fence() {
822    StubCodeMark mark(this, "StubRoutines", "fence");
823    address start = __ pc();
824
825    __ membar(Assembler::Membar_mask_bits(Assembler::LoadLoad  | Assembler::LoadStore |
826                                          Assembler::StoreLoad | Assembler::StoreStore));
827    __ retl(false);
828    __ delayed()->nop();
829
830    return start;
831  }
832
833
834  //------------------------------------------------------------------------------------------------------------------------
835  // The following routine generates a subroutine to throw an asynchronous
836  // UnknownError when an unsafe access gets a fault that could not be
837  // reasonably prevented by the programmer.  (Example: SIGBUS/OBJERR.)
838  //
839  // Arguments :
840  //
841  //      trapping PC:    O7
842  //
843  // Results:
844  //     posts an asynchronous exception, skips the trapping instruction
845  //
846
847  address generate_handler_for_unsafe_access() {
848    StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
849    address start = __ pc();
850
851    const int preserve_register_words = (64 * 2);
852    Address preserve_addr(FP, 0, (-preserve_register_words * wordSize) + STACK_BIAS);
853
854    Register Lthread = L7_thread_cache;
855    int i;
856
857    __ save_frame(0);
858    __ mov(G1, L1);
859    __ mov(G2, L2);
860    __ mov(G3, L3);
861    __ mov(G4, L4);
862    __ mov(G5, L5);
863    for (i = 0; i < (VM_Version::v9_instructions_work() ? 64 : 32); i += 2) {
864      __ stf(FloatRegisterImpl::D, as_FloatRegister(i), preserve_addr, i * wordSize);
865    }
866
867    address entry_point = CAST_FROM_FN_PTR(address, handle_unsafe_access);
868    BLOCK_COMMENT("call handle_unsafe_access");
869    __ call(entry_point, relocInfo::runtime_call_type);
870    __ delayed()->nop();
871
872    __ mov(L1, G1);
873    __ mov(L2, G2);
874    __ mov(L3, G3);
875    __ mov(L4, G4);
876    __ mov(L5, G5);
877    for (i = 0; i < (VM_Version::v9_instructions_work() ? 64 : 32); i += 2) {
878      __ ldf(FloatRegisterImpl::D, preserve_addr, as_FloatRegister(i), i * wordSize);
879    }
880
881    __ verify_thread();
882
883    __ jmp(O0, 0);
884    __ delayed()->restore();
885
886    return start;
887  }
888
889
890  // Support for uint StubRoutine::Sparc::partial_subtype_check( Klass sub, Klass super );
891  // Arguments :
892  //
893  //      ret  : O0, returned
894  //      icc/xcc: set as O0 (depending on wordSize)
895  //      sub  : O1, argument, not changed
896  //      super: O2, argument, not changed
897  //      raddr: O7, blown by call
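  //
  // The generated code amounts to a linear scan of the secondary supers array,
  // roughly (a sketch that ignores the register/flag protocol listed above):
  //
  //   if (sub == super) return 0;                        // hit
  //   objArrayOop ss = sub->secondary_supers();
  //   for (int i = 0; i < ss->length(); i++) {
  //     if (ss->obj_at(i) == super) {
  //       sub->set_secondary_super_cache(super);         // remember the hit
  //       return 0;
  //     }
  //   }
  //   return 1;                                          // miss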
898  address generate_partial_subtype_check() {
899    StubCodeMark mark(this, "StubRoutines", "partial_subtype_check");
900    address start = __ pc();
901    Label loop, miss;
902
903    // Compare super with sub directly, since super is not in its own SSA.
904    // The compiler used to emit this test, but we fold it in here,
905    // to increase overall code density, with no real loss of speed.
906    { Label L;
907      __ cmp(O1, O2);
908      __ brx(Assembler::notEqual, false, Assembler::pt, L);
909      __ delayed()->nop();
910      __ retl();
911      __ delayed()->addcc(G0,0,O0); // set Z flags, zero result
912      __ bind(L);
913    }
914
915#if defined(COMPILER2) && !defined(_LP64)
916    // Do not use a 'save' because it blows the 64-bit O registers.
917    __ add(SP,-4*wordSize,SP);  // Make space for 4 temps
918    __ st_ptr(L0,SP,(frame::register_save_words+0)*wordSize);
919    __ st_ptr(L1,SP,(frame::register_save_words+1)*wordSize);
920    __ st_ptr(L2,SP,(frame::register_save_words+2)*wordSize);
921    __ st_ptr(L3,SP,(frame::register_save_words+3)*wordSize);
922    Register Rret   = O0;
923    Register Rsub   = O1;
924    Register Rsuper = O2;
925#else
926    __ save_frame(0);
927    Register Rret   = I0;
928    Register Rsub   = I1;
929    Register Rsuper = I2;
930#endif
931
932    Register L0_ary_len = L0;
933    Register L1_ary_ptr = L1;
934    Register L2_super   = L2;
935    Register L3_index   = L3;
936
937    inc_counter_np(SharedRuntime::_partial_subtype_ctr, L0, L1);
938
939    __ ld_ptr( Rsub, sizeof(oopDesc) + Klass::secondary_supers_offset_in_bytes(), L3 );
940    __ lduw(L3,arrayOopDesc::length_offset_in_bytes(),L0_ary_len);
941    __ add(L3,arrayOopDesc::base_offset_in_bytes(T_OBJECT),L1_ary_ptr);
942    __ clr(L3_index);           // zero index
943    // Load a little early; will load 1 off the end of the array.
944    // Ok for now; revisit if we have other uses of this routine.
945    __ ld_ptr(L1_ary_ptr,0,L2_super);// Will load a little early
946    __ align(CodeEntryAlignment);
947
948    // The scan loop
949    __ BIND(loop);
950    __ add(L1_ary_ptr,wordSize,L1_ary_ptr); // Bump by OOP size
951    __ cmp(L3_index,L0_ary_len);
952    __ br(Assembler::equal,false,Assembler::pn,miss);
953    __ delayed()->inc(L3_index); // Bump index
954    __ subcc(L2_super,Rsuper,Rret);   // Check for match; zero in Rret for a hit
955    __ brx( Assembler::notEqual, false, Assembler::pt, loop );
956    __ delayed()->ld_ptr(L1_ary_ptr,0,L2_super); // Will load a little early
957
958    // Got a hit; report success; set cache.  Cache load doesn't
959    // happen here; for speed it is directly emitted by the compiler.
960    __ st_ptr( Rsuper, Rsub, sizeof(oopDesc) + Klass::secondary_super_cache_offset_in_bytes() );
961
962#if defined(COMPILER2) && !defined(_LP64)
963    __ ld_ptr(SP,(frame::register_save_words+0)*wordSize,L0);
964    __ ld_ptr(SP,(frame::register_save_words+1)*wordSize,L1);
965    __ ld_ptr(SP,(frame::register_save_words+2)*wordSize,L2);
966    __ ld_ptr(SP,(frame::register_save_words+3)*wordSize,L3);
967    __ retl();                  // Result in Rret is zero; flags set to Z
968    __ delayed()->add(SP,4*wordSize,SP);
969#else
970    __ ret();                   // Result in Rret is zero; flags set to Z
971    __ delayed()->restore();
972#endif
973
974    // Hit or miss falls through here
975    __ BIND(miss);
976    __ addcc(G0,1,Rret);        // set NZ flags, NZ result
977
978#if defined(COMPILER2) && !defined(_LP64)
979    __ ld_ptr(SP,(frame::register_save_words+0)*wordSize,L0);
980    __ ld_ptr(SP,(frame::register_save_words+1)*wordSize,L1);
981    __ ld_ptr(SP,(frame::register_save_words+2)*wordSize,L2);
982    __ ld_ptr(SP,(frame::register_save_words+3)*wordSize,L3);
983    __ retl();                  // Result in Rret is != 0; flags set to NZ
984    __ delayed()->add(SP,4*wordSize,SP);
985#else
986    __ ret();                   // Result in Rret is != 0; flags set to NZ
987    __ delayed()->restore();
988#endif
989
990    return start;
991  }
992
993
994  // Called from MacroAssembler::verify_oop
995  //
996  address generate_verify_oop_subroutine() {
997    StubCodeMark mark(this, "StubRoutines", "verify_oop_stub");
998
999    address start = __ pc();
1000
1001    __ verify_oop_subroutine();
1002
1003    return start;
1004  }
1005
1006  static address disjoint_byte_copy_entry;
1007  static address disjoint_short_copy_entry;
1008  static address disjoint_int_copy_entry;
1009  static address disjoint_long_copy_entry;
1010  static address disjoint_oop_copy_entry;
1011
1012  static address byte_copy_entry;
1013  static address short_copy_entry;
1014  static address int_copy_entry;
1015  static address long_copy_entry;
1016  static address oop_copy_entry;
1017
1018  static address checkcast_copy_entry;
1019
1020  //
1021  //  Verify that a register contains a clean 32-bit positive value
1022  //  (high 32 bits are 0) so it can be used in 64-bit shifts (sllx, srax).
1023  //
1024  //  Input:
1025  //    Rint  -  32-bit value
1026  //    Rtmp  -  scratch
1027  //
1028  void assert_clean_int(Register Rint, Register Rtmp) {
1029#if defined(ASSERT) && defined(_LP64)
1030    __ signx(Rint, Rtmp);
1031    __ cmp(Rint, Rtmp);
1032    __ breakpoint_trap(Assembler::notEqual, Assembler::xcc);
1033#endif
1034  }
1035
1036  //
1037  //  Generate overlap test for array copy stubs
1038  //
1039  //  Input:
1040  //    O0    -  array1
1041  //    O1    -  array2
1042  //    O2    -  element count
1043  //
1044  //  Kills temps:  O3, O4
1045  //
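  //  The test amounts to (sketch):
  //
  //    if (to <= from)  goto no_overlap;                             // copying down or in place
  //    if (to - from >= count << log2_elem_size)  goto no_overlap;   // 'to' starts past the source range
  //    // otherwise fall through to the caller's conjoint (backward) copy code
  //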
1046  void array_overlap_test(address no_overlap_target, int log2_elem_size) {
1047    assert(no_overlap_target != NULL, "must be generated");
1048    array_overlap_test(no_overlap_target, NULL, log2_elem_size);
1049  }
1050  void array_overlap_test(Label& L_no_overlap, int log2_elem_size) {
1051    array_overlap_test(NULL, &L_no_overlap, log2_elem_size);
1052  }
1053  void array_overlap_test(address no_overlap_target, Label* NOLp, int log2_elem_size) {
1054    const Register from       = O0;
1055    const Register to         = O1;
1056    const Register count      = O2;
1057    const Register to_from    = O3; // to - from
1058    const Register byte_count = O4; // count << log2_elem_size
1059
1060      __ subcc(to, from, to_from);
1061      __ sll_ptr(count, log2_elem_size, byte_count);
1062      if (NOLp == NULL)
1063        __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, no_overlap_target);
1064      else
1065        __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, (*NOLp));
1066      __ delayed()->cmp(to_from, byte_count);
1067      if (NOLp == NULL)
1068        __ brx(Assembler::greaterEqual, false, Assembler::pt, no_overlap_target);
1069      else
1070        __ brx(Assembler::greaterEqual, false, Assembler::pt, (*NOLp));
1071      __ delayed()->nop();
1072  }
1073
1074  //
1075  //  Generate pre-write barrier for array.
1076  //
1077  //  Input:
1078  //     addr     - register containing starting address
1079  //     count    - register containing element count
1080  //     tmp      - scratch register
1081  //
1082  //  The input registers are overwritten.
1083  //
1084  void gen_write_ref_array_pre_barrier(Register addr, Register count) {
1085#if 0 // G1 only
1086    BarrierSet* bs = Universe::heap()->barrier_set();
1087    if (bs->has_write_ref_pre_barrier()) {
1088      assert(bs->has_write_ref_array_pre_opt(),
1089             "Else unsupported barrier set.");
1090
1091      assert(addr->is_global() && count->is_global(),
1092             "If not, then we have to fix this code to handle more "
1093             "general cases.");
1094      // Get some new fresh output registers.
1095      __ save_frame(0);
1096      // Save the necessary global regs... will be used after.
1097      __ mov(addr, L0);
1098      __ mov(count, L1);
1099
1100      __ mov(addr, O0);
1101      // Get the count into O1
1102      __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre));
1103      __ delayed()->mov(count, O1);
1104      __ mov(L0, addr);
1105      __ mov(L1, count);
1106      __ restore();
1107    }
1108#endif // 0
1109  }
1110
1111  //
1112  //  Generate post-write barrier for array.
1113  //
1114  //  Input:
1115  //     addr     - register containing starting address
1116  //     count    - register containing element count
1117  //     tmp      - scratch register
1118  //
1119  //  The input registers are overwritten.
1120  //
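  //  For the card-table cases below the generated code is roughly (sketch):
  //
  //    jbyte* first = ct->byte_map_base + ((uintptr_t)addr >> card_shift);
  //    jbyte* last  = ct->byte_map_base +
  //                   (((uintptr_t)addr + count * BytesPerOop - BytesPerOop) >> card_shift);
  //    for (jbyte* p = first; p <= last; p++)  *p = 0;   // 0 == dirty card
  //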
1121  void gen_write_ref_array_post_barrier(Register addr, Register count,
1122                                   Register tmp) {
1123    BarrierSet* bs = Universe::heap()->barrier_set();
1124
1125    switch (bs->kind()) {
1126#if 0 // G1 - only
1127      case BarrierSet::G1SATBCT:
1128      case BarrierSet::G1SATBCTLogging:
1129        {
1130          assert(addr->is_global() && count->is_global(),
1131                 "If not, then we have to fix this code to handle more "
1132                 "general cases.");
1133          // Get some new fresh output registers.
1134          __ save_frame(0);
1135          __ mov(addr, O0);
1136          __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post));
1137          __ delayed()->mov(count, O1);
1138          __ restore();
1139        }
1140        break;
1141#endif // 0 G1 - only
1142      case BarrierSet::CardTableModRef:
1143      case BarrierSet::CardTableExtension:
1144        {
1145          CardTableModRefBS* ct = (CardTableModRefBS*)bs;
1146          assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
1147          assert_different_registers(addr, count, tmp);
1148
1149          Label L_loop;
1150
1151          __ sll_ptr(count, LogBytesPerOop, count);
1152          __ sub(count, BytesPerOop, count);
1153          __ add(count, addr, count);
1154          // Use two shifts to clear out those low order two bits! (Cannot opt. into 1.)
1155          __ srl_ptr(addr, CardTableModRefBS::card_shift, addr);
1156          __ srl_ptr(count, CardTableModRefBS::card_shift, count);
1157          __ sub(count, addr, count);
1158          Address rs(tmp, (address)ct->byte_map_base);
1159          __ load_address(rs);
1160        __ BIND(L_loop);
1161          __ stb(G0, rs.base(), addr);
1162          __ subcc(count, 1, count);
1163          __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
1164          __ delayed()->add(addr, 1, addr);
1165
1166          }
1167        break;
1168      case BarrierSet::ModRef:
1169        break;
1170      default      :
1171        ShouldNotReachHere();
1172
1173    }
1174
1175  }
1176
1177
1178  // Copy big chunks forward with shift
1179  //
1180  // Inputs:
1181  //   from      - source array address
1182  //   to        - destination array address, aligned to 8 bytes
1183  //   count     - element count to copy; must be >= the count equivalent to 16 bytes
1184  //   count_dec - element count decrement equivalent to 16 bytes
1185  //   L_copy_bytes - copy exit label
1186  //
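  // The idea (sketch): once 'to' is 8-byte aligned but 'from' is misaligned by
  // m bytes, read aligned 8-byte words from the aligned-down 'from' and build
  // each output word from two neighbouring input words (SPARC is big-endian):
  //
  //   out[i] = (in[i] << left_shift) | (in[i+1] >> right_shift)
  //
  // with left_shift = m * 8 and right_shift = 64 - left_shift.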
1187  void copy_16_bytes_forward_with_shift(Register from, Register to,
1188                     Register count, int count_dec, Label& L_copy_bytes) {
1189    Label L_loop, L_aligned_copy, L_copy_last_bytes;
1190
1191    // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
1192      __ andcc(from, 7, G1); // misaligned bytes
1193      __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
1194      __ delayed()->nop();
1195
1196    const Register left_shift  = G1; // left  shift bit counter
1197    const Register right_shift = G5; // right shift bit counter
1198
1199      __ sll(G1, LogBitsPerByte, left_shift);
1200      __ mov(64, right_shift);
1201      __ sub(right_shift, left_shift, right_shift);
1202
1203    //
1204    // Load 2 aligned 8-bytes chunks and use one from previous iteration
1205    // to form 2 aligned 8-bytes chunks to store.
1206    //
1207      __ deccc(count, count_dec); // Pre-decrement 'count'
1208      __ andn(from, 7, from);     // Align address
1209      __ ldx(from, 0, O3);
1210      __ inc(from, 8);
1211      __ align(16);
1212    __ BIND(L_loop);
1213      __ ldx(from, 0, O4);
1214      __ deccc(count, count_dec); // Can we do next iteration after this one?
1215      __ ldx(from, 8, G4);
1216      __ inc(to, 16);
1217      __ inc(from, 16);
1218      __ sllx(O3, left_shift,  O3);
1219      __ srlx(O4, right_shift, G3);
1220      __ bset(G3, O3);
1221      __ stx(O3, to, -16);
1222      __ sllx(O4, left_shift,  O4);
1223      __ srlx(G4, right_shift, G3);
1224      __ bset(G3, O4);
1225      __ stx(O4, to, -8);
1226      __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
1227      __ delayed()->mov(G4, O3);
1228
1229      __ inccc(count, count_dec>>1 ); // + 8 bytes
1230      __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
1231      __ delayed()->inc(count, count_dec>>1); // restore 'count'
1232
1233      // copy 8 bytes, part of them already loaded in O3
1234      __ ldx(from, 0, O4);
1235      __ inc(to, 8);
1236      __ inc(from, 8);
1237      __ sllx(O3, left_shift,  O3);
1238      __ srlx(O4, right_shift, G3);
1239      __ bset(O3, G3);
1240      __ stx(G3, to, -8);
1241
1242    __ BIND(L_copy_last_bytes);
1243      __ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes
1244      __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
1245      __ delayed()->sub(from, right_shift, from);       // restore address
1246
1247    __ BIND(L_aligned_copy);
1248  }
1249
1250  // Copy big chunks backward with shift
1251  //
1252  // Inputs:
1253  //   end_from  - source array end address
1254  //   end_to    - destination array end address, aligned to 8 bytes
1255  //   count     - element count to copy; must be >= the count equivalent to 16 bytes
1256  //   count_dec - element count decrement equivalent to 16 bytes
1257  //   L_aligned_copy - aligned copy exit label
1258  //   L_copy_bytes   - copy exit label
1259  //
1260  void copy_16_bytes_backward_with_shift(Register end_from, Register end_to,
1261                     Register count, int count_dec,
1262                     Label& L_aligned_copy, Label& L_copy_bytes) {
1263    Label L_loop, L_copy_last_bytes;
1264
1265    // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
1266      __ andcc(end_from, 7, G1); // misaligned bytes
1267      __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
1268      __ delayed()->deccc(count, count_dec); // Pre-decrement 'count'
1269
1270    const Register left_shift  = G1; // left  shift bit counter
1271    const Register right_shift = G5; // right shift bit counter
1272
1273      __ sll(G1, LogBitsPerByte, left_shift);
1274      __ mov(64, right_shift);
1275      __ sub(right_shift, left_shift, right_shift);
1276
1277    //
1278    // Load 2 aligned 8-bytes chunks and use one from previous iteration
1279    // to form 2 aligned 8-bytes chunks to store.
1280    //
1281      __ andn(end_from, 7, end_from);     // Align address
1282      __ ldx(end_from, 0, O3);
1283      __ align(16);
1284    __ BIND(L_loop);
1285      __ ldx(end_from, -8, O4);
1286      __ deccc(count, count_dec); // Can we do next iteration after this one?
1287      __ ldx(end_from, -16, G4);
1288      __ dec(end_to, 16);
1289      __ dec(end_from, 16);
1290      __ srlx(O3, right_shift, O3);
1291      __ sllx(O4, left_shift,  G3);
1292      __ bset(G3, O3);
1293      __ stx(O3, end_to, 8);
1294      __ srlx(O4, right_shift, O4);
1295      __ sllx(G4, left_shift,  G3);
1296      __ bset(G3, O4);
1297      __ stx(O4, end_to, 0);
1298      __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
1299      __ delayed()->mov(G4, O3);
1300
1301      __ inccc(count, count_dec>>1 ); // + 8 bytes
1302      __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
1303      __ delayed()->inc(count, count_dec>>1); // restore 'count'
1304
1305      // copy 8 bytes, part of them already loaded in O3
1306      __ ldx(end_from, -8, O4);
1307      __ dec(end_to, 8);
1308      __ dec(end_from, 8);
1309      __ srlx(O3, right_shift, O3);
1310      __ sllx(O4, left_shift,  G3);
1311      __ bset(O3, G3);
1312      __ stx(G3, end_to, 0);
1313
1314    __ BIND(L_copy_last_bytes);
1315      __ srl(left_shift, LogBitsPerByte, left_shift);    // misaligned bytes
1316      __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
1317      __ delayed()->add(end_from, left_shift, end_from); // restore address
1318  }
1319
1320  //
1321  //  Generate stub for disjoint byte copy.  If "aligned" is true, the
1322  //  "from" and "to" addresses are assumed to be heapword aligned.
1323  //
1324  // Arguments for generated stub:
1325  //      from:  O0
1326  //      to:    O1
1327  //      count: O2 treated as signed
1328  //
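  // Overall strategy of the stub below (sketch):
  //
  //   if (count < 23)  copy byte by byte;
  //   else {
  //     copy single bytes (or one word, in the statically aligned 32-bit case)
  //       until 'to' is 8-byte aligned;
  //     if ('from' and 'to' differ mod 8)  copy 16 bytes per iteration with shift;
  //     copy 16 bytes per iteration with aligned 8-byte loads and stores;
  //     copy the remaining tail byte by byte;
  //   }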
1329  address generate_disjoint_byte_copy(bool aligned, const char * name) {
1330    __ align(CodeEntryAlignment);
1331    StubCodeMark mark(this, "StubRoutines", name);
1332    address start = __ pc();
1333
1334    Label L_skip_alignment, L_align;
1335    Label L_copy_byte, L_copy_byte_loop, L_exit;
1336
1337    const Register from      = O0;   // source array address
1338    const Register to        = O1;   // destination array address
1339    const Register count     = O2;   // elements count
1340    const Register offset    = O5;   // offset from start of arrays
1341    // O3, O4, G3, G4 are used as temp registers
1342
1343    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1344
1345    if (!aligned)  disjoint_byte_copy_entry = __ pc();
1346    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1347    if (!aligned)  BLOCK_COMMENT("Entry:");
1348
1349    // for short arrays, just do single element copy
1350    __ cmp(count, 23); // 16 + 7
1351    __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
1352    __ delayed()->mov(G0, offset);
1353
1354    if (aligned) {
1355      // 'aligned' == true when it is known statically during compilation
1356      // of this arraycopy call site that both 'from' and 'to' addresses
1357      // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
1358      //
1359      // Aligned arrays have 4-byte alignment in the 32-bit VM
1360      // and 8-byte alignment in the 64-bit VM, so we do this only in the 32-bit VM.
1361      //
1362#ifndef _LP64
1363      // copy a 4-bytes word if necessary to align 'to' to 8 bytes
1364      __ andcc(to, 7, G0);
1365      __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment);
1366      __ delayed()->ld(from, 0, O3);
1367      __ inc(from, 4);
1368      __ inc(to, 4);
1369      __ dec(count, 4);
1370      __ st(O3, to, -4);
1371    __ BIND(L_skip_alignment);
1372#endif
1373    } else {
1374      // copy bytes to align 'to' on 8 byte boundary
1375      __ andcc(to, 7, G1); // misaligned bytes
1376      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1377      __ delayed()->neg(G1);
1378      __ inc(G1, 8);       // bytes need to copy to next 8-bytes alignment
1379      __ sub(count, G1, count);
1380    __ BIND(L_align);
1381      __ ldub(from, 0, O3);
1382      __ deccc(G1);
1383      __ inc(from);
1384      __ stb(O3, to, 0);
1385      __ br(Assembler::notZero, false, Assembler::pt, L_align);
1386      __ delayed()->inc(to);
1387    __ BIND(L_skip_alignment);
1388    }
1389#ifdef _LP64
1390    if (!aligned)
1391#endif
1392    {
1393      // Copy with shift 16 bytes per iteration if arrays do not have
1394      // the same alignment mod 8, otherwise fall through to the next
1395      // code for aligned copy.
1396      // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
1397      // Also jump over aligned copy after the copy with shift completed.
1398
1399      copy_16_bytes_forward_with_shift(from, to, count, 16, L_copy_byte);
1400    }
1401
1402    // Both arrays are 8-byte aligned; copy 16 bytes at a time
1403      __ and3(count, 7, G4); // Save count
1404      __ srl(count, 3, count);
1405     generate_disjoint_long_copy_core(aligned);
1406      __ mov(G4, count);     // Restore count
1407
1408    // copy trailing bytes
1409    __ BIND(L_copy_byte);
1410      __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
1411      __ delayed()->nop();
1412      __ align(16);
1413    __ BIND(L_copy_byte_loop);
1414      __ ldub(from, offset, O3);
1415      __ deccc(count);
1416      __ stb(O3, to, offset);
1417      __ brx(Assembler::notZero, false, Assembler::pt, L_copy_byte_loop);
1418      __ delayed()->inc(offset);
1419
1420    __ BIND(L_exit);
1421      // O3, O4 are used as temp registers
1422      inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
1423      __ retl();
1424      __ delayed()->mov(G0, O0); // return 0
1425    return start;
1426  }
1427
1428  //
1429  //  Generate stub for conjoint byte copy.  If "aligned" is true, the
1430  //  "from" and "to" addresses are assumed to be heapword aligned.
1431  //
1432  // Arguments for generated stub:
1433  //      from:  O0
1434  //      to:    O1
1435  //      count: O2 treated as signed
1436  //
1437  address generate_conjoint_byte_copy(bool aligned, const char * name) {
1438    // Do reverse copy.
1439
1440    __ align(CodeEntryAlignment);
1441    StubCodeMark mark(this, "StubRoutines", name);
1442    address start = __ pc();
1443    address nooverlap_target = aligned ?
1444        StubRoutines::arrayof_jbyte_disjoint_arraycopy() :
1445        disjoint_byte_copy_entry;
1446
1447    Label L_skip_alignment, L_align, L_aligned_copy;
1448    Label L_copy_byte, L_copy_byte_loop, L_exit;
1449
1450    const Register from      = O0;   // source array address
1451    const Register to        = O1;   // destination array address
1452    const Register count     = O2;   // elements count
1453    const Register end_from  = from; // source array end address
1454    const Register end_to    = to;   // destination array end address
1455
1456    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1457
1458    if (!aligned)  byte_copy_entry = __ pc();
1459    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1460    if (!aligned)  BLOCK_COMMENT("Entry:");
1461
1462    array_overlap_test(nooverlap_target, 0);
1463
1464    __ add(to, count, end_to);       // offset after last copied element
1465
1466    // for short arrays, just do single element copy
1467    __ cmp(count, 23); // 16 + 7
1468    __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
1469    __ delayed()->add(from, count, end_from);
1470
1471    {
1472      // Align the ends of the arrays since they may not be aligned even
1473      // when the arrays themselves are aligned.
1474
1475      // copy bytes to align 'end_to' on 8 byte boundary
1476      __ andcc(end_to, 7, G1); // misaligned bytes
1477      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1478      __ delayed()->nop();
1479      __ sub(count, G1, count);
1480    __ BIND(L_align);
1481      __ dec(end_from);
1482      __ dec(end_to);
1483      __ ldub(end_from, 0, O3);
1484      __ deccc(G1);
1485      __ brx(Assembler::notZero, false, Assembler::pt, L_align);
1486      __ delayed()->stb(O3, end_to, 0);
1487    __ BIND(L_skip_alignment);
1488    }
1489#ifdef _LP64
1490    if (aligned) {
1491      // Both arrays are aligned to 8-bytes in 64-bits VM.
1492      // The 'count' is decremented in copy_16_bytes_backward_with_shift()
1493      // in unaligned case.
1494      __ dec(count, 16);
1495    } else
1496#endif
1497    {
1498      // Copy with shift 16 bytes per iteration if arrays do not have
1499      // the same alignment mod 8, otherwise jump to the next
1500      // code for aligned copy (subtracting 16 from 'count' before the jump).
1501      // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
1502      // Also jump over aligned copy after the copy with shift completed.
1503
1504      copy_16_bytes_backward_with_shift(end_from, end_to, count, 16,
1505                                        L_aligned_copy, L_copy_byte);
1506    }
1507    // copy 16 elements (16 bytes) at a time
1508      __ align(16);
1509    __ BIND(L_aligned_copy);
1510      __ dec(end_from, 16);
1511      __ ldx(end_from, 8, O3);
1512      __ ldx(end_from, 0, O4);
1513      __ dec(end_to, 16);
1514      __ deccc(count, 16);
1515      __ stx(O3, end_to, 8);
1516      __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
1517      __ delayed()->stx(O4, end_to, 0);
1518      __ inc(count, 16);
1519
1520    // copy 1 element (1 byte) at a time
1521    __ BIND(L_copy_byte);
1522      __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
1523      __ delayed()->nop();
1524      __ align(16);
1525    __ BIND(L_copy_byte_loop);
1526      __ dec(end_from);
1527      __ dec(end_to);
1528      __ ldub(end_from, 0, O4);
1529      __ deccc(count);
1530      __ brx(Assembler::greater, false, Assembler::pt, L_copy_byte_loop);
1531      __ delayed()->stb(O4, end_to, 0);
1532
1533    __ BIND(L_exit);
1534    // O3, O4 are used as temp registers
1535    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
1536    __ retl();
1537    __ delayed()->mov(G0, O0); // return 0
1538    return start;
1539  }
1540
1541  //
1542  //  Generate stub for disjoint short copy.  If "aligned" is true, the
1543  //  "from" and "to" addresses are assumed to be heapword aligned.
1544  //
1545  // Arguments for generated stub:
1546  //      from:  O0
1547  //      to:    O1
1548  //      count: O2 treated as signed
1549  //
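
  // A rough C-level sketch of the structure of the stub below, assuming
  // 'from' and 'to' end up with the same alignment mod 8 (otherwise the
  // generated code uses a shifting copy instead of the plain 8-byte loop).
  // Illustration only: the name is hypothetical and nothing calls it.
  static void sketch_disjoint_short_copy(const jshort* from, jshort* to, int count) {
    // Copy leading elements until 'to' is 8-byte aligned.
    while (count > 0 && (((uintptr_t)to) & 7) != 0) {
      *to++ = *from++;
      count--;
    }
    int tail = count & 3;                      // remainder, saved in G4 below
    // Bulk copy: 4 shorts (one 8-byte word) per iteration, the job done by
    // generate_disjoint_long_copy_core() in the generated stub.
    for (int i = 0; i < (count >> 2); i++) {
      ((jlong*)to)[i] = ((const jlong*)from)[i];
    }
    from += (count & ~3);
    to   += (count & ~3);
    // Copy the remaining 0..3 elements one at a time.
    while (tail-- > 0) {
      *to++ = *from++;
    }
  }
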
1550  address generate_disjoint_short_copy(bool aligned, const char * name) {
1551    __ align(CodeEntryAlignment);
1552    StubCodeMark mark(this, "StubRoutines", name);
1553    address start = __ pc();
1554
1555    Label L_skip_alignment, L_skip_alignment2;
1556    Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit;
1557
1558    const Register from      = O0;   // source array address
1559    const Register to        = O1;   // destination array address
1560    const Register count     = O2;   // elements count
1561    const Register offset    = O5;   // offset from start of arrays
1562    // O3, O4, G3, G4 are used as temp registers
1563
1564    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1565
1566    if (!aligned)  disjoint_short_copy_entry = __ pc();
1567    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1568    if (!aligned)  BLOCK_COMMENT("Entry:");
1569
1570    // for short arrays, just do single element copy
1571    __ cmp(count, 11); // 8 + 3  (22 bytes)
1572    __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes);
1573    __ delayed()->mov(G0, offset);
1574
1575    if (aligned) {
1576      // 'aligned' == true when it is known statically during compilation
1577      // of this arraycopy call site that both 'from' and 'to' addresses
1578      // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
1579      //
1580      // Aligned arrays have 4 byte alignment in the 32-bit VM
1581      // and 8 byte alignment in the 64-bit VM.
1582      //
1583#ifndef _LP64
1584      // copy a 2-element word if necessary to align 'to' to 8 bytes
1585      __ andcc(to, 7, G0);
1586      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1587      __ delayed()->ld(from, 0, O3);
1588      __ inc(from, 4);
1589      __ inc(to, 4);
1590      __ dec(count, 2);
1591      __ st(O3, to, -4);
1592    __ BIND(L_skip_alignment);
1593#endif
1594    } else {
1595      // copy 1 element if necessary to align 'to' on a 4 byte boundary
1596      __ andcc(to, 3, G0);
1597      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1598      __ delayed()->lduh(from, 0, O3);
1599      __ inc(from, 2);
1600      __ inc(to, 2);
1601      __ dec(count);
1602      __ sth(O3, to, -2);
1603    __ BIND(L_skip_alignment);
1604
1605      // copy 2 elements to align 'to' on an 8 byte boundary
1606      __ andcc(to, 7, G0);
1607      __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
1608      __ delayed()->lduh(from, 0, O3);
1609      __ dec(count, 2);
1610      __ lduh(from, 2, O4);
1611      __ inc(from, 4);
1612      __ inc(to, 4);
1613      __ sth(O3, to, -4);
1614      __ sth(O4, to, -2);
1615    __ BIND(L_skip_alignment2);
1616    }
1617#ifdef _LP64
1618    if (!aligned)
1619#endif
1620    {
1621      // Copy with shift, 16 bytes per iteration, if the arrays do not have
1622      // the same alignment mod 8; otherwise fall through to the next
1623      // code for aligned copy.
1624      // The compare above (count >= 11) guarantees 'count' >= 16 bytes.
1625      // Also jump over the aligned copy after the copy with shift completes.
1626
1627      copy_16_bytes_forward_with_shift(from, to, count, 8, L_copy_2_bytes);
1628    }
1629
1630    // Both arrays are 8 byte aligned; copy 16 bytes at a time
1631      __ and3(count, 3, G4); // Save
1632      __ srl(count, 2, count);
1633     generate_disjoint_long_copy_core(aligned);
1634      __ mov(G4, count); // restore
1635
1636    // copy 1 element at a time
1637    __ BIND(L_copy_2_bytes);
1638      __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
1639      __ delayed()->nop();
1640      __ align(16);
1641    __ BIND(L_copy_2_bytes_loop);
1642      __ lduh(from, offset, O3);
1643      __ deccc(count);
1644      __ sth(O3, to, offset);
1645      __ brx(Assembler::notZero, false, Assembler::pt, L_copy_2_bytes_loop);
1646      __ delayed()->inc(offset, 2);
1647
1648    __ BIND(L_exit);
1649      // O3, O4 are used as temp registers
1650      inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4);
1651      __ retl();
1652      __ delayed()->mov(G0, O0); // return 0
1653    return start;
1654  }
1655
1656  //
1657  //  Generate stub for conjoint short copy.  If "aligned" is true, the
1658  //  "from" and "to" addresses are assumed to be heapword aligned.
1659  //
1660  // Arguments for generated stub:
1661  //      from:  O0
1662  //      to:    O1
1663  //      count: O2 treated as signed
1664  //
1665  address generate_conjoint_short_copy(bool aligned, const char * name) {
1666    // Do reverse copy.
1667
1668    __ align(CodeEntryAlignment);
1669    StubCodeMark mark(this, "StubRoutines", name);
1670    address start = __ pc();
1671    address nooverlap_target = aligned ?
1672        StubRoutines::arrayof_jshort_disjoint_arraycopy() :
1673        disjoint_short_copy_entry;
1674
1675    Label L_skip_alignment, L_skip_alignment2, L_aligned_copy;
1676    Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit;
1677
1678    const Register from      = O0;   // source array address
1679    const Register to        = O1;   // destination array address
1680    const Register count     = O2;   // elements count
1681    const Register end_from  = from; // source array end address
1682    const Register end_to    = to;   // destination array end address
1683
1684    const Register byte_count = O3;  // bytes count to copy
1685
1686    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1687
1688    if (!aligned)  short_copy_entry = __ pc();
1689    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1690    if (!aligned)  BLOCK_COMMENT("Entry:");
1691
1692    array_overlap_test(nooverlap_target, 1);
1693
1694    __ sllx(count, LogBytesPerShort, byte_count);
1695    __ add(to, byte_count, end_to);  // offset after last copied element
1696
1697    // for short arrays, just do single element copy
1698    __ cmp(count, 11); // 8 + 3  (22 bytes)
1699    __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes);
1700    __ delayed()->add(from, byte_count, end_from);
1701
1702    {
1703      // Align the end of the arrays since it could be unaligned even
1704      // when the arrays themselves are aligned.
1705
1706      // copy 1 element if necessary to align 'end_to' on a 4 byte boundary
1707      __ andcc(end_to, 3, G0);
1708      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1709      __ delayed()->lduh(end_from, -2, O3);
1710      __ dec(end_from, 2);
1711      __ dec(end_to, 2);
1712      __ dec(count);
1713      __ sth(O3, end_to, 0);
1714    __ BIND(L_skip_alignment);
1715
1716      // copy 2 elements to align 'end_to' on an 8 byte boundary
1717      __ andcc(end_to, 7, G0);
1718      __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
1719      __ delayed()->lduh(end_from, -2, O3);
1720      __ dec(count, 2);
1721      __ lduh(end_from, -4, O4);
1722      __ dec(end_from, 4);
1723      __ dec(end_to, 4);
1724      __ sth(O3, end_to, 2);
1725      __ sth(O4, end_to, 0);
1726    __ BIND(L_skip_alignment2);
1727    }
1728#ifdef _LP64
1729    if (aligned) {
1730      // Both arrays are aligned to 8 bytes in the 64-bit VM.
1731      // The 'count' is decremented in copy_16_bytes_backward_with_shift()
1732      // in the unaligned case.
1733      __ dec(count, 8);
1734    } else
1735#endif
1736    {
1737      // Copy with shift, 16 bytes per iteration, if the arrays do not have
1738      // the same alignment mod 8; otherwise jump to the next
1739      // code for aligned copy (subtracting 8 from 'count' before the jump).
1740      // The compare above (count >= 11) guarantees 'count' >= 16 bytes.
1741      // Also jump over the aligned copy after the copy with shift completes.
1742
1743      copy_16_bytes_backward_with_shift(end_from, end_to, count, 8,
1744                                        L_aligned_copy, L_copy_2_bytes);
1745    }
1746    // copy 8 elements (16 bytes) at a time
1747      __ align(16);
1748    __ BIND(L_aligned_copy);
1749      __ dec(end_from, 16);
1750      __ ldx(end_from, 8, O3);
1751      __ ldx(end_from, 0, O4);
1752      __ dec(end_to, 16);
1753      __ deccc(count, 8);
1754      __ stx(O3, end_to, 8);
1755      __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
1756      __ delayed()->stx(O4, end_to, 0);
1757      __ inc(count, 8);
1758
1759    // copy 1 element (2 bytes) at a time
1760    __ BIND(L_copy_2_bytes);
1761      __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
1762      __ delayed()->nop();
1763    __ BIND(L_copy_2_bytes_loop);
1764      __ dec(end_from, 2);
1765      __ dec(end_to, 2);
1766      __ lduh(end_from, 0, O4);
1767      __ deccc(count);
1768      __ brx(Assembler::greater, false, Assembler::pt, L_copy_2_bytes_loop);
1769      __ delayed()->sth(O4, end_to, 0);
1770
1771    __ BIND(L_exit);
1772    // O3, O4 are used as temp registers
1773    inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4);
1774    __ retl();
1775    __ delayed()->mov(G0, O0); // return 0
1776    return start;
1777  }
1778
1779  //
1780  //  Generate core code for disjoint int copy (and oop copy on 32-bit).
1781  //  If "aligned" is true, the "from" and "to" addresses are assumed
1782  //  to be heapword aligned.
1783  //
1784  // Arguments:
1785  //      from:  O0
1786  //      to:    O1
1787  //      count: O2 treated as signed
1788  //
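
  // A C-level sketch of one iteration of the "copy with shift" loop used in
  // the core routine below, assuming SPARC's big-endian 64-bit loads and
  // stores, a destination that is 8-byte aligned, a source that is 4-byte but
  // not 8-byte aligned, and 'prev_word' holding the 4 source bytes just
  // before 'from_aligned'.  Illustration only: the names are hypothetical.
  static void sketch_copy_16_bytes_with_shift(juint prev_word,
                                              const julong* from_aligned,
                                              julong* to_aligned) {
    julong next0 = from_aligned[0];   // source bytes 4..11 (aligned 8-byte load)
    julong next1 = from_aligned[1];   // source bytes 12..19
    // Merge the carried word with the top half of each new load so that both
    // stores are 8-byte aligned.
    to_aligned[0] = ((julong)prev_word << 32) | (next0 >> 32);  // bytes 0..7
    to_aligned[1] = (next0 << 32) | (next1 >> 32);              // bytes 8..15
    // The generated loop then carries the low half of 'next1' into the next
    // iteration (the delayed()->mov(G4, O3) below).
  }
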
1789  void generate_disjoint_int_copy_core(bool aligned) {
1790
1791    Label L_skip_alignment, L_aligned_copy;
1792    Label L_copy_16_bytes,  L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
1793
1794    const Register from      = O0;   // source array address
1795    const Register to        = O1;   // destination array address
1796    const Register count     = O2;   // elements count
1797    const Register offset    = O5;   // offset from start of arrays
1798    // O3, O4, G3, G4 are used as temp registers
1799
1800    // 'aligned' == true when it is known statically during compilation
1801    // of this arraycopy call site that both 'from' and 'to' addresses
1802    // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
1803    //
1804    // Aligned arrays have 4 byte alignment in the 32-bit VM
1805    // and 8 byte alignment in the 64-bit VM.
1806    //
1807#ifdef _LP64
1808    if (!aligned)
1809#endif
1810    {
1811      // The next check could be put under 'ifndef' since the code in
1812      // generate_disjoint_long_copy_core() has its own checks and sets 'offset'.
1813
1814      // for short arrays, just do single element copy
1815      __ cmp(count, 5); // 4 + 1 (20 bytes)
1816      __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes);
1817      __ delayed()->mov(G0, offset);
1818
1819      // copy 1 element to align 'to' on an 8 byte boundary
1820      __ andcc(to, 7, G0);
1821      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1822      __ delayed()->ld(from, 0, O3);
1823      __ inc(from, 4);
1824      __ inc(to, 4);
1825      __ dec(count);
1826      __ st(O3, to, -4);
1827    __ BIND(L_skip_alignment);
1828
1829    // if arrays have the same alignment mod 8, do a 4-element copy
1830      __ andcc(from, 7, G0);
1831      __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
1832      __ delayed()->ld(from, 0, O3);
1833
1834    //
1835    // Load 2 aligned 8-bytes chunks and use one from previous iteration
1836    // to form 2 aligned 8-bytes chunks to store.
1837    //
1838    // copy_16_bytes_forward_with_shift() is not used here since this
1839    // code is more optimal.
1840
1841    // copy with shift 4 elements (16 bytes) at a time
1842      __ dec(count, 4);   // The cmp at the beginning guarantees count >= 4
1843
1844      __ align(16);
1845    __ BIND(L_copy_16_bytes);
1846      __ ldx(from, 4, O4);
1847      __ deccc(count, 4); // Can we do next iteration after this one?
1848      __ ldx(from, 12, G4);
1849      __ inc(to, 16);
1850      __ inc(from, 16);
1851      __ sllx(O3, 32, O3);
1852      __ srlx(O4, 32, G3);
1853      __ bset(G3, O3);
1854      __ stx(O3, to, -16);
1855      __ sllx(O4, 32, O4);
1856      __ srlx(G4, 32, G3);
1857      __ bset(G3, O4);
1858      __ stx(O4, to, -8);
1859      __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
1860      __ delayed()->mov(G4, O3);
1861
1862      __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
1863      __ delayed()->inc(count, 4); // restore 'count'
1864
1865    __ BIND(L_aligned_copy);
1866    }
1867    // copy 4 elements (16 bytes) at a time
1868      __ and3(count, 1, G4); // Save
1869      __ srl(count, 1, count);
1870     generate_disjoint_long_copy_core(aligned);
1871      __ mov(G4, count);     // Restore
1872
1873    // copy 1 element at a time
1874    __ BIND(L_copy_4_bytes);
1875      __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
1876      __ delayed()->nop();
1877    __ BIND(L_copy_4_bytes_loop);
1878      __ ld(from, offset, O3);
1879      __ deccc(count);
1880      __ st(O3, to, offset);
1881      __ brx(Assembler::notZero, false, Assembler::pt, L_copy_4_bytes_loop);
1882      __ delayed()->inc(offset, 4);
1883    __ BIND(L_exit);
1884  }
1885
1886  //
1887  //  Generate stub for disjoint int copy.  If "aligned" is true, the
1888  //  "from" and "to" addresses are assumed to be heapword aligned.
1889  //
1890  // Arguments for generated stub:
1891  //      from:  O0
1892  //      to:    O1
1893  //      count: O2 treated as signed
1894  //
1895  address generate_disjoint_int_copy(bool aligned, const char * name) {
1896    __ align(CodeEntryAlignment);
1897    StubCodeMark mark(this, "StubRoutines", name);
1898    address start = __ pc();
1899
1900    const Register count = O2;
1901    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1902
1903    if (!aligned)  disjoint_int_copy_entry = __ pc();
1904    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1905    if (!aligned)  BLOCK_COMMENT("Entry:");
1906
1907    generate_disjoint_int_copy_core(aligned);
1908
1909    // O3, O4 are used as temp registers
1910    inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4);
1911    __ retl();
1912    __ delayed()->mov(G0, O0); // return 0
1913    return start;
1914  }
1915
1916  //
1917  //  Generate core code for conjoint int copy (and oop copy on 32-bit).
1918  //  If "aligned" is true, the "from" and "to" addresses are assumed
1919  //  to be heapword aligned.
1920  //
1921  // Arguments:
1922  //      from:  O0
1923  //      to:    O1
1924  //      count: O2 treated as signed
1925  //
1926  void generate_conjoint_int_copy_core(bool aligned) {
1927    // Do reverse copy.
1928
1929    Label L_skip_alignment, L_aligned_copy;
1930    Label L_copy_16_bytes,  L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
1931
1932    const Register from      = O0;   // source array address
1933    const Register to        = O1;   // destination array address
1934    const Register count     = O2;   // elements count
1935    const Register end_from  = from; // source array end address
1936    const Register end_to    = to;   // destination array end address
1937    // O3, O4, O5, G3 are used as temp registers
1938
1939    const Register byte_count = O3;  // bytes count to copy
1940
1941      __ sllx(count, LogBytesPerInt, byte_count);
1942      __ add(to, byte_count, end_to); // offset after last copied element
1943
1944      __ cmp(count, 5); // for short arrays, just do single element copy
1945      __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes);
1946      __ delayed()->add(from, byte_count, end_from);
1947
1948    // copy 1 element to align 'end_to' on an 8 byte boundary
1949      __ andcc(end_to, 7, G0);
1950      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1951      __ delayed()->nop();
1952      __ dec(count);
1953      __ dec(end_from, 4);
1954      __ dec(end_to,   4);
1955      __ ld(end_from, 0, O4);
1956      __ st(O4, end_to, 0);
1957    __ BIND(L_skip_alignment);
1958
1959    // Check if 'end_from' and 'end_to' have the same alignment.
1960      __ andcc(end_from, 7, G0);
1961      __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
1962      __ delayed()->dec(count, 4); // The cmp at the start guarantees count >= 4
1963
1964    // copy with shift 4 elements (16 bytes) at a time
1965    //
1966    // Load 2 aligned 8-bytes chunks and use one from previous iteration
1967    // to form 2 aligned 8-bytes chunks to store.
1968    //
1969      __ ldx(end_from, -4, O3);
1970      __ align(16);
1971    __ BIND(L_copy_16_bytes);
1972      __ ldx(end_from, -12, O4);
1973      __ deccc(count, 4);
1974      __ ldx(end_from, -20, O5);
1975      __ dec(end_to, 16);
1976      __ dec(end_from, 16);
1977      __ srlx(O3, 32, O3);
1978      __ sllx(O4, 32, G3);
1979      __ bset(G3, O3);
1980      __ stx(O3, end_to, 8);
1981      __ srlx(O4, 32, O4);
1982      __ sllx(O5, 32, G3);
1983      __ bset(O4, G3);
1984      __ stx(G3, end_to, 0);
1985      __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
1986      __ delayed()->mov(O5, O3);
1987
1988      __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
1989      __ delayed()->inc(count, 4);
1990
1991    // copy 4 elements (16 bytes) at a time
1992      __ align(16);
1993    __ BIND(L_aligned_copy);
1994      __ dec(end_from, 16);
1995      __ ldx(end_from, 8, O3);
1996      __ ldx(end_from, 0, O4);
1997      __ dec(end_to, 16);
1998      __ deccc(count, 4);
1999      __ stx(O3, end_to, 8);
2000      __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
2001      __ delayed()->stx(O4, end_to, 0);
2002      __ inc(count, 4);
2003
2004    // copy 1 element (4 bytes) at a time
2005    __ BIND(L_copy_4_bytes);
2006      __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
2007      __ delayed()->nop();
2008    __ BIND(L_copy_4_bytes_loop);
2009      __ dec(end_from, 4);
2010      __ dec(end_to, 4);
2011      __ ld(end_from, 0, O4);
2012      __ deccc(count);
2013      __ brx(Assembler::greater, false, Assembler::pt, L_copy_4_bytes_loop);
2014      __ delayed()->st(O4, end_to, 0);
2015    __ BIND(L_exit);
2016  }
2017
2018  //
2019  //  Generate stub for conjoint int copy.  If "aligned" is true, the
2020  //  "from" and "to" addresses are assumed to be heapword aligned.
2021  //
2022  // Arguments for generated stub:
2023  //      from:  O0
2024  //      to:    O1
2025  //      count: O2 treated as signed
2026  //
2027  address generate_conjoint_int_copy(bool aligned, const char * name) {
2028    __ align(CodeEntryAlignment);
2029    StubCodeMark mark(this, "StubRoutines", name);
2030    address start = __ pc();
2031
2032    address nooverlap_target = aligned ?
2033        StubRoutines::arrayof_jint_disjoint_arraycopy() :
2034        disjoint_int_copy_entry;
2035
2036    assert_clean_int(O2, O3);     // Make sure 'count' is clean int.
2037
2038    if (!aligned)  int_copy_entry = __ pc();
2039    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2040    if (!aligned)  BLOCK_COMMENT("Entry:");
2041
2042    array_overlap_test(nooverlap_target, 2);
2043
2044    generate_conjoint_int_copy_core(aligned);
2045
2046    // O3, O4 are used as temp registers
2047    inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4);
2048    __ retl();
2049    __ delayed()->mov(G0, O0); // return 0
2050    return start;
2051  }
2052
2053  //
2054  //  Generate core code for disjoint long copy (and oop copy on 64-bit).
2055  //  "aligned" is ignored, because we must make the stronger
2056  //  assumption that both addresses are always 64-bit aligned.
2057  //
2058  // Arguments:
2059  //      from:  O0
2060  //      to:    O1
2061  //      count: O2 treated as signed
2062  //
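
  // A C-level sketch of the loop generated below: two 8-byte elements per
  // iteration, then at most one trailing element.  Illustration only: the
  // name is hypothetical and nothing calls it.
  static void sketch_disjoint_long_copy(const jlong* from, jlong* to, int count) {
    int i = 0;
    for (; i + 2 <= count; i += 2) {   // main loop moves 16 bytes per iteration
      to[i]     = from[i];
      to[i + 1] = from[i + 1];
    }
    if (i < count) {                   // 0 or 1 element left over
      to[i] = from[i];
    }
  }
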
2063  void generate_disjoint_long_copy_core(bool aligned) {
2064    Label L_copy_8_bytes, L_copy_16_bytes, L_exit;
2065    const Register from    = O0;  // source array address
2066    const Register to      = O1;  // destination array address
2067    const Register count   = O2;  // elements count
2068    const Register offset0 = O4;  // element offset
2069    const Register offset8 = O5;  // next element offset
2070
2071      __ deccc(count, 2);
2072      __ mov(G0, offset0);   // offset from start of arrays (0)
2073      __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
2074      __ delayed()->add(offset0, 8, offset8);
2075      __ align(16);
2076    __ BIND(L_copy_16_bytes);
2077      __ ldx(from, offset0, O3);
2078      __ ldx(from, offset8, G3);
2079      __ deccc(count, 2);
2080      __ stx(O3, to, offset0);
2081      __ inc(offset0, 16);
2082      __ stx(G3, to, offset8);
2083      __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
2084      __ delayed()->inc(offset8, 16);
2085
2086    __ BIND(L_copy_8_bytes);
2087      __ inccc(count, 2);
2088      __ brx(Assembler::zero, true, Assembler::pn, L_exit );
2089      __ delayed()->mov(offset0, offset8); // Set O5 used by other stubs
2090      __ ldx(from, offset0, O3);
2091      __ stx(O3, to, offset0);
2092    __ BIND(L_exit);
2093  }
2094
2095  //
2096  //  Generate stub for disjoint long copy.
2097  //  "aligned" is ignored, because we must make the stronger
2098  //  assumption that both addresses are always 64-bit aligned.
2099  //
2100  // Arguments for generated stub:
2101  //      from:  O0
2102  //      to:    O1
2103  //      count: O2 treated as signed
2104  //
2105  address generate_disjoint_long_copy(bool aligned, const char * name) {
2106    __ align(CodeEntryAlignment);
2107    StubCodeMark mark(this, "StubRoutines", name);
2108    address start = __ pc();
2109
2110    assert_clean_int(O2, O3);     // Make sure 'count' is clean int.
2111
2112    if (!aligned)  disjoint_long_copy_entry = __ pc();
2113    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2114    if (!aligned)  BLOCK_COMMENT("Entry:");
2115
2116    generate_disjoint_long_copy_core(aligned);
2117
2118    // O3, O4 are used as temp registers
2119    inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4);
2120    __ retl();
2121    __ delayed()->mov(G0, O0); // return 0
2122    return start;
2123  }
2124
2125  //
2126  //  Generate core code for conjoint long copy (and oop copy on 64-bit).
2127  //  "aligned" is ignored, because we must make the stronger
2128  //  assumption that both addresses are always 64-bit aligned.
2129  //
2130  // Arguments:
2131  //      from:  O0
2132  //      to:    O1
2133  //      count: O2 treated as signed
2134  //
2135  void generate_conjoint_long_copy_core(bool aligned) {
2136    // Do reverse copy.
2137    Label L_copy_8_bytes, L_copy_16_bytes, L_exit;
2138    const Register from    = O0;  // source array address
2139    const Register to      = O1;  // destination array address
2140    const Register count   = O2;  // elements count
2141    const Register offset8 = O4;  // element offset
2142    const Register offset0 = O5;  // previous element offset
2143
2144      __ subcc(count, 1, count);
2145      __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_8_bytes );
2146      __ delayed()->sllx(count, LogBytesPerLong, offset8);
2147      __ sub(offset8, 8, offset0);
2148      __ align(16);
2149    __ BIND(L_copy_16_bytes);
2150      __ ldx(from, offset8, O2);
2151      __ ldx(from, offset0, O3);
2152      __ stx(O2, to, offset8);
2153      __ deccc(offset8, 16);      // use offset8 as counter
2154      __ stx(O3, to, offset0);
2155      __ brx(Assembler::greater, false, Assembler::pt, L_copy_16_bytes);
2156      __ delayed()->dec(offset0, 16);
2157
2158    __ BIND(L_copy_8_bytes);
2159      __ brx(Assembler::negative, false, Assembler::pn, L_exit );
2160      __ delayed()->nop();
2161      __ ldx(from, 0, O3);
2162      __ stx(O3, to, 0);
2163    __ BIND(L_exit);
2164  }
2165
2166  //  Generate stub for conjoint long copy.
2167  //  "aligned" is ignored, because we must make the stronger
2168  //  assumption that both addresses are always 64-bit aligned.
2169  //
2170  // Arguments for generated stub:
2171  //      from:  O0
2172  //      to:    O1
2173  //      count: O2 treated as signed
2174  //
2175  address generate_conjoint_long_copy(bool aligned, const char * name) {
2176    __ align(CodeEntryAlignment);
2177    StubCodeMark mark(this, "StubRoutines", name);
2178    address start = __ pc();
2179
2180    assert(!aligned, "usage");
2181    address nooverlap_target = disjoint_long_copy_entry;
2182
2183    assert_clean_int(O2, O3);     // Make sure 'count' is clean int.
2184
2185    if (!aligned)  long_copy_entry = __ pc();
2186    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2187    if (!aligned)  BLOCK_COMMENT("Entry:");
2188
2189    array_overlap_test(nooverlap_target, 3);
2190
2191    generate_conjoint_long_copy_core(aligned);
2192
2193    // O3, O4 are used as temp registers
2194    inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4);
2195    __ retl();
2196    __ delayed()->mov(G0, O0); // return 0
2197    return start;
2198  }
2199
2200  //  Generate stub for disjoint oop copy.  If "aligned" is true, the
2201  //  "from" and "to" addresses are assumed to be heapword aligned.
2202  //
2203  // Arguments for generated stub:
2204  //      from:  O0
2205  //      to:    O1
2206  //      count: O2 treated as signed
2207  //
2208  address generate_disjoint_oop_copy(bool aligned, const char * name) {
2209
2210    const Register from  = O0;  // source array address
2211    const Register to    = O1;  // destination array address
2212    const Register count = O2;  // elements count
2213
2214    __ align(CodeEntryAlignment);
2215    StubCodeMark mark(this, "StubRoutines", name);
2216    address start = __ pc();
2217
2218    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
2219
2220    if (!aligned)  disjoint_oop_copy_entry = __ pc();
2221    // caller can pass a 64-bit byte count here
2222    if (!aligned)  BLOCK_COMMENT("Entry:");
2223
2224    // save arguments for barrier generation
2225    __ mov(to, G1);
2226    __ mov(count, G5);
2227    gen_write_ref_array_pre_barrier(G1, G5);
2228  #ifdef _LP64
2229    generate_disjoint_long_copy_core(aligned);
2230  #else
2231    generate_disjoint_int_copy_core(aligned);
2232  #endif
2233    // O0 is used as temp register
2234    gen_write_ref_array_post_barrier(G1, G5, O0);
2235
2236    // O3, O4 are used as temp registers
2237    inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4);
2238    __ retl();
2239    __ delayed()->mov(G0, O0); // return 0
2240    return start;
2241  }
2242
2243  //  Generate stub for conjoint oop copy.  If "aligned" is true, the
2244  //  "from" and "to" addresses are assumed to be heapword aligned.
2245  //
2246  // Arguments for generated stub:
2247  //      from:  O0
2248  //      to:    O1
2249  //      count: O2 treated as signed
2250  //
2251  address generate_conjoint_oop_copy(bool aligned, const char * name) {
2252
2253    const Register from  = O0;  // source array address
2254    const Register to    = O1;  // destination array address
2255    const Register count = O2;  // elements count
2256
2257    __ align(CodeEntryAlignment);
2258    StubCodeMark mark(this, "StubRoutines", name);
2259    address start = __ pc();
2260
2261    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
2262
2263    if (!aligned)  oop_copy_entry = __ pc();
2264    // caller can pass a 64-bit byte count here
2265    if (!aligned)  BLOCK_COMMENT("Entry:");
2266
2267    // save arguments for barrier generation
2268    __ mov(to, G1);
2269    __ mov(count, G5);
2270
2271    gen_write_ref_array_pre_barrier(G1, G5);
2272
2273    address nooverlap_target = aligned ?
2274        StubRoutines::arrayof_oop_disjoint_arraycopy() :
2275        disjoint_oop_copy_entry;
2276
2277    array_overlap_test(nooverlap_target, LogBytesPerWord);
2278
2279  #ifdef _LP64
2280    generate_conjoint_long_copy_core(aligned);
2281  #else
2282    generate_conjoint_int_copy_core(aligned);
2283  #endif
2284
2285    // O0 is used as temp register
2286    gen_write_ref_array_post_barrier(G1, G5, O0);
2287
2288    // O3, O4 are used as temp registers
2289    inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4);
2290    __ retl();
2291    __ delayed()->mov(G0, O0); // return 0
2292    return start;
2293  }
2294
2295
2296  // Helper for generating a dynamic type check.
2297  // Smashes only the given temp registers.
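
  // A C-level sketch of the fast subtype check generated below, assuming the
  // usual HotSpot fast-path convention: 'super_check_offset' designates either
  // a slot of the supertype display or the secondary-super cache ('sc_offset'),
  // and only the latter case needs the slow partial_subtype_check call.  The
  // names and the 'slow_path' parameter are hypothetical; illustration only.
  static bool sketch_fast_subtype_check(address sub_klass, address super_klass,
                                        int super_check_offset, int sc_offset,
                                        bool (*slow_path)(address, address)) {
    if (sub_klass == super_klass)  return true;             // exact match
    address displayed = *(address*)(sub_klass + super_check_offset);
    if (displayed == super_klass)  return true;             // display hit
    if (super_check_offset != sc_offset)  return false;     // decisive miss
    return slow_path(sub_klass, super_klass);               // search secondary supers
  }
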
2298  void generate_type_check(Register sub_klass,
2299                           Register super_check_offset,
2300                           Register super_klass,
2301                           Register temp,
2302                           Label& L_success,
2303                           Register deccc_hack = noreg) {
2304    assert_different_registers(sub_klass, super_check_offset, super_klass, temp);
2305
2306    BLOCK_COMMENT("type_check:");
2307
2308    Label L_miss;
2309
2310    assert_clean_int(super_check_offset, temp);
2311
2312    // maybe decrement caller's trip count:
2313#define DELAY_SLOT delayed();   \
2314    { if (deccc_hack == noreg) __ nop(); else __ deccc(deccc_hack); }
2315
2316    // if the pointers are equal, we are done (e.g., String[] elements)
2317    __ cmp(sub_klass, super_klass);
2318    __ brx(Assembler::equal, true, Assembler::pt, L_success);
2319    __ DELAY_SLOT;
2320
2321    // check the supertype display:
2322    __ ld_ptr(sub_klass, super_check_offset, temp); // query the super type
2323    __ cmp(super_klass,                      temp); // test the super type
2324    __ brx(Assembler::equal, true, Assembler::pt, L_success);
2325    __ DELAY_SLOT;
2326
2327    int sc_offset = (klassOopDesc::header_size() * HeapWordSize +
2328                     Klass::secondary_super_cache_offset_in_bytes());
2329    __ cmp(super_klass, sc_offset);
2330    __ brx(Assembler::notEqual, true, Assembler::pt, L_miss);
2331    __ delayed()->nop();
2332
2333    __ save_frame(0);
2334    __ mov(sub_klass->after_save(), O1);
2335    // mov(super_klass->after_save(), O2); //fill delay slot
2336    assert(StubRoutines::Sparc::_partial_subtype_check != NULL, "order of generation");
2337    __ call(StubRoutines::Sparc::_partial_subtype_check);
2338    __ delayed()->mov(super_klass->after_save(), O2);
2339    __ restore();
2340
2341    // Upon return, the condition codes are already set.
2342    __ brx(Assembler::equal, true, Assembler::pt, L_success);
2343    __ DELAY_SLOT;
2344
2345#undef DELAY_SLOT
2346
2347    // Fall through on failure!
2348    __ BIND(L_miss);
2349  }
2350
2351
2352  //  Generate stub for checked oop copy.
2353  //
2354  // Arguments for generated stub:
2355  //      from:  O0
2356  //      to:    O1
2357  //      count: O2 treated as signed
2358  //      ckoff: O3 (super_check_offset)
2359  //      ckval: O4 (super_klass)
2360  //      ret:   O0 zero for success; (-1^K) where K is partial transfer count
2361  //
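
  // A C-level sketch of the copy loop generated below, with a hypothetical
  // 'element_is_assignable' predicate standing in for the generated type check
  // and the card marking left out.  Returns 0 on full success, or (-1 ^ K)
  // where K is the number of elements copied before the first failure.
  static int sketch_checkcast_copy(oop* from, oop* to, int count,
                                   bool (*element_is_assignable)(oop)) {
    for (int i = 0; i < count; i++) {
      oop element = from[i];
      if (element != NULL && !element_is_assignable(element)) {
        return ~i;                    // same value as -1 ^ i
      }
      to[i] = element;                // NULLs are always copied
    }
    return 0;
  }
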
2362  address generate_checkcast_copy(const char* name) {
2363
2364    const Register O0_from   = O0;      // source array address
2365    const Register O1_to     = O1;      // destination array address
2366    const Register O2_count  = O2;      // elements count
2367    const Register O3_ckoff  = O3;      // super_check_offset
2368    const Register O4_ckval  = O4;      // super_klass
2369
2370    const Register O5_offset = O5;      // loop var, with stride wordSize
2371    const Register G1_remain = G1;      // loop var, with stride -1
2372    const Register G3_oop    = G3;      // actual oop copied
2373    const Register G4_klass  = G4;      // oop._klass
2374    const Register G5_super  = G5;      // oop._klass._primary_supers[ckval]
2375
2376    __ align(CodeEntryAlignment);
2377    StubCodeMark mark(this, "StubRoutines", name);
2378    address start = __ pc();
2379
2380    int klass_off = oopDesc::klass_offset_in_bytes();
2381
2382    gen_write_ref_array_pre_barrier(G1, G5);
2383
2384
2385#ifdef ASSERT
2386    // We sometimes save a frame (see partial_subtype_check below).
2387    // If this will cause trouble, let's fail now instead of later.
2388    __ save_frame(0);
2389    __ restore();
2390#endif
2391
2392#ifdef ASSERT
2393    // caller guarantees that the arrays really are different
2394    // otherwise, we would have to make conjoint checks
2395    { Label L;
2396      __ mov(O3, G1);           // spill: overlap test smashes O3
2397      __ mov(O4, G4);           // spill: overlap test smashes O4
2398      array_overlap_test(L, LogBytesPerWord);
2399      __ stop("checkcast_copy within a single array");
2400      __ bind(L);
2401      __ mov(G1, O3);
2402      __ mov(G4, O4);
2403    }
2404#endif //ASSERT
2405
2406    assert_clean_int(O2_count, G1);     // Make sure 'count' is clean int.
2407
2408    checkcast_copy_entry = __ pc();
2409    // caller can pass a 64-bit byte count here (from generic stub)
2410    BLOCK_COMMENT("Entry:");
2411
2412    Label load_element, store_element, do_card_marks, fail, done;
2413    __ addcc(O2_count, 0, G1_remain);   // initialize loop index, and test it
2414    __ brx(Assembler::notZero, false, Assembler::pt, load_element);
2415    __ delayed()->mov(G0, O5_offset);   // offset from start of arrays
2416
2417    // Empty array:  Nothing to do.
2418    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4);
2419    __ retl();
2420    __ delayed()->set(0, O0);           // return 0 on (trivial) success
2421
2422    // ======== begin loop ========
2423    // (Loop is rotated; its entry is load_element.)
2424    // Loop variables:
2425    //   (O5 = 0; ; O5 += wordSize) --- offset from src, dest arrays
2426    //   (O2 = len; O2 != 0; O2--) --- number of oops *remaining*
2427    //   G3, G4, G5 --- current oop, oop.klass, oop.klass.super
2428    __ align(16);
2429
2430    __ bind(store_element);
2431    // deccc(G1_remain);                // decrement the count (hoisted)
2432    __ st_ptr(G3_oop, O1_to, O5_offset); // store the oop
2433    __ inc(O5_offset, wordSize);        // step to next offset
2434    __ brx(Assembler::zero, true, Assembler::pt, do_card_marks);
2435    __ delayed()->set(0, O0);           // return 0 on success
2436
2437    // ======== loop entry is here ========
2438    __ bind(load_element);
2439    __ ld_ptr(O0_from, O5_offset, G3_oop);  // load the oop
2440    __ br_null(G3_oop, true, Assembler::pt, store_element);
2441    __ delayed()->deccc(G1_remain);     // decrement the count
2442
2443    __ ld_ptr(G3_oop, klass_off, G4_klass); // query the object klass
2444
2445    generate_type_check(G4_klass, O3_ckoff, O4_ckval, G5_super,
2446                        // branch to this on success:
2447                        store_element,
2448                        // decrement this on success:
2449                        G1_remain);
2450    // ======== end loop ========
2451
2452    // It was a real error; we must depend on the caller to finish the job.
2453    // Register G1 has number of *remaining* oops, O2 number of *total* oops.
2454    // Emit GC store barriers for the oops we have copied (O2 minus G1),
2455    // and report their number to the caller.
2456    __ bind(fail);
2457    __ subcc(O2_count, G1_remain, O2_count);
2458    __ brx(Assembler::zero, false, Assembler::pt, done);
2459    __ delayed()->not1(O2_count, O0);   // report (-1^K) to caller
2460
2461    __ bind(do_card_marks);
2462    gen_write_ref_array_post_barrier(O1_to, O2_count, O3);   // store check on O1[0..O2]
2463
2464    __ bind(done);
2465    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4);
2466    __ retl();
2467    __ delayed()->nop();             // return value in O0
2468
2469    return start;
2470  }
2471
2472
2473  //  Generate 'unsafe' array copy stub
2474  //  Though just as safe as the other stubs, it takes an unscaled
2475  //  size_t argument instead of an element count.
2476  //
2477  // Arguments for generated stub:
2478  //      from:  O0
2479  //      to:    O1
2480  //      count: O2 byte count, treated as ssize_t, can be zero
2481  //
2482  // Examines the alignment of the operands and dispatches
2483  // to a long, int, short, or byte copy loop.
2484  //
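
  // A C-level sketch of the dispatch below: the low bits of 'from', 'to' and
  // the byte count are OR-ed together, and the widest element size whose
  // alignment they all satisfy selects the copy stub.  The return value is the
  // log2 element size of the chosen stub.  Name is hypothetical.
  static int sketch_unsafe_copy_element_shift(address from, address to, size_t byte_count) {
    uintptr_t bits = (uintptr_t)from | (uintptr_t)to | (uintptr_t)byte_count;
    if ((bits & (BytesPerLong  - 1)) == 0)  return LogBytesPerLong;   // jlong stub
    if ((bits & (BytesPerInt   - 1)) == 0)  return LogBytesPerInt;    // jint stub
    if ((bits & (BytesPerShort - 1)) == 0)  return LogBytesPerShort;  // jshort stub
    return 0;                                                         // jbyte stub
  }
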
2485  address generate_unsafe_copy(const char* name) {
2486
2487    const Register O0_from   = O0;      // source array address
2488    const Register O1_to     = O1;      // destination array address
2489    const Register O2_count  = O2;      // elements count
2490
2491    const Register G1_bits   = G1;      // test copy of low bits
2492
2493    __ align(CodeEntryAlignment);
2494    StubCodeMark mark(this, "StubRoutines", name);
2495    address start = __ pc();
2496
2497    // bump this on entry, not on exit:
2498    inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, G1, G3);
2499
2500    __ or3(O0_from, O1_to, G1_bits);
2501    __ or3(O2_count,       G1_bits, G1_bits);
2502
2503    __ btst(BytesPerLong-1, G1_bits);
2504    __ br(Assembler::zero, true, Assembler::pt,
2505          long_copy_entry, relocInfo::runtime_call_type);
2506    // scale the count on the way out:
2507    __ delayed()->srax(O2_count, LogBytesPerLong, O2_count);
2508
2509    __ btst(BytesPerInt-1, G1_bits);
2510    __ br(Assembler::zero, true, Assembler::pt,
2511          int_copy_entry, relocInfo::runtime_call_type);
2512    // scale the count on the way out:
2513    __ delayed()->srax(O2_count, LogBytesPerInt, O2_count);
2514
2515    __ btst(BytesPerShort-1, G1_bits);
2516    __ br(Assembler::zero, true, Assembler::pt,
2517          short_copy_entry, relocInfo::runtime_call_type);
2518    // scale the count on the way out:
2519    __ delayed()->srax(O2_count, LogBytesPerShort, O2_count);
2520
2521    __ br(Assembler::always, false, Assembler::pt,
2522          byte_copy_entry, relocInfo::runtime_call_type);
2523    __ delayed()->nop();
2524
2525    return start;
2526  }
2527
2528
2529  // Perform range checks on the proposed arraycopy.
2530  // Kills the two temps, but nothing else.
2531  // Also, clean the sign bits of src_pos and dst_pos.
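
  // A C-level sketch of the two checks generated below, assuming src_pos,
  // dst_pos and length have already been verified to be non-negative (as the
  // generic stub does before calling this helper).  Name is hypothetical.
  static bool sketch_arraycopy_range_checks(int src_pos, int src_length,
                                            int dst_pos, int dst_length,
                                            int length) {
    if (src_pos + length > src_length)  return false;   // would run off src
    if (dst_pos + length > dst_length)  return false;   // would run off dst
    return true;
  }
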
2532  void arraycopy_range_checks(Register src,     // source array oop (O0)
2533                              Register src_pos, // source position (O1)
2534                              Register dst,     // destination array oop (O2)
2535                              Register dst_pos, // destination position (O3)
2536                              Register length,  // length of copy (O4)
2537                              Register temp1, Register temp2,
2538                              Label& L_failed) {
2539    BLOCK_COMMENT("arraycopy_range_checks:");
2540
2541    //  if (src_pos + length > arrayOop(src)->length() ) FAIL;
2542
2543    const Register array_length = temp1;  // scratch
2544    const Register end_pos      = temp2;  // scratch
2545
2546    // Note:  This next instruction may be in the delay slot of a branch:
2547    __ add(length, src_pos, end_pos);  // src_pos + length
2548    __ lduw(src, arrayOopDesc::length_offset_in_bytes(), array_length);
2549    __ cmp(end_pos, array_length);
2550    __ br(Assembler::greater, false, Assembler::pn, L_failed);
2551
2552    //  if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
2553    __ delayed()->add(length, dst_pos, end_pos); // dst_pos + length
2554    __ lduw(dst, arrayOopDesc::length_offset_in_bytes(), array_length);
2555    __ cmp(end_pos, array_length);
2556    __ br(Assembler::greater, false, Assembler::pn, L_failed);
2557
2558    // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
2559    // Move with sign extension can be used since they are positive.
2560    __ delayed()->signx(src_pos, src_pos);
2561    __ signx(dst_pos, dst_pos);
2562
2563    BLOCK_COMMENT("arraycopy_range_checks done");
2564  }
2565
2566
2567  //
2568  //  Generate generic array copy stubs
2569  //
2570  //  Input:
2571  //    O0    -  src oop
2572  //    O1    -  src_pos
2573  //    O2    -  dst oop
2574  //    O3    -  dst_pos
2575  //    O4    -  element count
2576  //
2577  //  Output:
2578  //    O0 ==  0  -  success
2579  //    O0 == -1  -  need to call System.arraycopy
2580  //
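
  // A C-level sketch of how the generic stub below decodes a typeArray klass
  // layout helper into the array header size (in bytes) and the log2 element
  // size, using the same Klass::_lh_* constants as the generated code.  Name
  // is hypothetical; illustration only.
  static void sketch_decode_layout_helper(jint lh, int& header_size_in_bytes,
                                          int& log2_element_size) {
    header_size_in_bytes = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;
    log2_element_size    = lh & Klass::_lh_log2_element_size_mask;
  }
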
2581  address generate_generic_copy(const char *name) {
2582
2583    Label L_failed, L_objArray;
2584
2585    // Input registers
2586    const Register src      = O0;  // source array oop
2587    const Register src_pos  = O1;  // source position
2588    const Register dst      = O2;  // destination array oop
2589    const Register dst_pos  = O3;  // destination position
2590    const Register length   = O4;  // elements count
2591
2592    // registers used as temp
2593    const Register G3_src_klass = G3; // source array klass
2594    const Register G4_dst_klass = G4; // destination array klass
2595    const Register G5_lh        = G5; // layout helper
2596    const Register O5_temp      = O5;
2597
2598    __ align(CodeEntryAlignment);
2599    StubCodeMark mark(this, "StubRoutines", name);
2600    address start = __ pc();
2601
2602    // bump this on entry, not on exit:
2603    inc_counter_np(SharedRuntime::_generic_array_copy_ctr, G1, G3);
2604
2605    // In principle, the int arguments could be dirty.
2606    //assert_clean_int(src_pos, G1);
2607    //assert_clean_int(dst_pos, G1);
2608    //assert_clean_int(length, G1);
2609
2610    //-----------------------------------------------------------------------
2611    // Assembler stubs will be used for this call to arraycopy
2612    // if the following conditions are met:
2613    //
2614    // (1) src and dst must not be null.
2615    // (2) src_pos must not be negative.
2616    // (3) dst_pos must not be negative.
2617    // (4) length  must not be negative.
2618    // (5) src klass and dst klass should be the same and not NULL.
2619    // (6) src and dst should be arrays.
2620    // (7) src_pos + length must not exceed length of src.
2621    // (8) dst_pos + length must not exceed length of dst.
2622    BLOCK_COMMENT("arraycopy initial argument checks");
2623
2624    //  if (src == NULL) return -1;
2625    __ br_null(src, false, Assembler::pn, L_failed);
2626
2627    //  if (src_pos < 0) return -1;
2628    __ delayed()->tst(src_pos);
2629    __ br(Assembler::negative, false, Assembler::pn, L_failed);
2630    __ delayed()->nop();
2631
2632    //  if (dst == NULL) return -1;
2633    __ br_null(dst, false, Assembler::pn, L_failed);
2634
2635    //  if (dst_pos < 0) return -1;
2636    __ delayed()->tst(dst_pos);
2637    __ br(Assembler::negative, false, Assembler::pn, L_failed);
2638
2639    //  if (length < 0) return -1;
2640    __ delayed()->tst(length);
2641    __ br(Assembler::negative, false, Assembler::pn, L_failed);
2642
2643    BLOCK_COMMENT("arraycopy argument klass checks");
2644    //  get src->klass()
2645    __ delayed()->ld_ptr(src, oopDesc::klass_offset_in_bytes(), G3_src_klass);
2646
2647#ifdef ASSERT
2648    //  assert(src->klass() != NULL);
2649    BLOCK_COMMENT("assert klasses not null");
2650    { Label L_a, L_b;
2651      __ br_notnull(G3_src_klass, false, Assembler::pt, L_b); // it is broken if klass is NULL
2652      __ delayed()->ld_ptr(dst, oopDesc::klass_offset_in_bytes(), G4_dst_klass);
2653      __ bind(L_a);
2654      __ stop("broken null klass");
2655      __ bind(L_b);
2656      __ br_null(G4_dst_klass, false, Assembler::pn, L_a); // this would be broken also
2657      __ delayed()->mov(G0, G4_dst_klass);      // scribble the temp
2658      BLOCK_COMMENT("assert done");
2659    }
2660#endif
2661
2662    // Load layout helper
2663    //
2664    //  |array_tag|     | header_size | element_type |     |log2_element_size|
2665    // 32        30    24            16              8     2                 0
2666    //
2667    //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2668    //
2669
2670    int lh_offset = klassOopDesc::header_size() * HeapWordSize +
2671                    Klass::layout_helper_offset_in_bytes();
2672
2673    // Load the 32-bit signed value. Use the br() instruction with it to check icc.
2674    __ lduw(G3_src_klass, lh_offset, G5_lh);
2675
2676    // Handle objArrays completely differently...
2677    juint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2678    __ set(objArray_lh, O5_temp);
2679    __ cmp(G5_lh,       O5_temp);
2680    __ br(Assembler::equal, false, Assembler::pt, L_objArray);
2681    __ delayed()->ld_ptr(dst, oopDesc::klass_offset_in_bytes(), G4_dst_klass);
2682
2683    //  if (src->klass() != dst->klass()) return -1;
2684    __ cmp(G3_src_klass, G4_dst_klass);
2685    __ brx(Assembler::notEqual, false, Assembler::pn, L_failed);
2686    __ delayed()->nop();
2687
2688    //  if (!src->is_Array()) return -1;
2689    __ cmp(G5_lh, Klass::_lh_neutral_value); // < 0
2690    __ br(Assembler::greaterEqual, false, Assembler::pn, L_failed);
2691
2692    // At this point, it is known to be a typeArray (array_tag 0x3).
2693#ifdef ASSERT
2694    __ delayed()->nop();
2695    { Label L;
2696      jint lh_prim_tag_in_place = (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2697      __ set(lh_prim_tag_in_place, O5_temp);
2698      __ cmp(G5_lh,                O5_temp);
2699      __ br(Assembler::greaterEqual, false, Assembler::pt, L);
2700      __ delayed()->nop();
2701      __ stop("must be a primitive array");
2702      __ bind(L);
2703    }
2704#else
2705    __ delayed();                               // match next insn to prev branch
2706#endif
2707
2708    arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2709                           O5_temp, G4_dst_klass, L_failed);
2710
2711    // typeArrayKlass
2712    //
2713    // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2714    // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2715    //
2716
2717    const Register G4_offset = G4_dst_klass;    // array offset
2718    const Register G3_elsize = G3_src_klass;    // log2 element size
2719
2720    __ srl(G5_lh, Klass::_lh_header_size_shift, G4_offset);
2721    __ and3(G4_offset, Klass::_lh_header_size_mask, G4_offset); // array_offset
2722    __ add(src, G4_offset, src);       // src array offset
2723    __ add(dst, G4_offset, dst);       // dst array offset
2724    __ and3(G5_lh, Klass::_lh_log2_element_size_mask, G3_elsize); // log2 element size
2725
2726    // The next registers should be set before the jump to the corresponding stub
2727    const Register from     = O0;  // source array address
2728    const Register to       = O1;  // destination array address
2729    const Register count    = O2;  // elements count
2730
2731    // 'from', 'to', 'count' registers should be set in this order
2732    // since they are the same as 'src', 'src_pos', 'dst'.
2733
2734    BLOCK_COMMENT("scale indexes to element size");
2735    __ sll_ptr(src_pos, G3_elsize, src_pos);
2736    __ sll_ptr(dst_pos, G3_elsize, dst_pos);
2737    __ add(src, src_pos, from);       // src_addr
2738    __ add(dst, dst_pos, to);         // dst_addr
2739
2740    BLOCK_COMMENT("choose copy loop based on element size");
2741    __ cmp(G3_elsize, 0);
2742    __ br(Assembler::equal,true,Assembler::pt,StubRoutines::_jbyte_arraycopy);
2743    __ delayed()->signx(length, count); // length
2744
2745    __ cmp(G3_elsize, LogBytesPerShort);
2746    __ br(Assembler::equal,true,Assembler::pt,StubRoutines::_jshort_arraycopy);
2747    __ delayed()->signx(length, count); // length
2748
2749    __ cmp(G3_elsize, LogBytesPerInt);
2750    __ br(Assembler::equal,true,Assembler::pt,StubRoutines::_jint_arraycopy);
2751    __ delayed()->signx(length, count); // length
2752#ifdef ASSERT
2753    { Label L;
2754      __ cmp(G3_elsize, LogBytesPerLong);
2755      __ br(Assembler::equal, false, Assembler::pt, L);
2756      __ delayed()->nop();
2757      __ stop("must be long copy, but elsize is wrong");
2758      __ bind(L);
2759    }
2760#endif
2761    __ br(Assembler::always,false,Assembler::pt,StubRoutines::_jlong_arraycopy);
2762    __ delayed()->signx(length, count); // length
2763
2764    // objArrayKlass
2765  __ BIND(L_objArray);
2766    // live at this point:  G3_src_klass, G4_dst_klass, src[_pos], dst[_pos], length
2767
2768    Label L_plain_copy, L_checkcast_copy;
2769    //  test array classes for subtyping
2770    __ cmp(G3_src_klass, G4_dst_klass);         // usual case is exact equality
2771    __ brx(Assembler::notEqual, true, Assembler::pn, L_checkcast_copy);
2772    __ delayed()->lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted from below
2773
2774    // Identically typed arrays can be copied without element-wise checks.
2775    arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2776                           O5_temp, G5_lh, L_failed);
2777
2778    __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset
2779    __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset
2780    __ sll_ptr(src_pos, LogBytesPerOop, src_pos);
2781    __ sll_ptr(dst_pos, LogBytesPerOop, dst_pos);
2782    __ add(src, src_pos, from);       // src_addr
2783    __ add(dst, dst_pos, to);         // dst_addr
2784  __ BIND(L_plain_copy);
2785    __ br(Assembler::always, false, Assembler::pt,StubRoutines::_oop_arraycopy);
2786    __ delayed()->signx(length, count); // length
2787
2788  __ BIND(L_checkcast_copy);
2789    // live at this point:  G3_src_klass, G4_dst_klass
2790    {
2791      // Before looking at dst.length, make sure dst is also an objArray.
2792      // lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted to delay slot
2793      __ cmp(G5_lh,                    O5_temp);
2794      __ br(Assembler::notEqual, false, Assembler::pn, L_failed);
2795
2796      // It is safe to examine both src.length and dst.length.
2797      __ delayed();                             // match next insn to prev branch
2798      arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2799                             O5_temp, G5_lh, L_failed);
2800
2801      // Marshal the base address arguments now, freeing registers.
2802      __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset
2803      __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset
2804      __ sll_ptr(src_pos, LogBytesPerOop, src_pos);
2805      __ sll_ptr(dst_pos, LogBytesPerOop, dst_pos);
2806      __ add(src, src_pos, from);               // src_addr
2807      __ add(dst, dst_pos, to);                 // dst_addr
2808      __ signx(length, count);                  // length (reloaded)
2809
2810      Register sco_temp = O3;                   // this register is free now
2811      assert_different_registers(from, to, count, sco_temp,
2812                                 G4_dst_klass, G3_src_klass);
2813
2814      // Generate the type check.
2815      int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
2816                        Klass::super_check_offset_offset_in_bytes());
2817      __ lduw(G4_dst_klass, sco_offset, sco_temp);
2818      generate_type_check(G3_src_klass, sco_temp, G4_dst_klass,
2819                          O5_temp, L_plain_copy);
2820
2821      // Fetch destination element klass from the objArrayKlass header.
2822      int ek_offset = (klassOopDesc::header_size() * HeapWordSize +
2823                       objArrayKlass::element_klass_offset_in_bytes());
2824
2825      // the checkcast_copy loop needs two extra arguments:
2826      __ ld_ptr(G4_dst_klass, ek_offset, O4);   // dest elem klass
2827      // lduw(O4, sco_offset, O3);              // sco of elem klass
2828
2829      __ br(Assembler::always, false, Assembler::pt, checkcast_copy_entry);
2830      __ delayed()->lduw(O4, sco_offset, O3);
2831    }
2832
2833  __ BIND(L_failed);
2834    __ retl();
2835    __ delayed()->sub(G0, 1, O0); // return -1
2836    return start;
2837  }
2838
2839  void generate_arraycopy_stubs() {
2840
2841    // Note:  the disjoint stubs must be generated first, since some of
2842    //        the conjoint stubs use them.
2843    StubRoutines::_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(false, "jbyte_disjoint_arraycopy");
2844    StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, "jshort_disjoint_arraycopy");
2845    StubRoutines::_jint_disjoint_arraycopy   = generate_disjoint_int_copy(false, "jint_disjoint_arraycopy");
2846    StubRoutines::_jlong_disjoint_arraycopy  = generate_disjoint_long_copy(false, "jlong_disjoint_arraycopy");
2847    StubRoutines::_oop_disjoint_arraycopy    = generate_disjoint_oop_copy(false, "oop_disjoint_arraycopy");
2848    StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(true, "arrayof_jbyte_disjoint_arraycopy");
2849    StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, "arrayof_jshort_disjoint_arraycopy");
2850    StubRoutines::_arrayof_jint_disjoint_arraycopy   = generate_disjoint_int_copy(true, "arrayof_jint_disjoint_arraycopy");
2851    StubRoutines::_arrayof_jlong_disjoint_arraycopy  = generate_disjoint_long_copy(true, "arrayof_jlong_disjoint_arraycopy");
2852    StubRoutines::_arrayof_oop_disjoint_arraycopy    =  generate_disjoint_oop_copy(true, "arrayof_oop_disjoint_arraycopy");
2853
2854    StubRoutines::_jbyte_arraycopy  = generate_conjoint_byte_copy(false, "jbyte_arraycopy");
2855    StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, "jshort_arraycopy");
2856    StubRoutines::_jint_arraycopy   = generate_conjoint_int_copy(false, "jint_arraycopy");
2857    StubRoutines::_jlong_arraycopy  = generate_conjoint_long_copy(false, "jlong_arraycopy");
2858    StubRoutines::_oop_arraycopy    = generate_conjoint_oop_copy(false, "oop_arraycopy");
2859    StubRoutines::_arrayof_jbyte_arraycopy    = generate_conjoint_byte_copy(true, "arrayof_jbyte_arraycopy");
2860    StubRoutines::_arrayof_jshort_arraycopy   = generate_conjoint_short_copy(true, "arrayof_jshort_arraycopy");
2861#ifdef _LP64
2862    // since sizeof(jint) < sizeof(HeapWord), there's a different flavor:
2863    StubRoutines::_arrayof_jint_arraycopy     = generate_conjoint_int_copy(true, "arrayof_jint_arraycopy");
2864  #else
2865    StubRoutines::_arrayof_jint_arraycopy     = StubRoutines::_jint_arraycopy;
2866#endif
2867    StubRoutines::_arrayof_jlong_arraycopy    = StubRoutines::_jlong_arraycopy;
2868    StubRoutines::_arrayof_oop_arraycopy      = StubRoutines::_oop_arraycopy;
2869
2870    StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy");
2871    StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy");
2872    StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy");
2873  }
2874
2875  void generate_initial() {
2876    // Generates all stubs and initializes the entry points
2877
2878    //------------------------------------------------------------------------------------------------------------------------
2879    // entry points that exist in all platforms
2880    // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller than
2881    //       the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp.
2882    StubRoutines::_forward_exception_entry                 = generate_forward_exception();
2883
2884    StubRoutines::_call_stub_entry                         = generate_call_stub(StubRoutines::_call_stub_return_address);
2885    StubRoutines::_catch_exception_entry                   = generate_catch_exception();
2886
2887    //------------------------------------------------------------------------------------------------------------------------
2888    // entry points that are platform specific
2889    StubRoutines::Sparc::_test_stop_entry                  = generate_test_stop();
2890
2891    StubRoutines::Sparc::_stop_subroutine_entry            = generate_stop_subroutine();
2892    StubRoutines::Sparc::_flush_callers_register_windows_entry = generate_flush_callers_register_windows();
2893
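        // These atomic stubs back the Atomic:: operations on 32-bit builds
        // without COMPILER2; the other configurations implement the
        // operations inline and do not need them.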
2894#if !defined(COMPILER2) && !defined(_LP64)
2895    StubRoutines::_atomic_xchg_entry         = generate_atomic_xchg();
2896    StubRoutines::_atomic_cmpxchg_entry      = generate_atomic_cmpxchg();
2897    StubRoutines::_atomic_add_entry          = generate_atomic_add();
2898    StubRoutines::_atomic_xchg_ptr_entry     = StubRoutines::_atomic_xchg_entry;
2899    StubRoutines::_atomic_cmpxchg_ptr_entry  = StubRoutines::_atomic_cmpxchg_entry;
2900    StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long();
2901    StubRoutines::_atomic_add_ptr_entry      = StubRoutines::_atomic_add_entry;
2902    StubRoutines::_fence_entry               = generate_fence();
2903#endif  // !COMPILER2 && !_LP64
2904
2905    StubRoutines::Sparc::_partial_subtype_check                = generate_partial_subtype_check();
2906  }
2907
2908
2909  void generate_all() {
2910    // Generates the remaining stubs and initializes their entry points (the early stubs come from generate_initial())
2911
2912    // These entry points require SharedInfo::stack0 to be set up in non-core builds
2913    StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError),  false);
2914    StubRoutines::_throw_ArithmeticException_entry         = generate_throw_exception("ArithmeticException throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_ArithmeticException),  true);
2915    StubRoutines::_throw_NullPointerException_entry        = generate_throw_exception("NullPointerException throw_exception",         CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException), true);
2916    StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);
2917    StubRoutines::_throw_StackOverflowError_entry          = generate_throw_exception("StackOverflowError throw_exception",           CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError),   false);
2918
2919    StubRoutines::_handler_for_unsafe_access_entry =
2920      generate_handler_for_unsafe_access();
2921
2922    // support for verify_oop (must happen after universe_init)
2923    StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop_subroutine();
2924
2925    // arraycopy stubs used by compilers
2926    generate_arraycopy_stubs();
2927  }
2928
2929
2930 public:
2931  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
2932    // replace the standard masm with a special one:
2933    _masm = new MacroAssembler(code);
2934
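        // _stub_count is emitted into each stub by stub_prolog() in ASSERT builds;
        // starting the initial pass at 0x100 and the full pass at 0x200 makes it
        // easy to see which pass produced a given stub.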
2935    _stub_count = !all ? 0x100 : 0x200;
2936    if (all) {
2937      generate_all();
2938    } else {
2939      generate_initial();
2940    }
2941
2942    // make sure this stub is available for all local calls
2943    if (_atomic_add_stub.is_unbound()) {
2944      // generate a second time, if necessary
2945      (void) generate_atomic_add();
2946    }
2947  }
2948
2949
2950 private:
2951  int _stub_count;
2952  void stub_prolog(StubCodeDesc* cdesc) {
2953#ifdef ASSERT
2954      // put extra information in the stub code, to make it more readable
2955#ifdef _LP64
2956      // Write the high part of the address
2957      // [RGV] Check if there is a dependency on the size of this prolog
2958      __ emit_data((intptr_t)cdesc >> 32, relocInfo::none);
2959#endif
2960      __ emit_data((intptr_t)cdesc, relocInfo::none);
2961      __ emit_data(++_stub_count, relocInfo::none);
2962#endif  // ASSERT
2963    align(true);
2964  }
2965
2966  void align(bool at_header = false) {
2967    // %%%%% move this constant somewhere else
2968    // UltraSPARC I-cache line size is 8 instructions (32 bytes):
2969    const unsigned int icache_line_size = 32;
2970    const unsigned int icache_half_line_size = 16;
2971
2972    if (at_header) {
2973      while ((intptr_t)(__ pc()) % icache_line_size != 0) {
2974        __ emit_data(0, relocInfo::none);
2975      }
2976    } else {
2977      while ((intptr_t)(__ pc()) % icache_half_line_size != 0) {
2978        __ nop();
2979      }
2980    }
2981  }
2982
2983}; // end class declaration
2984
2985
2986address StubGenerator::disjoint_byte_copy_entry  = NULL;
2987address StubGenerator::disjoint_short_copy_entry = NULL;
2988address StubGenerator::disjoint_int_copy_entry   = NULL;
2989address StubGenerator::disjoint_long_copy_entry  = NULL;
2990address StubGenerator::disjoint_oop_copy_entry   = NULL;
2991
2992address StubGenerator::byte_copy_entry  = NULL;
2993address StubGenerator::short_copy_entry = NULL;
2994address StubGenerator::int_copy_entry   = NULL;
2995address StubGenerator::long_copy_entry  = NULL;
2996address StubGenerator::oop_copy_entry   = NULL;
2997
2998address StubGenerator::checkcast_copy_entry = NULL;
2999
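    // Called during StubRoutines initialization: once early with all == false
    // to generate the initial stubs, and again with all == true for the rest.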
3000void StubGenerator_generate(CodeBuffer* code, bool all) {
3001  StubGenerator g(code, all);
3002}
3003