1/*
2 * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25#include "precompiled.hpp"
26#include "asm/macroAssembler.inline.hpp"
27#include "interpreter/interpreter.hpp"
28#include "nativeInst_sparc.hpp"
29#include "oops/instanceOop.hpp"
30#include "oops/method.hpp"
31#include "oops/objArrayKlass.hpp"
32#include "oops/oop.inline.hpp"
33#include "prims/methodHandles.hpp"
34#include "runtime/frame.inline.hpp"
35#include "runtime/handles.inline.hpp"
36#include "runtime/sharedRuntime.hpp"
37#include "runtime/stubCodeGenerator.hpp"
38#include "runtime/stubRoutines.hpp"
39#include "runtime/thread.inline.hpp"
40#ifdef COMPILER2
41#include "opto/runtime.hpp"
42#endif
43
44// Declaration and definition of StubGenerator (no .hpp file).
45// For a more detailed description of the stub routine structure
46// see the comment in stubRoutines.hpp.
47
48#define __ _masm->
49
50#ifdef PRODUCT
51#define BLOCK_COMMENT(str) /* nothing */
52#else
53#define BLOCK_COMMENT(str) __ block_comment(str)
54#endif
55
56#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
57
58// Note:  The register L7 is used as L7_thread_cache, and may not be used
59//        any other way within this module.
60
61
62static const Register& Lstub_temp = L2;
63
64// -------------------------------------------------------------------------------------------------------------------------
65// Stub Code definitions
66
67class StubGenerator: public StubCodeGenerator {
68 private:
69
70#ifdef PRODUCT
71#define inc_counter_np(a,b,c)
72#else
73#define inc_counter_np(counter, t1, t2) \
74  BLOCK_COMMENT("inc_counter " #counter); \
75  __ inc_counter(&counter, t1, t2);
76#endif
77
78  //----------------------------------------------------------------------------------------------------
79  // Call stubs are used to call Java from C
80
81  address generate_call_stub(address& return_pc) {
82    StubCodeMark mark(this, "StubRoutines", "call_stub");
83    address start = __ pc();
84
85    // Incoming arguments:
86    //
87    // o0         : call wrapper address
88    // o1         : result (address)
89    // o2         : result type
90    // o3         : method
91    // o4         : (interpreter) entry point
92    // o5         : parameters (address)
93    // [sp + 0x5c]: parameter size (in words)
94    // [sp + 0x60]: thread
95    //
96    // +---------------+ <--- sp + 0
97    // |               |
98    // . reg save area .
99    // |               |
100    // +---------------+ <--- sp + 0x40
101    // |               |
102    // . extra 7 slots .
103    // |               |
104    // +---------------+ <--- sp + 0x5c
105    // |  param. size  |
106    // +---------------+ <--- sp + 0x60
107    // |    thread     |
108    // +---------------+
109    // |               |
110
111    // note: if the link argument position changes, adjust
112    //       the code in frame::entry_frame_call_wrapper()
113
114    const Argument link           = Argument(0, false); // used only for GC
115    const Argument result         = Argument(1, false);
116    const Argument result_type    = Argument(2, false);
117    const Argument method         = Argument(3, false);
118    const Argument entry_point    = Argument(4, false);
119    const Argument parameters     = Argument(5, false);
120    const Argument parameter_size = Argument(6, false);
121    const Argument thread         = Argument(7, false);
122
123    // setup thread register
124    __ ld_ptr(thread.as_address(), G2_thread);
125    __ reinit_heapbase();
126
127#ifdef ASSERT
128    // make sure we have no pending exceptions
129    { const Register t = G3_scratch;
130      Label L;
131      __ ld_ptr(G2_thread, in_bytes(Thread::pending_exception_offset()), t);
132      __ br_null_short(t, Assembler::pt, L);
133      __ stop("StubRoutines::call_stub: entered with pending exception");
134      __ bind(L);
135    }
136#endif
137
138    // create activation frame & allocate space for parameters
139    { const Register t = G3_scratch;
140      __ ld_ptr(parameter_size.as_address(), t);                // get parameter size (in words)
141      __ add(t, frame::memory_parameter_word_sp_offset, t);     // add space for save area (in words)
142      __ round_to(t, WordsPerLong);                             // make sure it is multiple of 2 (in words)
143      __ sll(t, Interpreter::logStackElementSize, t);           // compute number of bytes
144      __ neg(t);                                                // negate so it can be used with save
145      __ save(SP, t, SP);                                       // setup new frame
146    }
147
148    // +---------------+ <--- sp + 0
149    // |               |
150    // . reg save area .
151    // |               |
152    // +---------------+ <--- sp + 0x40
153    // |               |
154    // . extra 7 slots .
155    // |               |
156    // +---------------+ <--- sp + 0x5c
157    // |  empty slot   |      (only if parameter size is even)
158    // +---------------+
159    // |               |
160    // .  parameters   .
161    // |               |
162    // +---------------+ <--- fp + 0
163    // |               |
164    // . reg save area .
165    // |               |
166    // +---------------+ <--- fp + 0x40
167    // |               |
168    // . extra 7 slots .
169    // |               |
170    // +---------------+ <--- fp + 0x5c
171    // |  param. size  |
172    // +---------------+ <--- fp + 0x60
173    // |    thread     |
174    // +---------------+
175    // |               |
176
177    // pass parameters if any
178    BLOCK_COMMENT("pass parameters if any");
179    { const Register src = parameters.as_in().as_register();
180      const Register dst = Lentry_args;
181      const Register tmp = G3_scratch;
182      const Register cnt = G4_scratch;
183
184      // test if any parameters & setup of Lentry_args
185      Label exit;
186      __ ld_ptr(parameter_size.as_in().as_address(), cnt);      // parameter counter
187      __ add( FP, STACK_BIAS, dst );
188      __ cmp_zero_and_br(Assembler::zero, cnt, exit);
189      __ delayed()->sub(dst, BytesPerWord, dst);                 // setup Lentry_args
190
191      // copy parameters if any
192      Label loop;
193      __ BIND(loop);
194      // Store parameter value
195      __ ld_ptr(src, 0, tmp);
196      __ add(src, BytesPerWord, src);
197      __ st_ptr(tmp, dst, 0);
198      __ deccc(cnt);
199      __ br(Assembler::greater, false, Assembler::pt, loop);
200      __ delayed()->sub(dst, Interpreter::stackElementSize, dst);
201
202      // done
203      __ BIND(exit);
204    }
205
206    // setup parameters, method & call Java function
207#ifdef ASSERT
208    // layout_activation_impl checks it's notion of saved SP against
209    // this register, so if this changes update it as well.
210    const Register saved_SP = Lscratch;
211    __ mov(SP, saved_SP);                               // keep track of SP before call
212#endif
213
214    // setup parameters
215    const Register t = G3_scratch;
216    __ ld_ptr(parameter_size.as_in().as_address(), t); // get parameter size (in words)
217    __ sll(t, Interpreter::logStackElementSize, t);    // compute number of bytes
218    __ sub(FP, t, Gargs);                              // setup parameter pointer
219#ifdef _LP64
220    __ add( Gargs, STACK_BIAS, Gargs );                // Account for LP64 stack bias
221#endif
222    __ mov(SP, O5_savedSP);
223
224
225    // do the call
226    //
227    // the following register must be setup:
228    //
229    // G2_thread
230    // G5_method
231    // Gargs
232    BLOCK_COMMENT("call Java function");
233    __ jmpl(entry_point.as_in().as_register(), G0, O7);
234    __ delayed()->mov(method.as_in().as_register(), G5_method);   // setup method
235
236    BLOCK_COMMENT("call_stub_return_address:");
237    return_pc = __ pc();
238
239    // The callee, if it wasn't interpreted, can return with SP changed so
240    // we can no longer assert of change of SP.
241
242    // store result depending on type
243    // (everything that is not T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE
244    //  is treated as T_INT)
245    { const Register addr = result     .as_in().as_register();
246      const Register type = result_type.as_in().as_register();
247      Label is_long, is_float, is_double, is_object, exit;
248      __            cmp(type, T_OBJECT);  __ br(Assembler::equal, false, Assembler::pn, is_object);
249      __ delayed()->cmp(type, T_FLOAT);   __ br(Assembler::equal, false, Assembler::pn, is_float);
250      __ delayed()->cmp(type, T_DOUBLE);  __ br(Assembler::equal, false, Assembler::pn, is_double);
251      __ delayed()->cmp(type, T_LONG);    __ br(Assembler::equal, false, Assembler::pn, is_long);
252      __ delayed()->nop();
253
254      // store int result
255      __ st(O0, addr, G0);
256
257      __ BIND(exit);
258      __ ret();
259      __ delayed()->restore();
260
261      __ BIND(is_object);
262      __ ba(exit);
263      __ delayed()->st_ptr(O0, addr, G0);
264
265      __ BIND(is_float);
266      __ ba(exit);
267      __ delayed()->stf(FloatRegisterImpl::S, F0, addr, G0);
268
269      __ BIND(is_double);
270      __ ba(exit);
271      __ delayed()->stf(FloatRegisterImpl::D, F0, addr, G0);
272
273      __ BIND(is_long);
274#ifdef _LP64
275      __ ba(exit);
276      __ delayed()->st_long(O0, addr, G0);      // store entire long
277#else
278#if defined(COMPILER2)
279  // All return values are where we want them, except for Longs.  C2 returns
280  // longs in G1 in the 32-bit build whereas the interpreter wants them in O0/O1.
281  // Since the interpreter will return longs in G1 and O0/O1 in the 32bit
282  // build we simply always use G1.
283  // Note: I tried to make c2 return longs in O0/O1 and G1 so we wouldn't have to
284  // do this here. Unfortunately if we did a rethrow we'd see an machepilog node
285  // first which would move g1 -> O0/O1 and destroy the exception we were throwing.
286
287      __ ba(exit);
288      __ delayed()->stx(G1, addr, G0);  // store entire long
289#else
290      __ st(O1, addr, BytesPerInt);
291      __ ba(exit);
292      __ delayed()->st(O0, addr, G0);
293#endif /* COMPILER2 */
294#endif /* _LP64 */
295     }
296     return start;
297  }
298
299
300  //----------------------------------------------------------------------------------------------------
301  // Return point for a Java call if there's an exception thrown in Java code.
302  // The exception is caught and transformed into a pending exception stored in
303  // JavaThread that can be tested from within the VM.
304  //
305  // Oexception: exception oop
306
307  address generate_catch_exception() {
308    StubCodeMark mark(this, "StubRoutines", "catch_exception");
309
310    address start = __ pc();
311    // verify that thread corresponds
312    __ verify_thread();
313
314    const Register& temp_reg = Gtemp;
315    Address pending_exception_addr    (G2_thread, Thread::pending_exception_offset());
316    Address exception_file_offset_addr(G2_thread, Thread::exception_file_offset   ());
317    Address exception_line_offset_addr(G2_thread, Thread::exception_line_offset   ());
318
319    // set pending exception
320    __ verify_oop(Oexception);
321    __ st_ptr(Oexception, pending_exception_addr);
322    __ set((intptr_t)__FILE__, temp_reg);
323    __ st_ptr(temp_reg, exception_file_offset_addr);
324    __ set((intptr_t)__LINE__, temp_reg);
325    __ st(temp_reg, exception_line_offset_addr);
326
327    // complete return to VM
328    assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");
329
330    AddressLiteral stub_ret(StubRoutines::_call_stub_return_address);
331    __ jump_to(stub_ret, temp_reg);
332    __ delayed()->nop();
333
334    return start;
335  }
336
337
338  //----------------------------------------------------------------------------------------------------
339  // Continuation point for runtime calls returning with a pending exception
340  // The pending exception check happened in the runtime or native call stub
341  // The pending exception in Thread is converted into a Java-level exception
342  //
343  // Contract with Java-level exception handler: O0 = exception
344  //                                             O1 = throwing pc
345
346  address generate_forward_exception() {
347    StubCodeMark mark(this, "StubRoutines", "forward_exception");
348    address start = __ pc();
349
350    // Upon entry, O7 has the return address returning into Java
351    // (interpreted or compiled) code; i.e. the return address
352    // becomes the throwing pc.
353
354    const Register& handler_reg = Gtemp;
355
356    Address exception_addr(G2_thread, Thread::pending_exception_offset());
357
358#ifdef ASSERT
359    // make sure that this code is only executed if there is a pending exception
360    { Label L;
361      __ ld_ptr(exception_addr, Gtemp);
362      __ br_notnull_short(Gtemp, Assembler::pt, L);
363      __ stop("StubRoutines::forward exception: no pending exception (1)");
364      __ bind(L);
365    }
366#endif
367
368    // compute exception handler into handler_reg
369    __ get_thread();
370    __ ld_ptr(exception_addr, Oexception);
371    __ verify_oop(Oexception);
372    __ save_frame(0);             // compensates for compiler weakness
373    __ add(O7->after_save(), frame::pc_return_offset, Lscratch); // save the issuing PC
374    BLOCK_COMMENT("call exception_handler_for_return_address");
375    __ call_VM_leaf(L7_thread_cache, CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), G2_thread, Lscratch);
376    __ mov(O0, handler_reg);
377    __ restore();                 // compensates for compiler weakness
378
379    __ ld_ptr(exception_addr, Oexception);
380    __ add(O7, frame::pc_return_offset, Oissuing_pc); // save the issuing PC
381
382#ifdef ASSERT
383    // make sure exception is set
384    { Label L;
385      __ br_notnull_short(Oexception, Assembler::pt, L);
386      __ stop("StubRoutines::forward exception: no pending exception (2)");
387      __ bind(L);
388    }
389#endif
390    // jump to exception handler
391    __ jmp(handler_reg, 0);
392    // clear pending exception
393    __ delayed()->st_ptr(G0, exception_addr);
394
395    return start;
396  }
397
398  // Safefetch stubs.
399  void generate_safefetch(const char* name, int size, address* entry,
400                          address* fault_pc, address* continuation_pc) {
401    // safefetch signatures:
402    //   int      SafeFetch32(int*      adr, int      errValue);
403    //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
404    //
405    // arguments:
406    //   o0 = adr
407    //   o1 = errValue
408    //
409    // result:
410    //   o0  = *adr or errValue
411
412    StubCodeMark mark(this, "StubRoutines", name);
413
414    // Entry point, pc or function descriptor.
415    __ align(CodeEntryAlignment);
416    *entry = __ pc();
417
418    __ mov(O0, G1);  // g1 = o0
419    __ mov(O1, O0);  // o0 = o1
420    // Load *adr into c_rarg1, may fault.
421    *fault_pc = __ pc();
422    switch (size) {
423      case 4:
424        // int32_t
425        __ ldsw(G1, 0, O0);  // o0 = [g1]
426        break;
427      case 8:
428        // int64_t
429        __ ldx(G1, 0, O0);   // o0 = [g1]
430        break;
431      default:
432        ShouldNotReachHere();
433    }
434
435    // return errValue or *adr
436    *continuation_pc = __ pc();
437    // By convention with the trap handler we ensure there is a non-CTI
438    // instruction in the trap shadow.
439    __ nop();
440    __ retl();
441    __ delayed()->nop();
442  }
443
444  //------------------------------------------------------------------------------------------------------------------------
445  // Continuation point for throwing of implicit exceptions that are not handled in
446  // the current activation. Fabricates an exception oop and initiates normal
447  // exception dispatching in this frame. Only callee-saved registers are preserved
448  // (through the normal register window / RegisterMap handling).
449  // If the compiler needs all registers to be preserved between the fault
450  // point and the exception handler then it must assume responsibility for that in
451  // AbstractCompiler::continuation_for_implicit_null_exception or
452  // continuation_for_implicit_division_by_zero_exception. All other implicit
453  // exceptions (e.g., NullPointerException or AbstractMethodError on entry) are
454  // either at call sites or otherwise assume that stack unwinding will be initiated,
455  // so caller saved registers were assumed volatile in the compiler.
456
457  // Note that we generate only this stub into a RuntimeStub, because it needs to be
458  // properly traversed and ignored during GC, so we change the meaning of the "__"
459  // macro within this method.
460#undef __
461#define __ masm->
462
463  address generate_throw_exception(const char* name, address runtime_entry,
464                                   Register arg1 = noreg, Register arg2 = noreg) {
465#ifdef ASSERT
466    int insts_size = VerifyThread ? 1 * K : 600;
467#else
468    int insts_size = VerifyThread ? 1 * K : 256;
469#endif /* ASSERT */
470    int locs_size  = 32;
471
472    CodeBuffer      code(name, insts_size, locs_size);
473    MacroAssembler* masm = new MacroAssembler(&code);
474
475    __ verify_thread();
476
477    // This is an inlined and slightly modified version of call_VM
478    // which has the ability to fetch the return PC out of thread-local storage
479    __ assert_not_delayed();
480
481    // Note that we always push a frame because on the SPARC
482    // architecture, for all of our implicit exception kinds at call
483    // sites, the implicit exception is taken before the callee frame
484    // is pushed.
485    __ save_frame(0);
486
487    int frame_complete = __ offset();
488
489    // Note that we always have a runtime stub frame on the top of stack by this point
490    Register last_java_sp = SP;
491    // 64-bit last_java_sp is biased!
492    __ set_last_Java_frame(last_java_sp, G0);
493    if (VerifyThread)  __ mov(G2_thread, O0); // about to be smashed; pass early
494    __ save_thread(noreg);
495    if (arg1 != noreg) {
496      assert(arg2 != O1, "clobbered");
497      __ mov(arg1, O1);
498    }
499    if (arg2 != noreg) {
500      __ mov(arg2, O2);
501    }
502    // do the call
503    BLOCK_COMMENT("call runtime_entry");
504    __ call(runtime_entry, relocInfo::runtime_call_type);
505    if (!VerifyThread)
506      __ delayed()->mov(G2_thread, O0);  // pass thread as first argument
507    else
508      __ delayed()->nop();             // (thread already passed)
509    __ restore_thread(noreg);
510    __ reset_last_Java_frame();
511
512    // check for pending exceptions. use Gtemp as scratch register.
513#ifdef ASSERT
514    Label L;
515
516    Address exception_addr(G2_thread, Thread::pending_exception_offset());
517    Register scratch_reg = Gtemp;
518    __ ld_ptr(exception_addr, scratch_reg);
519    __ br_notnull_short(scratch_reg, Assembler::pt, L);
520    __ should_not_reach_here();
521    __ bind(L);
522#endif // ASSERT
523    BLOCK_COMMENT("call forward_exception_entry");
524    __ call(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
525    // we use O7 linkage so that forward_exception_entry has the issuing PC
526    __ delayed()->restore();
527
528    RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, masm->total_frame_size_in_bytes(0), NULL, false);
529    return stub->entry_point();
530  }
531
532#undef __
533#define __ _masm->
534
535
536  // Generate a routine that sets all the registers so we
537  // can tell if the stop routine prints them correctly.
538  address generate_test_stop() {
539    StubCodeMark mark(this, "StubRoutines", "test_stop");
540    address start = __ pc();
541
542    int i;
543
544    __ save_frame(0);
545
546    static jfloat zero = 0.0, one = 1.0;
547
548    // put addr in L0, then load through L0 to F0
549    __ set((intptr_t)&zero, L0);  __ ldf( FloatRegisterImpl::S, L0, 0, F0);
550    __ set((intptr_t)&one,  L0);  __ ldf( FloatRegisterImpl::S, L0, 0, F1); // 1.0 to F1
551
552    // use add to put 2..18 in F2..F18
553    for ( i = 2;  i <= 18;  ++i ) {
554      __ fadd( FloatRegisterImpl::S, F1, as_FloatRegister(i-1),  as_FloatRegister(i));
555    }
556
557    // Now put double 2 in F16, double 18 in F18
558    __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F2, F16 );
559    __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F18, F18 );
560
561    // use add to put 20..32 in F20..F32
562    for (i = 20; i < 32; i += 2) {
563      __ fadd( FloatRegisterImpl::D, F16, as_FloatRegister(i-2),  as_FloatRegister(i));
564    }
565
566    // put 0..7 in i's, 8..15 in l's, 16..23 in o's, 24..31 in g's
567    for ( i = 0; i < 8; ++i ) {
568      if (i < 6) {
569        __ set(     i, as_iRegister(i));
570        __ set(16 + i, as_oRegister(i));
571        __ set(24 + i, as_gRegister(i));
572      }
573      __ set( 8 + i, as_lRegister(i));
574    }
575
576    __ stop("testing stop");
577
578
579    __ ret();
580    __ delayed()->restore();
581
582    return start;
583  }
584
585
586  address generate_stop_subroutine() {
587    StubCodeMark mark(this, "StubRoutines", "stop_subroutine");
588    address start = __ pc();
589
590    __ stop_subroutine();
591
592    return start;
593  }
594
595  address generate_flush_callers_register_windows() {
596    StubCodeMark mark(this, "StubRoutines", "flush_callers_register_windows");
597    address start = __ pc();
598
599    __ flushw();
600    __ retl(false);
601    __ delayed()->add( FP, STACK_BIAS, O0 );
602    // The returned value must be a stack pointer whose register save area
603    // is flushed, and will stay flushed while the caller executes.
604
605    return start;
606  }
607
608  // Support for jint Atomic::xchg(jint exchange_value, volatile jint* dest).
609  //
610  // Arguments:
611  //
612  //      exchange_value: O0
613  //      dest:           O1
614  //
615  // Results:
616  //
617  //     O0: the value previously stored in dest
618  //
619  address generate_atomic_xchg() {
620    StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
621    address start = __ pc();
622
623    if (UseCASForSwap) {
624      // Use CAS instead of swap, just in case the MP hardware
625      // prefers to work with just one kind of synch. instruction.
626      Label retry;
627      __ BIND(retry);
628      __ mov(O0, O3);       // scratch copy of exchange value
629      __ ld(O1, 0, O2);     // observe the previous value
630      // try to replace O2 with O3
631      __ cas(O1, O2, O3);
632      __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pn, retry);
633
634      __ retl(false);
635      __ delayed()->mov(O2, O0);  // report previous value to caller
636    } else {
637      __ retl(false);
638      __ delayed()->swap(O1, 0, O0);
639    }
640
641    return start;
642  }
643
644
645  // Support for jint Atomic::cmpxchg(jint exchange_value, volatile jint* dest, jint compare_value)
646  //
647  // Arguments:
648  //
649  //      exchange_value: O0
650  //      dest:           O1
651  //      compare_value:  O2
652  //
653  // Results:
654  //
655  //     O0: the value previously stored in dest
656  //
657  address generate_atomic_cmpxchg() {
658    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
659    address start = __ pc();
660
661    // cmpxchg(dest, compare_value, exchange_value)
662    __ cas(O1, O2, O0);
663    __ retl(false);
664    __ delayed()->nop();
665
666    return start;
667  }
668
669  // Support for jlong Atomic::cmpxchg(jlong exchange_value, volatile jlong *dest, jlong compare_value)
670  //
671  // Arguments:
672  //
673  //      exchange_value: O1:O0
674  //      dest:           O2
675  //      compare_value:  O4:O3
676  //
677  // Results:
678  //
679  //     O1:O0: the value previously stored in dest
680  //
681  // Overwrites: G1,G2,G3
682  //
683  address generate_atomic_cmpxchg_long() {
684    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
685    address start = __ pc();
686
687    __ sllx(O0, 32, O0);
688    __ srl(O1, 0, O1);
689    __ or3(O0,O1,O0);      // O0 holds 64-bit value from compare_value
690    __ sllx(O3, 32, O3);
691    __ srl(O4, 0, O4);
692    __ or3(O3,O4,O3);     // O3 holds 64-bit value from exchange_value
693    __ casx(O2, O3, O0);
694    __ srl(O0, 0, O1);    // unpacked return value in O1:O0
695    __ retl(false);
696    __ delayed()->srlx(O0, 32, O0);
697
698    return start;
699  }
700
701
702  // Support for jint Atomic::add(jint add_value, volatile jint* dest).
703  //
704  // Arguments:
705  //
706  //      add_value: O0   (e.g., +1 or -1)
707  //      dest:      O1
708  //
709  // Results:
710  //
711  //     O0: the new value stored in dest
712  //
713  // Overwrites: O3
714  //
715  address generate_atomic_add() {
716    StubCodeMark mark(this, "StubRoutines", "atomic_add");
717    address start = __ pc();
718    __ BIND(_atomic_add_stub);
719
720    Label(retry);
721    __ BIND(retry);
722
723    __ lduw(O1, 0, O2);
724    __ add(O0, O2, O3);
725    __ cas(O1, O2, O3);
726    __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pn, retry);
727    __ retl(false);
728    __ delayed()->add(O0, O2, O0); // note that cas made O2==O3
729
730    return start;
731  }
732  Label _atomic_add_stub;  // called from other stubs
733
734
735  // Support for uint StubRoutine::Sparc::partial_subtype_check( Klass sub, Klass super );
736  // Arguments :
737  //
738  //      ret  : O0, returned
739  //      icc/xcc: set as O0 (depending on wordSize)
740  //      sub  : O1, argument, not changed
741  //      super: O2, argument, not changed
742  //      raddr: O7, blown by call
743  address generate_partial_subtype_check() {
744    __ align(CodeEntryAlignment);
745    StubCodeMark mark(this, "StubRoutines", "partial_subtype_check");
746    address start = __ pc();
747    Label miss;
748
749#if defined(COMPILER2) && !defined(_LP64)
750    // Do not use a 'save' because it blows the 64-bit O registers.
751    __ add(SP,-4*wordSize,SP);  // Make space for 4 temps (stack must be 2 words aligned)
752    __ st_ptr(L0,SP,(frame::register_save_words+0)*wordSize);
753    __ st_ptr(L1,SP,(frame::register_save_words+1)*wordSize);
754    __ st_ptr(L2,SP,(frame::register_save_words+2)*wordSize);
755    __ st_ptr(L3,SP,(frame::register_save_words+3)*wordSize);
756    Register Rret   = O0;
757    Register Rsub   = O1;
758    Register Rsuper = O2;
759#else
760    __ save_frame(0);
761    Register Rret   = I0;
762    Register Rsub   = I1;
763    Register Rsuper = I2;
764#endif
765
766    Register L0_ary_len = L0;
767    Register L1_ary_ptr = L1;
768    Register L2_super   = L2;
769    Register L3_index   = L3;
770
771    __ check_klass_subtype_slow_path(Rsub, Rsuper,
772                                     L0, L1, L2, L3,
773                                     NULL, &miss);
774
775    // Match falls through here.
776    __ addcc(G0,0,Rret);        // set Z flags, Z result
777
778#if defined(COMPILER2) && !defined(_LP64)
779    __ ld_ptr(SP,(frame::register_save_words+0)*wordSize,L0);
780    __ ld_ptr(SP,(frame::register_save_words+1)*wordSize,L1);
781    __ ld_ptr(SP,(frame::register_save_words+2)*wordSize,L2);
782    __ ld_ptr(SP,(frame::register_save_words+3)*wordSize,L3);
783    __ retl();                  // Result in Rret is zero; flags set to Z
784    __ delayed()->add(SP,4*wordSize,SP);
785#else
786    __ ret();                   // Result in Rret is zero; flags set to Z
787    __ delayed()->restore();
788#endif
789
790    __ BIND(miss);
791    __ addcc(G0,1,Rret);        // set NZ flags, NZ result
792
793#if defined(COMPILER2) && !defined(_LP64)
794    __ ld_ptr(SP,(frame::register_save_words+0)*wordSize,L0);
795    __ ld_ptr(SP,(frame::register_save_words+1)*wordSize,L1);
796    __ ld_ptr(SP,(frame::register_save_words+2)*wordSize,L2);
797    __ ld_ptr(SP,(frame::register_save_words+3)*wordSize,L3);
798    __ retl();                  // Result in Rret is != 0; flags set to NZ
799    __ delayed()->add(SP,4*wordSize,SP);
800#else
801    __ ret();                   // Result in Rret is != 0; flags set to NZ
802    __ delayed()->restore();
803#endif
804
805    return start;
806  }
807
808
809  // Called from MacroAssembler::verify_oop
810  //
811  address generate_verify_oop_subroutine() {
812    StubCodeMark mark(this, "StubRoutines", "verify_oop_stub");
813
814    address start = __ pc();
815
816    __ verify_oop_subroutine();
817
818    return start;
819  }
820
821
822  //
823  // Verify that a register contains clean 32-bits positive value
824  // (high 32-bits are 0) so it could be used in 64-bits shifts (sllx, srax).
825  //
826  //  Input:
827  //    Rint  -  32-bits value
828  //    Rtmp  -  scratch
829  //
830  void assert_clean_int(Register Rint, Register Rtmp) {
831#if defined(ASSERT) && defined(_LP64)
832    __ signx(Rint, Rtmp);
833    __ cmp(Rint, Rtmp);
834    __ breakpoint_trap(Assembler::notEqual, Assembler::xcc);
835#endif
836  }
837
838  //
839  //  Generate overlap test for array copy stubs
840  //
841  //  Input:
842  //    O0    -  array1
843  //    O1    -  array2
844  //    O2    -  element count
845  //
846  //  Kills temps:  O3, O4
847  //
848  void array_overlap_test(address no_overlap_target, int log2_elem_size) {
849    assert(no_overlap_target != NULL, "must be generated");
850    array_overlap_test(no_overlap_target, NULL, log2_elem_size);
851  }
852  void array_overlap_test(Label& L_no_overlap, int log2_elem_size) {
853    array_overlap_test(NULL, &L_no_overlap, log2_elem_size);
854  }
855  void array_overlap_test(address no_overlap_target, Label* NOLp, int log2_elem_size) {
856    const Register from       = O0;
857    const Register to         = O1;
858    const Register count      = O2;
859    const Register to_from    = O3; // to - from
860    const Register byte_count = O4; // count << log2_elem_size
861
862      __ subcc(to, from, to_from);
863      __ sll_ptr(count, log2_elem_size, byte_count);
864      if (NOLp == NULL)
865        __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, no_overlap_target);
866      else
867        __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, (*NOLp));
868      __ delayed()->cmp(to_from, byte_count);
869      if (NOLp == NULL)
870        __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, no_overlap_target);
871      else
872        __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, (*NOLp));
873      __ delayed()->nop();
874  }
875
876  //
877  //  Generate pre-write barrier for array.
878  //
879  //  Input:
880  //     addr     - register containing starting address
881  //     count    - register containing element count
882  //     tmp      - scratch register
883  //
884  //  The input registers are overwritten.
885  //
886  void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
887    BarrierSet* bs = Universe::heap()->barrier_set();
888    switch (bs->kind()) {
889      case BarrierSet::G1SATBCTLogging:
890        // With G1, don't generate the call if we statically know that the target in uninitialized
891        if (!dest_uninitialized) {
892          __ save_frame(0);
893          // Save the necessary global regs... will be used after.
894          if (addr->is_global()) {
895            __ mov(addr, L0);
896          }
897          if (count->is_global()) {
898            __ mov(count, L1);
899          }
900          __ mov(addr->after_save(), O0);
901          // Get the count into O1
902          __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre));
903          __ delayed()->mov(count->after_save(), O1);
904          if (addr->is_global()) {
905            __ mov(L0, addr);
906          }
907          if (count->is_global()) {
908            __ mov(L1, count);
909          }
910          __ restore();
911        }
912        break;
913      case BarrierSet::CardTableForRS:
914      case BarrierSet::CardTableExtension:
915      case BarrierSet::ModRef:
916        break;
917      default:
918        ShouldNotReachHere();
919    }
920  }
921  //
922  //  Generate post-write barrier for array.
923  //
924  //  Input:
925  //     addr     - register containing starting address
926  //     count    - register containing element count
927  //     tmp      - scratch register
928  //
929  //  The input registers are overwritten.
930  //
931  void gen_write_ref_array_post_barrier(Register addr, Register count,
932                                        Register tmp) {
933    BarrierSet* bs = Universe::heap()->barrier_set();
934
935    switch (bs->kind()) {
936      case BarrierSet::G1SATBCTLogging:
937        {
938          // Get some new fresh output registers.
939          __ save_frame(0);
940          __ mov(addr->after_save(), O0);
941          __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post));
942          __ delayed()->mov(count->after_save(), O1);
943          __ restore();
944        }
945        break;
946      case BarrierSet::CardTableForRS:
947      case BarrierSet::CardTableExtension:
948        {
949          CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs);
950          assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
951          assert_different_registers(addr, count, tmp);
952
953          Label L_loop;
954
955          __ sll_ptr(count, LogBytesPerHeapOop, count);
956          __ sub(count, BytesPerHeapOop, count);
957          __ add(count, addr, count);
958          // Use two shifts to clear out those low order two bits! (Cannot opt. into 1.)
959          __ srl_ptr(addr, CardTableModRefBS::card_shift, addr);
960          __ srl_ptr(count, CardTableModRefBS::card_shift, count);
961          __ sub(count, addr, count);
962          AddressLiteral rs(ct->byte_map_base);
963          __ set(rs, tmp);
964        __ BIND(L_loop);
965          __ stb(G0, tmp, addr);
966          __ subcc(count, 1, count);
967          __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
968          __ delayed()->add(addr, 1, addr);
969        }
970        break;
971      case BarrierSet::ModRef:
972        break;
973      default:
974        ShouldNotReachHere();
975    }
976  }
977
978  //
979  // Generate main code for disjoint arraycopy
980  //
981  typedef void (StubGenerator::*CopyLoopFunc)(Register from, Register to, Register count, int count_dec,
982                                              Label& L_loop, bool use_prefetch, bool use_bis);
983
984  void disjoint_copy_core(Register from, Register to, Register count, int log2_elem_size,
985                          int iter_size, StubGenerator::CopyLoopFunc copy_loop_func) {
986    Label L_copy;
987
988    assert(log2_elem_size <= 3, "the following code should be changed");
989    int count_dec = 16>>log2_elem_size;
990
991    int prefetch_dist = MAX2(ArraycopySrcPrefetchDistance, ArraycopyDstPrefetchDistance);
992    assert(prefetch_dist < 4096, "invalid value");
993    prefetch_dist = (prefetch_dist + (iter_size-1)) & (-iter_size); // round up to one iteration copy size
994    int prefetch_count = (prefetch_dist >> log2_elem_size); // elements count
995
996    if (UseBlockCopy) {
997      Label L_block_copy, L_block_copy_prefetch, L_skip_block_copy;
998
999      // 64 bytes tail + bytes copied in one loop iteration
1000      int tail_size = 64 + iter_size;
1001      int block_copy_count = (MAX2(tail_size, (int)BlockCopyLowLimit)) >> log2_elem_size;
1002      // Use BIS copy only for big arrays since it requires membar.
1003      __ set(block_copy_count, O4);
1004      __ cmp_and_br_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_skip_block_copy);
1005      // This code is for disjoint source and destination:
1006      //   to <= from || to >= from+count
1007      // but BIS will stomp over 'from' if (to > from-tail_size && to <= from)
1008      __ sub(from, to, O4);
1009      __ srax(O4, 4, O4); // divide by 16 since following short branch have only 5 bits for imm.
1010      __ cmp_and_br_short(O4, (tail_size>>4), Assembler::lessEqualUnsigned, Assembler::pn, L_skip_block_copy);
1011
1012      __ wrasi(G0, Assembler::ASI_ST_BLKINIT_PRIMARY);
1013      // BIS should not be used to copy tail (64 bytes+iter_size)
1014      // to avoid zeroing of following values.
1015      __ sub(count, (tail_size>>log2_elem_size), count); // count is still positive >= 0
1016
1017      if (prefetch_count > 0) { // rounded up to one iteration count
1018        // Do prefetching only if copy size is bigger
1019        // than prefetch distance.
1020        __ set(prefetch_count, O4);
1021        __ cmp_and_brx_short(count, O4, Assembler::less, Assembler::pt, L_block_copy);
1022        __ sub(count, prefetch_count, count);
1023
1024        (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy_prefetch, true, true);
1025        __ add(count, prefetch_count, count); // restore count
1026
1027      } // prefetch_count > 0
1028
1029      (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy, false, true);
1030      __ add(count, (tail_size>>log2_elem_size), count); // restore count
1031
1032      __ wrasi(G0, Assembler::ASI_PRIMARY_NOFAULT);
1033      // BIS needs membar.
1034      __ membar(Assembler::StoreLoad);
1035      // Copy tail
1036      __ ba_short(L_copy);
1037
1038      __ BIND(L_skip_block_copy);
1039    } // UseBlockCopy
1040
1041    if (prefetch_count > 0) { // rounded up to one iteration count
1042      // Do prefetching only if copy size is bigger
1043      // than prefetch distance.
1044      __ set(prefetch_count, O4);
1045      __ cmp_and_brx_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_copy);
1046      __ sub(count, prefetch_count, count);
1047
1048      Label L_copy_prefetch;
1049      (this->*copy_loop_func)(from, to, count, count_dec, L_copy_prefetch, true, false);
1050      __ add(count, prefetch_count, count); // restore count
1051
1052    } // prefetch_count > 0
1053
1054    (this->*copy_loop_func)(from, to, count, count_dec, L_copy, false, false);
1055  }
1056
1057
1058
1059  //
1060  // Helper methods for copy_16_bytes_forward_with_shift()
1061  //
1062  void copy_16_bytes_shift_loop(Register from, Register to, Register count, int count_dec,
1063                                Label& L_loop, bool use_prefetch, bool use_bis) {
1064
1065    const Register left_shift  = G1; // left  shift bit counter
1066    const Register right_shift = G5; // right shift bit counter
1067
1068    __ align(OptoLoopAlignment);
1069    __ BIND(L_loop);
1070    if (use_prefetch) {
1071      if (ArraycopySrcPrefetchDistance > 0) {
1072        __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
1073      }
1074      if (ArraycopyDstPrefetchDistance > 0) {
1075        __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
1076      }
1077    }
1078    __ ldx(from, 0, O4);
1079    __ ldx(from, 8, G4);
1080    __ inc(to, 16);
1081    __ inc(from, 16);
1082    __ deccc(count, count_dec); // Can we do next iteration after this one?
1083    __ srlx(O4, right_shift, G3);
1084    __ bset(G3, O3);
1085    __ sllx(O4, left_shift,  O4);
1086    __ srlx(G4, right_shift, G3);
1087    __ bset(G3, O4);
1088    if (use_bis) {
1089      __ stxa(O3, to, -16);
1090      __ stxa(O4, to, -8);
1091    } else {
1092      __ stx(O3, to, -16);
1093      __ stx(O4, to, -8);
1094    }
1095    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
1096    __ delayed()->sllx(G4, left_shift,  O3);
1097  }
1098
1099  // Copy big chunks forward with shift
1100  //
1101  // Inputs:
1102  //   from      - source arrays
1103  //   to        - destination array aligned to 8-bytes
1104  //   count     - elements count to copy >= the count equivalent to 16 bytes
1105  //   count_dec - elements count's decrement equivalent to 16 bytes
1106  //   L_copy_bytes - copy exit label
1107  //
1108  void copy_16_bytes_forward_with_shift(Register from, Register to,
1109                     Register count, int log2_elem_size, Label& L_copy_bytes) {
1110    Label L_aligned_copy, L_copy_last_bytes;
1111    assert(log2_elem_size <= 3, "the following code should be changed");
1112    int count_dec = 16>>log2_elem_size;
1113
1114    // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
1115    __ andcc(from, 7, G1); // misaligned bytes
1116    __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
1117    __ delayed()->nop();
1118
1119    const Register left_shift  = G1; // left  shift bit counter
1120    const Register right_shift = G5; // right shift bit counter
1121
1122    __ sll(G1, LogBitsPerByte, left_shift);
1123    __ mov(64, right_shift);
1124    __ sub(right_shift, left_shift, right_shift);
1125
1126    //
1127    // Load 2 aligned 8-bytes chunks and use one from previous iteration
1128    // to form 2 aligned 8-bytes chunks to store.
1129    //
1130    __ dec(count, count_dec);   // Pre-decrement 'count'
1131    __ andn(from, 7, from);     // Align address
1132    __ ldx(from, 0, O3);
1133    __ inc(from, 8);
1134    __ sllx(O3, left_shift,  O3);
1135
1136    disjoint_copy_core(from, to, count, log2_elem_size, 16, &StubGenerator::copy_16_bytes_shift_loop);
1137
1138    __ inccc(count, count_dec>>1 ); // + 8 bytes
1139    __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
1140    __ delayed()->inc(count, count_dec>>1); // restore 'count'
1141
1142    // copy 8 bytes, part of them already loaded in O3
1143    __ ldx(from, 0, O4);
1144    __ inc(to, 8);
1145    __ inc(from, 8);
1146    __ srlx(O4, right_shift, G3);
1147    __ bset(O3, G3);
1148    __ stx(G3, to, -8);
1149
1150    __ BIND(L_copy_last_bytes);
1151    __ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes
1152    __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
1153    __ delayed()->sub(from, right_shift, from);       // restore address
1154
1155    __ BIND(L_aligned_copy);
1156  }
1157
1158  // Copy big chunks backward with shift
1159  //
1160  // Inputs:
1161  //   end_from  - source arrays end address
1162  //   end_to    - destination array end address aligned to 8-bytes
1163  //   count     - elements count to copy >= the count equivalent to 16 bytes
1164  //   count_dec - elements count's decrement equivalent to 16 bytes
1165  //   L_aligned_copy - aligned copy exit label
1166  //   L_copy_bytes   - copy exit label
1167  //
1168  void copy_16_bytes_backward_with_shift(Register end_from, Register end_to,
1169                     Register count, int count_dec,
1170                     Label& L_aligned_copy, Label& L_copy_bytes) {
1171    Label L_loop, L_copy_last_bytes;
1172
1173    // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
1174      __ andcc(end_from, 7, G1); // misaligned bytes
1175      __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
1176      __ delayed()->deccc(count, count_dec); // Pre-decrement 'count'
1177
1178    const Register left_shift  = G1; // left  shift bit counter
1179    const Register right_shift = G5; // right shift bit counter
1180
1181      __ sll(G1, LogBitsPerByte, left_shift);
1182      __ mov(64, right_shift);
1183      __ sub(right_shift, left_shift, right_shift);
1184
1185    //
1186    // Load 2 aligned 8-bytes chunks and use one from previous iteration
1187    // to form 2 aligned 8-bytes chunks to store.
1188    //
1189      __ andn(end_from, 7, end_from);     // Align address
1190      __ ldx(end_from, 0, O3);
1191      __ align(OptoLoopAlignment);
1192    __ BIND(L_loop);
1193      __ ldx(end_from, -8, O4);
1194      __ deccc(count, count_dec); // Can we do next iteration after this one?
1195      __ ldx(end_from, -16, G4);
1196      __ dec(end_to, 16);
1197      __ dec(end_from, 16);
1198      __ srlx(O3, right_shift, O3);
1199      __ sllx(O4, left_shift,  G3);
1200      __ bset(G3, O3);
1201      __ stx(O3, end_to, 8);
1202      __ srlx(O4, right_shift, O4);
1203      __ sllx(G4, left_shift,  G3);
1204      __ bset(G3, O4);
1205      __ stx(O4, end_to, 0);
1206      __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
1207      __ delayed()->mov(G4, O3);
1208
1209      __ inccc(count, count_dec>>1 ); // + 8 bytes
1210      __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
1211      __ delayed()->inc(count, count_dec>>1); // restore 'count'
1212
1213      // copy 8 bytes, part of them already loaded in O3
1214      __ ldx(end_from, -8, O4);
1215      __ dec(end_to, 8);
1216      __ dec(end_from, 8);
1217      __ srlx(O3, right_shift, O3);
1218      __ sllx(O4, left_shift,  G3);
1219      __ bset(O3, G3);
1220      __ stx(G3, end_to, 0);
1221
1222    __ BIND(L_copy_last_bytes);
1223      __ srl(left_shift, LogBitsPerByte, left_shift);    // misaligned bytes
1224      __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
1225      __ delayed()->add(end_from, left_shift, end_from); // restore address
1226  }
1227
1228  //
1229  //  Generate stub for disjoint byte copy.  If "aligned" is true, the
1230  //  "from" and "to" addresses are assumed to be heapword aligned.
1231  //
1232  // Arguments for generated stub:
1233  //      from:  O0
1234  //      to:    O1
1235  //      count: O2 treated as signed
1236  //
1237  address generate_disjoint_byte_copy(bool aligned, address *entry, const char *name) {
1238    __ align(CodeEntryAlignment);
1239    StubCodeMark mark(this, "StubRoutines", name);
1240    address start = __ pc();
1241
1242    Label L_skip_alignment, L_align;
1243    Label L_copy_byte, L_copy_byte_loop, L_exit;
1244
1245    const Register from      = O0;   // source array address
1246    const Register to        = O1;   // destination array address
1247    const Register count     = O2;   // elements count
1248    const Register offset    = O5;   // offset from start of arrays
1249    // O3, O4, G3, G4 are used as temp registers
1250
1251    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1252
1253    if (entry != NULL) {
1254      *entry = __ pc();
1255      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1256      BLOCK_COMMENT("Entry:");
1257    }
1258
1259    // for short arrays, just do single element copy
1260    __ cmp(count, 23); // 16 + 7
1261    __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
1262    __ delayed()->mov(G0, offset);
1263
1264    if (aligned) {
1265      // 'aligned' == true when it is known statically during compilation
1266      // of this arraycopy call site that both 'from' and 'to' addresses
1267      // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
1268      //
1269      // Aligned arrays have 4 bytes alignment in 32-bits VM
1270      // and 8 bytes - in 64-bits VM. So we do it only for 32-bits VM
1271      //
1272#ifndef _LP64
1273      // copy a 4-bytes word if necessary to align 'to' to 8 bytes
1274      __ andcc(to, 7, G0);
1275      __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment);
1276      __ delayed()->ld(from, 0, O3);
1277      __ inc(from, 4);
1278      __ inc(to, 4);
1279      __ dec(count, 4);
1280      __ st(O3, to, -4);
1281    __ BIND(L_skip_alignment);
1282#endif
1283    } else {
1284      // copy bytes to align 'to' on 8 byte boundary
1285      __ andcc(to, 7, G1); // misaligned bytes
1286      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1287      __ delayed()->neg(G1);
1288      __ inc(G1, 8);       // bytes need to copy to next 8-bytes alignment
1289      __ sub(count, G1, count);
1290    __ BIND(L_align);
1291      __ ldub(from, 0, O3);
1292      __ deccc(G1);
1293      __ inc(from);
1294      __ stb(O3, to, 0);
1295      __ br(Assembler::notZero, false, Assembler::pt, L_align);
1296      __ delayed()->inc(to);
1297    __ BIND(L_skip_alignment);
1298    }
1299#ifdef _LP64
1300    if (!aligned)
1301#endif
1302    {
1303      // Copy with shift 16 bytes per iteration if arrays do not have
1304      // the same alignment mod 8, otherwise fall through to the next
1305      // code for aligned copy.
1306      // The compare above (count >= 23) guarantes 'count' >= 16 bytes.
1307      // Also jump over aligned copy after the copy with shift completed.
1308
1309      copy_16_bytes_forward_with_shift(from, to, count, 0, L_copy_byte);
1310    }
1311
1312    // Both array are 8 bytes aligned, copy 16 bytes at a time
1313      __ and3(count, 7, G4); // Save count
1314      __ srl(count, 3, count);
1315     generate_disjoint_long_copy_core(aligned);
1316      __ mov(G4, count);     // Restore count
1317
1318    // copy tailing bytes
1319    __ BIND(L_copy_byte);
1320      __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1321      __ align(OptoLoopAlignment);
1322    __ BIND(L_copy_byte_loop);
1323      __ ldub(from, offset, O3);
1324      __ deccc(count);
1325      __ stb(O3, to, offset);
1326      __ brx(Assembler::notZero, false, Assembler::pt, L_copy_byte_loop);
1327      __ delayed()->inc(offset);
1328
1329    __ BIND(L_exit);
1330      // O3, O4 are used as temp registers
1331      inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
1332      __ retl();
1333      __ delayed()->mov(G0, O0); // return 0
1334    return start;
1335  }
1336
1337  //
1338  //  Generate stub for conjoint byte copy.  If "aligned" is true, the
1339  //  "from" and "to" addresses are assumed to be heapword aligned.
1340  //
1341  // Arguments for generated stub:
1342  //      from:  O0
1343  //      to:    O1
1344  //      count: O2 treated as signed
1345  //
1346  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1347                                      address *entry, const char *name) {
1348    // Do reverse copy.
1349
1350    __ align(CodeEntryAlignment);
1351    StubCodeMark mark(this, "StubRoutines", name);
1352    address start = __ pc();
1353
1354    Label L_skip_alignment, L_align, L_aligned_copy;
1355    Label L_copy_byte, L_copy_byte_loop, L_exit;
1356
1357    const Register from      = O0;   // source array address
1358    const Register to        = O1;   // destination array address
1359    const Register count     = O2;   // elements count
1360    const Register end_from  = from; // source array end address
1361    const Register end_to    = to;   // destination array end address
1362
1363    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1364
1365    if (entry != NULL) {
1366      *entry = __ pc();
1367      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1368      BLOCK_COMMENT("Entry:");
1369    }
1370
1371    array_overlap_test(nooverlap_target, 0);
1372
1373    __ add(to, count, end_to);       // offset after last copied element
1374
1375    // for short arrays, just do single element copy
1376    __ cmp(count, 23); // 16 + 7
1377    __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
1378    __ delayed()->add(from, count, end_from);
1379
1380    {
1381      // Align end of arrays since they could be not aligned even
1382      // when arrays itself are aligned.
1383
1384      // copy bytes to align 'end_to' on 8 byte boundary
1385      __ andcc(end_to, 7, G1); // misaligned bytes
1386      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1387      __ delayed()->nop();
1388      __ sub(count, G1, count);
1389    __ BIND(L_align);
1390      __ dec(end_from);
1391      __ dec(end_to);
1392      __ ldub(end_from, 0, O3);
1393      __ deccc(G1);
1394      __ brx(Assembler::notZero, false, Assembler::pt, L_align);
1395      __ delayed()->stb(O3, end_to, 0);
1396    __ BIND(L_skip_alignment);
1397    }
1398#ifdef _LP64
1399    if (aligned) {
1400      // Both arrays are aligned to 8-bytes in 64-bits VM.
1401      // The 'count' is decremented in copy_16_bytes_backward_with_shift()
1402      // in unaligned case.
1403      __ dec(count, 16);
1404    } else
1405#endif
1406    {
1407      // Copy with shift 16 bytes per iteration if arrays do not have
1408      // the same alignment mod 8, otherwise jump to the next
1409      // code for aligned copy (and substracting 16 from 'count' before jump).
1410      // The compare above (count >= 11) guarantes 'count' >= 16 bytes.
1411      // Also jump over aligned copy after the copy with shift completed.
1412
1413      copy_16_bytes_backward_with_shift(end_from, end_to, count, 16,
1414                                        L_aligned_copy, L_copy_byte);
1415    }
1416    // copy 4 elements (16 bytes) at a time
1417      __ align(OptoLoopAlignment);
1418    __ BIND(L_aligned_copy);
1419      __ dec(end_from, 16);
1420      __ ldx(end_from, 8, O3);
1421      __ ldx(end_from, 0, O4);
1422      __ dec(end_to, 16);
1423      __ deccc(count, 16);
1424      __ stx(O3, end_to, 8);
1425      __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
1426      __ delayed()->stx(O4, end_to, 0);
1427      __ inc(count, 16);
1428
1429    // copy 1 element (2 bytes) at a time
1430    __ BIND(L_copy_byte);
1431      __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1432      __ align(OptoLoopAlignment);
1433    __ BIND(L_copy_byte_loop);
1434      __ dec(end_from);
1435      __ dec(end_to);
1436      __ ldub(end_from, 0, O4);
1437      __ deccc(count);
1438      __ brx(Assembler::greater, false, Assembler::pt, L_copy_byte_loop);
1439      __ delayed()->stb(O4, end_to, 0);
1440
1441    __ BIND(L_exit);
1442    // O3, O4 are used as temp registers
1443    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
1444    __ retl();
1445    __ delayed()->mov(G0, O0); // return 0
1446    return start;
1447  }
1448
1449  //
1450  //  Generate stub for disjoint short copy.  If "aligned" is true, the
1451  //  "from" and "to" addresses are assumed to be heapword aligned.
1452  //
1453  // Arguments for generated stub:
1454  //      from:  O0
1455  //      to:    O1
1456  //      count: O2 treated as signed
1457  //
1458  address generate_disjoint_short_copy(bool aligned, address *entry, const char * name) {
1459    __ align(CodeEntryAlignment);
1460    StubCodeMark mark(this, "StubRoutines", name);
1461    address start = __ pc();
1462
1463    Label L_skip_alignment, L_skip_alignment2;
1464    Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit;
1465
1466    const Register from      = O0;   // source array address
1467    const Register to        = O1;   // destination array address
1468    const Register count     = O2;   // elements count
1469    const Register offset    = O5;   // offset from start of arrays
1470    // O3, O4, G3, G4 are used as temp registers
1471
1472    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1473
1474    if (entry != NULL) {
1475      *entry = __ pc();
1476      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1477      BLOCK_COMMENT("Entry:");
1478    }
1479
1480    // for short arrays, just do single element copy
1481    __ cmp(count, 11); // 8 + 3  (22 bytes)
1482    __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes);
1483    __ delayed()->mov(G0, offset);
1484
1485    if (aligned) {
1486      // 'aligned' == true when it is known statically during compilation
1487      // of this arraycopy call site that both 'from' and 'to' addresses
1488      // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
1489      //
1490      // Aligned arrays have 4 bytes alignment in 32-bits VM
1491      // and 8 bytes - in 64-bits VM.
1492      //
1493#ifndef _LP64
1494      // copy a 2-elements word if necessary to align 'to' to 8 bytes
1495      __ andcc(to, 7, G0);
1496      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1497      __ delayed()->ld(from, 0, O3);
1498      __ inc(from, 4);
1499      __ inc(to, 4);
1500      __ dec(count, 2);
1501      __ st(O3, to, -4);
1502    __ BIND(L_skip_alignment);
1503#endif
1504    } else {
1505      // copy 1 element if necessary to align 'to' on an 4 bytes
1506      __ andcc(to, 3, G0);
1507      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1508      __ delayed()->lduh(from, 0, O3);
1509      __ inc(from, 2);
1510      __ inc(to, 2);
1511      __ dec(count);
1512      __ sth(O3, to, -2);
1513    __ BIND(L_skip_alignment);
1514
1515      // copy 2 elements to align 'to' on an 8 byte boundary
1516      __ andcc(to, 7, G0);
1517      __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
1518      __ delayed()->lduh(from, 0, O3);
1519      __ dec(count, 2);
1520      __ lduh(from, 2, O4);
1521      __ inc(from, 4);
1522      __ inc(to, 4);
1523      __ sth(O3, to, -4);
1524      __ sth(O4, to, -2);
1525    __ BIND(L_skip_alignment2);
1526    }
1527#ifdef _LP64
1528    if (!aligned)
1529#endif
1530    {
1531      // Copy with shift 16 bytes per iteration if arrays do not have
1532      // the same alignment mod 8, otherwise fall through to the next
1533      // code for aligned copy.
1534      // The compare above (count >= 11) guarantes 'count' >= 16 bytes.
1535      // Also jump over aligned copy after the copy with shift completed.
1536
1537      copy_16_bytes_forward_with_shift(from, to, count, 1, L_copy_2_bytes);
1538    }
1539
1540    // Both array are 8 bytes aligned, copy 16 bytes at a time
1541      __ and3(count, 3, G4); // Save
1542      __ srl(count, 2, count);
1543     generate_disjoint_long_copy_core(aligned);
1544      __ mov(G4, count); // restore
1545
1546    // copy 1 element at a time
1547    __ BIND(L_copy_2_bytes);
1548      __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1549      __ align(OptoLoopAlignment);
1550    __ BIND(L_copy_2_bytes_loop);
1551      __ lduh(from, offset, O3);
1552      __ deccc(count);
1553      __ sth(O3, to, offset);
1554      __ brx(Assembler::notZero, false, Assembler::pt, L_copy_2_bytes_loop);
1555      __ delayed()->inc(offset, 2);
1556
1557    __ BIND(L_exit);
1558      // O3, O4 are used as temp registers
1559      inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4);
1560      __ retl();
1561      __ delayed()->mov(G0, O0); // return 0
1562    return start;
1563  }
1564
1565  //
1566  //  Generate stub for disjoint short fill.  If "aligned" is true, the
1567  //  "to" address is assumed to be heapword aligned.
1568  //
1569  // Arguments for generated stub:
1570  //      to:    O0
1571  //      value: O1
1572  //      count: O2 treated as signed
1573  //
1574  address generate_fill(BasicType t, bool aligned, const char* name) {
1575    __ align(CodeEntryAlignment);
1576    StubCodeMark mark(this, "StubRoutines", name);
1577    address start = __ pc();
1578
1579    const Register to        = O0;   // source array address
1580    const Register value     = O1;   // fill value
1581    const Register count     = O2;   // elements count
1582    // O3 is used as a temp register
1583
1584    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1585
1586    Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
1587    Label L_fill_2_bytes, L_fill_elements, L_fill_32_bytes;
1588
1589    int shift = -1;
1590    switch (t) {
1591       case T_BYTE:
1592        shift = 2;
1593        break;
1594       case T_SHORT:
1595        shift = 1;
1596        break;
1597      case T_INT:
1598         shift = 0;
1599        break;
1600      default: ShouldNotReachHere();
1601    }
1602
1603    BLOCK_COMMENT("Entry:");
1604
1605    if (t == T_BYTE) {
1606      // Zero extend value
1607      __ and3(value, 0xff, value);
1608      __ sllx(value, 8, O3);
1609      __ or3(value, O3, value);
1610    }
1611    if (t == T_SHORT) {
1612      // Zero extend value
1613      __ sllx(value, 48, value);
1614      __ srlx(value, 48, value);
1615    }
1616    if (t == T_BYTE || t == T_SHORT) {
1617      __ sllx(value, 16, O3);
1618      __ or3(value, O3, value);
1619    }
1620
1621    __ cmp(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
1622    __ brx(Assembler::lessUnsigned, false, Assembler::pn, L_fill_elements); // use unsigned cmp
1623    __ delayed()->andcc(count, 1, G0);
1624
1625    if (!aligned && (t == T_BYTE || t == T_SHORT)) {
1626      // align source address at 4 bytes address boundary
1627      if (t == T_BYTE) {
1628        // One byte misalignment happens only for byte arrays
1629        __ andcc(to, 1, G0);
1630        __ br(Assembler::zero, false, Assembler::pt, L_skip_align1);
1631        __ delayed()->nop();
1632        __ stb(value, to, 0);
1633        __ inc(to, 1);
1634        __ dec(count, 1);
1635        __ BIND(L_skip_align1);
1636      }
1637      // Two bytes misalignment happens only for byte and short (char) arrays
1638      __ andcc(to, 2, G0);
1639      __ br(Assembler::zero, false, Assembler::pt, L_skip_align2);
1640      __ delayed()->nop();
1641      __ sth(value, to, 0);
1642      __ inc(to, 2);
1643      __ dec(count, 1 << (shift - 1));
1644      __ BIND(L_skip_align2);
1645    }
1646#ifdef _LP64
1647    if (!aligned) {
1648#endif
1649    // align to 8 bytes, we know we are 4 byte aligned to start
1650    __ andcc(to, 7, G0);
1651    __ br(Assembler::zero, false, Assembler::pt, L_fill_32_bytes);
1652    __ delayed()->nop();
1653    __ stw(value, to, 0);
1654    __ inc(to, 4);
1655    __ dec(count, 1 << shift);
1656    __ BIND(L_fill_32_bytes);
1657#ifdef _LP64
1658    }
1659#endif
1660
1661    if (t == T_INT) {
1662      // Zero extend value
1663      __ srl(value, 0, value);
1664    }
1665    if (t == T_BYTE || t == T_SHORT || t == T_INT) {
1666      __ sllx(value, 32, O3);
1667      __ or3(value, O3, value);
1668    }
1669
1670    Label L_check_fill_8_bytes;
1671    // Fill 32-byte chunks
1672    __ subcc(count, 8 << shift, count);
1673    __ brx(Assembler::less, false, Assembler::pt, L_check_fill_8_bytes);
1674    __ delayed()->nop();
1675
1676    Label L_fill_32_bytes_loop, L_fill_4_bytes;
1677    __ align(16);
1678    __ BIND(L_fill_32_bytes_loop);
1679
1680    __ stx(value, to, 0);
1681    __ stx(value, to, 8);
1682    __ stx(value, to, 16);
1683    __ stx(value, to, 24);
1684
1685    __ subcc(count, 8 << shift, count);
1686    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_fill_32_bytes_loop);
1687    __ delayed()->add(to, 32, to);
1688
1689    __ BIND(L_check_fill_8_bytes);
1690    __ addcc(count, 8 << shift, count);
1691    __ brx(Assembler::zero, false, Assembler::pn, L_exit);
1692    __ delayed()->subcc(count, 1 << (shift + 1), count);
1693    __ brx(Assembler::less, false, Assembler::pn, L_fill_4_bytes);
1694    __ delayed()->andcc(count, 1<<shift, G0);
1695
1696    //
1697    // length is too short, just fill 8 bytes at a time
1698    //
1699    Label L_fill_8_bytes_loop;
1700    __ BIND(L_fill_8_bytes_loop);
1701    __ stx(value, to, 0);
1702    __ subcc(count, 1 << (shift + 1), count);
1703    __ brx(Assembler::greaterEqual, false, Assembler::pn, L_fill_8_bytes_loop);
1704    __ delayed()->add(to, 8, to);
1705
1706    // fill trailing 4 bytes
1707    __ andcc(count, 1<<shift, G0);  // in delay slot of branches
1708    if (t == T_INT) {
1709      __ BIND(L_fill_elements);
1710    }
1711    __ BIND(L_fill_4_bytes);
1712    __ brx(Assembler::zero, false, Assembler::pt, L_fill_2_bytes);
1713    if (t == T_BYTE || t == T_SHORT) {
1714      __ delayed()->andcc(count, 1<<(shift-1), G0);
1715    } else {
1716      __ delayed()->nop();
1717    }
1718    __ stw(value, to, 0);
1719    if (t == T_BYTE || t == T_SHORT) {
1720      __ inc(to, 4);
1721      // fill trailing 2 bytes
1722      __ andcc(count, 1<<(shift-1), G0); // in delay slot of branches
1723      __ BIND(L_fill_2_bytes);
1724      __ brx(Assembler::zero, false, Assembler::pt, L_fill_byte);
1725      __ delayed()->andcc(count, 1, count);
1726      __ sth(value, to, 0);
1727      if (t == T_BYTE) {
1728        __ inc(to, 2);
1729        // fill trailing byte
1730        __ andcc(count, 1, count);  // in delay slot of branches
1731        __ BIND(L_fill_byte);
1732        __ brx(Assembler::zero, false, Assembler::pt, L_exit);
1733        __ delayed()->nop();
1734        __ stb(value, to, 0);
1735      } else {
1736        __ BIND(L_fill_byte);
1737      }
1738    } else {
1739      __ BIND(L_fill_2_bytes);
1740    }
1741    __ BIND(L_exit);
1742    __ retl();
1743    __ delayed()->nop();
1744
1745    // Handle copies less than 8 bytes.  Int is handled elsewhere.
1746    if (t == T_BYTE) {
1747      __ BIND(L_fill_elements);
1748      Label L_fill_2, L_fill_4;
1749      // in delay slot __ andcc(count, 1, G0);
1750      __ brx(Assembler::zero, false, Assembler::pt, L_fill_2);
1751      __ delayed()->andcc(count, 2, G0);
1752      __ stb(value, to, 0);
1753      __ inc(to, 1);
1754      __ BIND(L_fill_2);
1755      __ brx(Assembler::zero, false, Assembler::pt, L_fill_4);
1756      __ delayed()->andcc(count, 4, G0);
1757      __ stb(value, to, 0);
1758      __ stb(value, to, 1);
1759      __ inc(to, 2);
1760      __ BIND(L_fill_4);
1761      __ brx(Assembler::zero, false, Assembler::pt, L_exit);
1762      __ delayed()->nop();
1763      __ stb(value, to, 0);
1764      __ stb(value, to, 1);
1765      __ stb(value, to, 2);
1766      __ retl();
1767      __ delayed()->stb(value, to, 3);
1768    }
1769
1770    if (t == T_SHORT) {
1771      Label L_fill_2;
1772      __ BIND(L_fill_elements);
1773      // in delay slot __ andcc(count, 1, G0);
1774      __ brx(Assembler::zero, false, Assembler::pt, L_fill_2);
1775      __ delayed()->andcc(count, 2, G0);
1776      __ sth(value, to, 0);
1777      __ inc(to, 2);
1778      __ BIND(L_fill_2);
1779      __ brx(Assembler::zero, false, Assembler::pt, L_exit);
1780      __ delayed()->nop();
1781      __ sth(value, to, 0);
1782      __ retl();
1783      __ delayed()->sth(value, to, 2);
1784    }
1785    return start;
1786  }
1787
1788  //
1789  //  Generate stub for conjoint short copy.  If "aligned" is true, the
1790  //  "from" and "to" addresses are assumed to be heapword aligned.
1791  //
1792  // Arguments for generated stub:
1793  //      from:  O0
1794  //      to:    O1
1795  //      count: O2 treated as signed
1796  //
1797  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1798                                       address *entry, const char *name) {
1799    // Do reverse copy.
1800
1801    __ align(CodeEntryAlignment);
1802    StubCodeMark mark(this, "StubRoutines", name);
1803    address start = __ pc();
1804
1805    Label L_skip_alignment, L_skip_alignment2, L_aligned_copy;
1806    Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit;
1807
1808    const Register from      = O0;   // source array address
1809    const Register to        = O1;   // destination array address
1810    const Register count     = O2;   // elements count
1811    const Register end_from  = from; // source array end address
1812    const Register end_to    = to;   // destination array end address
1813
1814    const Register byte_count = O3;  // bytes count to copy
1815
1816    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1817
1818    if (entry != NULL) {
1819      *entry = __ pc();
1820      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1821      BLOCK_COMMENT("Entry:");
1822    }
1823
1824    array_overlap_test(nooverlap_target, 1);
1825
1826    __ sllx(count, LogBytesPerShort, byte_count);
1827    __ add(to, byte_count, end_to);  // offset after last copied element
1828
1829    // for short arrays, just do single element copy
1830    __ cmp(count, 11); // 8 + 3  (22 bytes)
1831    __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes);
1832    __ delayed()->add(from, byte_count, end_from);
1833
1834    {
1835      // Align end of arrays since they could be not aligned even
1836      // when arrays itself are aligned.
1837
1838      // copy 1 element if necessary to align 'end_to' on an 4 bytes
1839      __ andcc(end_to, 3, G0);
1840      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1841      __ delayed()->lduh(end_from, -2, O3);
1842      __ dec(end_from, 2);
1843      __ dec(end_to, 2);
1844      __ dec(count);
1845      __ sth(O3, end_to, 0);
1846    __ BIND(L_skip_alignment);
1847
1848      // copy 2 elements to align 'end_to' on an 8 byte boundary
1849      __ andcc(end_to, 7, G0);
1850      __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
1851      __ delayed()->lduh(end_from, -2, O3);
1852      __ dec(count, 2);
1853      __ lduh(end_from, -4, O4);
1854      __ dec(end_from, 4);
1855      __ dec(end_to, 4);
1856      __ sth(O3, end_to, 2);
1857      __ sth(O4, end_to, 0);
1858    __ BIND(L_skip_alignment2);
1859    }
1860#ifdef _LP64
1861    if (aligned) {
1862      // Both arrays are aligned to 8-bytes in 64-bits VM.
1863      // The 'count' is decremented in copy_16_bytes_backward_with_shift()
1864      // in unaligned case.
1865      __ dec(count, 8);
1866    } else
1867#endif
1868    {
1869      // Copy with shift 16 bytes per iteration if arrays do not have
1870      // the same alignment mod 8, otherwise jump to the next
1871      // code for aligned copy (and substracting 8 from 'count' before jump).
1872      // The compare above (count >= 11) guarantes 'count' >= 16 bytes.
1873      // Also jump over aligned copy after the copy with shift completed.
1874
1875      copy_16_bytes_backward_with_shift(end_from, end_to, count, 8,
1876                                        L_aligned_copy, L_copy_2_bytes);
1877    }
1878    // copy 4 elements (16 bytes) at a time
1879      __ align(OptoLoopAlignment);
1880    __ BIND(L_aligned_copy);
1881      __ dec(end_from, 16);
1882      __ ldx(end_from, 8, O3);
1883      __ ldx(end_from, 0, O4);
1884      __ dec(end_to, 16);
1885      __ deccc(count, 8);
1886      __ stx(O3, end_to, 8);
1887      __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
1888      __ delayed()->stx(O4, end_to, 0);
1889      __ inc(count, 8);
1890
1891    // copy 1 element (2 bytes) at a time
1892    __ BIND(L_copy_2_bytes);
1893      __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1894    __ BIND(L_copy_2_bytes_loop);
1895      __ dec(end_from, 2);
1896      __ dec(end_to, 2);
1897      __ lduh(end_from, 0, O4);
1898      __ deccc(count);
1899      __ brx(Assembler::greater, false, Assembler::pt, L_copy_2_bytes_loop);
1900      __ delayed()->sth(O4, end_to, 0);
1901
1902    __ BIND(L_exit);
1903    // O3, O4 are used as temp registers
1904    inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4);
1905    __ retl();
1906    __ delayed()->mov(G0, O0); // return 0
1907    return start;
1908  }
1909
1910  //
1911  // Helper methods for generate_disjoint_int_copy_core()
1912  //
1913  void copy_16_bytes_loop(Register from, Register to, Register count, int count_dec,
1914                          Label& L_loop, bool use_prefetch, bool use_bis) {
1915
1916    __ align(OptoLoopAlignment);
1917    __ BIND(L_loop);
1918    if (use_prefetch) {
1919      if (ArraycopySrcPrefetchDistance > 0) {
1920        __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
1921      }
1922      if (ArraycopyDstPrefetchDistance > 0) {
1923        __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
1924      }
1925    }
1926    __ ldx(from, 4, O4);
1927    __ ldx(from, 12, G4);
1928    __ inc(to, 16);
1929    __ inc(from, 16);
1930    __ deccc(count, 4); // Can we do next iteration after this one?
1931
1932    __ srlx(O4, 32, G3);
1933    __ bset(G3, O3);
1934    __ sllx(O4, 32, O4);
1935    __ srlx(G4, 32, G3);
1936    __ bset(G3, O4);
1937    if (use_bis) {
1938      __ stxa(O3, to, -16);
1939      __ stxa(O4, to, -8);
1940    } else {
1941      __ stx(O3, to, -16);
1942      __ stx(O4, to, -8);
1943    }
1944    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
1945    __ delayed()->sllx(G4, 32,  O3);
1946
1947  }
1948
1949  //
1950  //  Generate core code for disjoint int copy (and oop copy on 32-bit).
1951  //  If "aligned" is true, the "from" and "to" addresses are assumed
1952  //  to be heapword aligned.
1953  //
1954  // Arguments:
1955  //      from:  O0
1956  //      to:    O1
1957  //      count: O2 treated as signed
1958  //
1959  void generate_disjoint_int_copy_core(bool aligned) {
1960
1961    Label L_skip_alignment, L_aligned_copy;
1962    Label L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
1963
1964    const Register from      = O0;   // source array address
1965    const Register to        = O1;   // destination array address
1966    const Register count     = O2;   // elements count
1967    const Register offset    = O5;   // offset from start of arrays
1968    // O3, O4, G3, G4 are used as temp registers
1969
1970    // 'aligned' == true when it is known statically during compilation
1971    // of this arraycopy call site that both 'from' and 'to' addresses
1972    // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
1973    //
1974    // Aligned arrays have 4 bytes alignment in 32-bits VM
1975    // and 8 bytes - in 64-bits VM.
1976    //
1977#ifdef _LP64
1978    if (!aligned)
1979#endif
1980    {
1981      // The next check could be put under 'ifndef' since the code in
1982      // generate_disjoint_long_copy_core() has own checks and set 'offset'.
1983
1984      // for short arrays, just do single element copy
1985      __ cmp(count, 5); // 4 + 1 (20 bytes)
1986      __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes);
1987      __ delayed()->mov(G0, offset);
1988
1989      // copy 1 element to align 'to' on an 8 byte boundary
1990      __ andcc(to, 7, G0);
1991      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1992      __ delayed()->ld(from, 0, O3);
1993      __ inc(from, 4);
1994      __ inc(to, 4);
1995      __ dec(count);
1996      __ st(O3, to, -4);
1997    __ BIND(L_skip_alignment);
1998
1999    // if arrays have same alignment mod 8, do 4 elements copy
2000      __ andcc(from, 7, G0);
2001      __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
2002      __ delayed()->ld(from, 0, O3);
2003
2004    //
2005    // Load 2 aligned 8-bytes chunks and use one from previous iteration
2006    // to form 2 aligned 8-bytes chunks to store.
2007    //
2008    // copy_16_bytes_forward_with_shift() is not used here since this
2009    // code is more optimal.
2010
2011    // copy with shift 4 elements (16 bytes) at a time
2012      __ dec(count, 4);   // The cmp at the beginning guaranty count >= 4
2013      __ sllx(O3, 32,  O3);
2014
2015      disjoint_copy_core(from, to, count, 2, 16, &StubGenerator::copy_16_bytes_loop);
2016
2017      __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
2018      __ delayed()->inc(count, 4); // restore 'count'
2019
2020    __ BIND(L_aligned_copy);
2021    } // !aligned
2022
2023    // copy 4 elements (16 bytes) at a time
2024      __ and3(count, 1, G4); // Save
2025      __ srl(count, 1, count);
2026     generate_disjoint_long_copy_core(aligned);
2027      __ mov(G4, count);     // Restore
2028
2029    // copy 1 element at a time
2030    __ BIND(L_copy_4_bytes);
2031      __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
2032    __ BIND(L_copy_4_bytes_loop);
2033      __ ld(from, offset, O3);
2034      __ deccc(count);
2035      __ st(O3, to, offset);
2036      __ brx(Assembler::notZero, false, Assembler::pt, L_copy_4_bytes_loop);
2037      __ delayed()->inc(offset, 4);
2038    __ BIND(L_exit);
2039  }
2040
2041  //
2042  //  Generate stub for disjoint int copy.  If "aligned" is true, the
2043  //  "from" and "to" addresses are assumed to be heapword aligned.
2044  //
2045  // Arguments for generated stub:
2046  //      from:  O0
2047  //      to:    O1
2048  //      count: O2 treated as signed
2049  //
2050  address generate_disjoint_int_copy(bool aligned, address *entry, const char *name) {
2051    __ align(CodeEntryAlignment);
2052    StubCodeMark mark(this, "StubRoutines", name);
2053    address start = __ pc();
2054
2055    const Register count = O2;
2056    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
2057
2058    if (entry != NULL) {
2059      *entry = __ pc();
2060      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2061      BLOCK_COMMENT("Entry:");
2062    }
2063
2064    generate_disjoint_int_copy_core(aligned);
2065
2066    // O3, O4 are used as temp registers
2067    inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4);
2068    __ retl();
2069    __ delayed()->mov(G0, O0); // return 0
2070    return start;
2071  }
2072
2073  //
2074  //  Generate core code for conjoint int copy (and oop copy on 32-bit).
2075  //  If "aligned" is true, the "from" and "to" addresses are assumed
2076  //  to be heapword aligned.
2077  //
2078  // Arguments:
2079  //      from:  O0
2080  //      to:    O1
2081  //      count: O2 treated as signed
2082  //
2083  void generate_conjoint_int_copy_core(bool aligned) {
2084    // Do reverse copy.
2085
2086    Label L_skip_alignment, L_aligned_copy;
2087    Label L_copy_16_bytes,  L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
2088
2089    const Register from      = O0;   // source array address
2090    const Register to        = O1;   // destination array address
2091    const Register count     = O2;   // elements count
2092    const Register end_from  = from; // source array end address
2093    const Register end_to    = to;   // destination array end address
2094    // O3, O4, O5, G3 are used as temp registers
2095
2096    const Register byte_count = O3;  // bytes count to copy
2097
2098      __ sllx(count, LogBytesPerInt, byte_count);
2099      __ add(to, byte_count, end_to); // offset after last copied element
2100
2101      __ cmp(count, 5); // for short arrays, just do single element copy
2102      __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes);
2103      __ delayed()->add(from, byte_count, end_from);
2104
2105    // copy 1 element to align 'to' on an 8 byte boundary
2106      __ andcc(end_to, 7, G0);
2107      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
2108      __ delayed()->nop();
2109      __ dec(count);
2110      __ dec(end_from, 4);
2111      __ dec(end_to,   4);
2112      __ ld(end_from, 0, O4);
2113      __ st(O4, end_to, 0);
2114    __ BIND(L_skip_alignment);
2115
2116    // Check if 'end_from' and 'end_to' has the same alignment.
2117      __ andcc(end_from, 7, G0);
2118      __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
2119      __ delayed()->dec(count, 4); // The cmp at the start guaranty cnt >= 4
2120
2121    // copy with shift 4 elements (16 bytes) at a time
2122    //
2123    // Load 2 aligned 8-bytes chunks and use one from previous iteration
2124    // to form 2 aligned 8-bytes chunks to store.
2125    //
2126      __ ldx(end_from, -4, O3);
2127      __ align(OptoLoopAlignment);
2128    __ BIND(L_copy_16_bytes);
2129      __ ldx(end_from, -12, O4);
2130      __ deccc(count, 4);
2131      __ ldx(end_from, -20, O5);
2132      __ dec(end_to, 16);
2133      __ dec(end_from, 16);
2134      __ srlx(O3, 32, O3);
2135      __ sllx(O4, 32, G3);
2136      __ bset(G3, O3);
2137      __ stx(O3, end_to, 8);
2138      __ srlx(O4, 32, O4);
2139      __ sllx(O5, 32, G3);
2140      __ bset(O4, G3);
2141      __ stx(G3, end_to, 0);
2142      __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
2143      __ delayed()->mov(O5, O3);
2144
2145      __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
2146      __ delayed()->inc(count, 4);
2147
2148    // copy 4 elements (16 bytes) at a time
2149      __ align(OptoLoopAlignment);
2150    __ BIND(L_aligned_copy);
2151      __ dec(end_from, 16);
2152      __ ldx(end_from, 8, O3);
2153      __ ldx(end_from, 0, O4);
2154      __ dec(end_to, 16);
2155      __ deccc(count, 4);
2156      __ stx(O3, end_to, 8);
2157      __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
2158      __ delayed()->stx(O4, end_to, 0);
2159      __ inc(count, 4);
2160
2161    // copy 1 element (4 bytes) at a time
2162    __ BIND(L_copy_4_bytes);
2163      __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
2164    __ BIND(L_copy_4_bytes_loop);
2165      __ dec(end_from, 4);
2166      __ dec(end_to, 4);
2167      __ ld(end_from, 0, O4);
2168      __ deccc(count);
2169      __ brx(Assembler::greater, false, Assembler::pt, L_copy_4_bytes_loop);
2170      __ delayed()->st(O4, end_to, 0);
2171    __ BIND(L_exit);
2172  }
2173
2174  //
2175  //  Generate stub for conjoint int copy.  If "aligned" is true, the
2176  //  "from" and "to" addresses are assumed to be heapword aligned.
2177  //
2178  // Arguments for generated stub:
2179  //      from:  O0
2180  //      to:    O1
2181  //      count: O2 treated as signed
2182  //
2183  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
2184                                     address *entry, const char *name) {
2185    __ align(CodeEntryAlignment);
2186    StubCodeMark mark(this, "StubRoutines", name);
2187    address start = __ pc();
2188
2189    assert_clean_int(O2, O3);     // Make sure 'count' is clean int.
2190
2191    if (entry != NULL) {
2192      *entry = __ pc();
2193      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2194      BLOCK_COMMENT("Entry:");
2195    }
2196
2197    array_overlap_test(nooverlap_target, 2);
2198
2199    generate_conjoint_int_copy_core(aligned);
2200
2201    // O3, O4 are used as temp registers
2202    inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4);
2203    __ retl();
2204    __ delayed()->mov(G0, O0); // return 0
2205    return start;
2206  }
2207
2208  //
2209  // Helper methods for generate_disjoint_long_copy_core()
2210  //
2211  void copy_64_bytes_loop(Register from, Register to, Register count, int count_dec,
2212                          Label& L_loop, bool use_prefetch, bool use_bis) {
2213    __ align(OptoLoopAlignment);
2214    __ BIND(L_loop);
2215    for (int off = 0; off < 64; off += 16) {
2216      if (use_prefetch && (off & 31) == 0) {
2217        if (ArraycopySrcPrefetchDistance > 0) {
2218          __ prefetch(from, ArraycopySrcPrefetchDistance+off, Assembler::severalReads);
2219        }
2220        if (ArraycopyDstPrefetchDistance > 0) {
2221          __ prefetch(to, ArraycopyDstPrefetchDistance+off, Assembler::severalWritesAndPossiblyReads);
2222        }
2223      }
2224      __ ldx(from,  off+0, O4);
2225      __ ldx(from,  off+8, O5);
2226      if (use_bis) {
2227        __ stxa(O4, to,  off+0);
2228        __ stxa(O5, to,  off+8);
2229      } else {
2230        __ stx(O4, to,  off+0);
2231        __ stx(O5, to,  off+8);
2232      }
2233    }
2234    __ deccc(count, 8);
2235    __ inc(from, 64);
2236    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
2237    __ delayed()->inc(to, 64);
2238  }
2239
2240  //
2241  //  Generate core code for disjoint long copy (and oop copy on 64-bit).
2242  //  "aligned" is ignored, because we must make the stronger
2243  //  assumption that both addresses are always 64-bit aligned.
2244  //
2245  // Arguments:
2246  //      from:  O0
2247  //      to:    O1
2248  //      count: O2 treated as signed
2249  //
2250  // count -= 2;
2251  // if ( count >= 0 ) { // >= 2 elements
2252  //   if ( count > 6) { // >= 8 elements
2253  //     count -= 6; // original count - 8
2254  //     do {
2255  //       copy_8_elements;
2256  //       count -= 8;
2257  //     } while ( count >= 0 );
2258  //     count += 6;
2259  //   }
2260  //   if ( count >= 0 ) { // >= 2 elements
2261  //     do {
2262  //       copy_2_elements;
2263  //     } while ( (count=count-2) >= 0 );
2264  //   }
2265  // }
2266  // count += 2;
2267  // if ( count != 0 ) { // 1 element left
2268  //   copy_1_element;
2269  // }
2270  //
2271  void generate_disjoint_long_copy_core(bool aligned) {
2272    Label L_copy_8_bytes, L_copy_16_bytes, L_exit;
2273    const Register from    = O0;  // source array address
2274    const Register to      = O1;  // destination array address
2275    const Register count   = O2;  // elements count
2276    const Register offset0 = O4;  // element offset
2277    const Register offset8 = O5;  // next element offset
2278
2279    __ deccc(count, 2);
2280    __ mov(G0, offset0);   // offset from start of arrays (0)
2281    __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
2282    __ delayed()->add(offset0, 8, offset8);
2283
2284    // Copy by 64 bytes chunks
2285
2286    const Register from64 = O3;  // source address
2287    const Register to64   = G3;  // destination address
2288    __ subcc(count, 6, O3);
2289    __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes );
2290    __ delayed()->mov(to,   to64);
2291    // Now we can use O4(offset0), O5(offset8) as temps
2292    __ mov(O3, count);
2293    // count >= 0 (original count - 8)
2294    __ mov(from, from64);
2295
2296    disjoint_copy_core(from64, to64, count, 3, 64, &StubGenerator::copy_64_bytes_loop);
2297
2298      // Restore O4(offset0), O5(offset8)
2299      __ sub(from64, from, offset0);
2300      __ inccc(count, 6); // restore count
2301      __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
2302      __ delayed()->add(offset0, 8, offset8);
2303
2304      // Copy by 16 bytes chunks
2305      __ align(OptoLoopAlignment);
2306    __ BIND(L_copy_16_bytes);
2307      __ ldx(from, offset0, O3);
2308      __ ldx(from, offset8, G3);
2309      __ deccc(count, 2);
2310      __ stx(O3, to, offset0);
2311      __ inc(offset0, 16);
2312      __ stx(G3, to, offset8);
2313      __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
2314      __ delayed()->inc(offset8, 16);
2315
2316      // Copy last 8 bytes
2317    __ BIND(L_copy_8_bytes);
2318      __ inccc(count, 2);
2319      __ brx(Assembler::zero, true, Assembler::pn, L_exit );
2320      __ delayed()->mov(offset0, offset8); // Set O5 used by other stubs
2321      __ ldx(from, offset0, O3);
2322      __ stx(O3, to, offset0);
2323    __ BIND(L_exit);
2324  }
2325
2326  //
2327  //  Generate stub for disjoint long copy.
2328  //  "aligned" is ignored, because we must make the stronger
2329  //  assumption that both addresses are always 64-bit aligned.
2330  //
2331  // Arguments for generated stub:
2332  //      from:  O0
2333  //      to:    O1
2334  //      count: O2 treated as signed
2335  //
2336  address generate_disjoint_long_copy(bool aligned, address *entry, const char *name) {
2337    __ align(CodeEntryAlignment);
2338    StubCodeMark mark(this, "StubRoutines", name);
2339    address start = __ pc();
2340
2341    assert_clean_int(O2, O3);     // Make sure 'count' is clean int.
2342
2343    if (entry != NULL) {
2344      *entry = __ pc();
2345      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2346      BLOCK_COMMENT("Entry:");
2347    }
2348
2349    generate_disjoint_long_copy_core(aligned);
2350
2351    // O3, O4 are used as temp registers
2352    inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4);
2353    __ retl();
2354    __ delayed()->mov(G0, O0); // return 0
2355    return start;
2356  }
2357
2358  //
2359  //  Generate core code for conjoint long copy (and oop copy on 64-bit).
2360  //  "aligned" is ignored, because we must make the stronger
2361  //  assumption that both addresses are always 64-bit aligned.
2362  //
2363  // Arguments:
2364  //      from:  O0
2365  //      to:    O1
2366  //      count: O2 treated as signed
2367  //
2368  void generate_conjoint_long_copy_core(bool aligned) {
2369    // Do reverse copy.
2370    Label L_copy_8_bytes, L_copy_16_bytes, L_exit;
2371    const Register from    = O0;  // source array address
2372    const Register to      = O1;  // destination array address
2373    const Register count   = O2;  // elements count
2374    const Register offset8 = O4;  // element offset
2375    const Register offset0 = O5;  // previous element offset
2376
2377      __ subcc(count, 1, count);
2378      __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_8_bytes );
2379      __ delayed()->sllx(count, LogBytesPerLong, offset8);
2380      __ sub(offset8, 8, offset0);
2381      __ align(OptoLoopAlignment);
2382    __ BIND(L_copy_16_bytes);
2383      __ ldx(from, offset8, O2);
2384      __ ldx(from, offset0, O3);
2385      __ stx(O2, to, offset8);
2386      __ deccc(offset8, 16);      // use offset8 as counter
2387      __ stx(O3, to, offset0);
2388      __ brx(Assembler::greater, false, Assembler::pt, L_copy_16_bytes);
2389      __ delayed()->dec(offset0, 16);
2390
2391    __ BIND(L_copy_8_bytes);
2392      __ brx(Assembler::negative, false, Assembler::pn, L_exit );
2393      __ delayed()->nop();
2394      __ ldx(from, 0, O3);
2395      __ stx(O3, to, 0);
2396    __ BIND(L_exit);
2397  }
2398
2399  //  Generate stub for conjoint long copy.
2400  //  "aligned" is ignored, because we must make the stronger
2401  //  assumption that both addresses are always 64-bit aligned.
2402  //
2403  // Arguments for generated stub:
2404  //      from:  O0
2405  //      to:    O1
2406  //      count: O2 treated as signed
2407  //
2408  address generate_conjoint_long_copy(bool aligned, address nooverlap_target,
2409                                      address *entry, const char *name) {
2410    __ align(CodeEntryAlignment);
2411    StubCodeMark mark(this, "StubRoutines", name);
2412    address start = __ pc();
2413
2414    assert(aligned, "Should always be aligned");
2415
2416    assert_clean_int(O2, O3);     // Make sure 'count' is clean int.
2417
2418    if (entry != NULL) {
2419      *entry = __ pc();
2420      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2421      BLOCK_COMMENT("Entry:");
2422    }
2423
2424    array_overlap_test(nooverlap_target, 3);
2425
2426    generate_conjoint_long_copy_core(aligned);
2427
2428    // O3, O4 are used as temp registers
2429    inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4);
2430    __ retl();
2431    __ delayed()->mov(G0, O0); // return 0
2432    return start;
2433  }
2434
2435  //  Generate stub for disjoint oop copy.  If "aligned" is true, the
2436  //  "from" and "to" addresses are assumed to be heapword aligned.
2437  //
2438  // Arguments for generated stub:
2439  //      from:  O0
2440  //      to:    O1
2441  //      count: O2 treated as signed
2442  //
2443  address generate_disjoint_oop_copy(bool aligned, address *entry, const char *name,
2444                                     bool dest_uninitialized = false) {
2445
2446    const Register from  = O0;  // source array address
2447    const Register to    = O1;  // destination array address
2448    const Register count = O2;  // elements count
2449
2450    __ align(CodeEntryAlignment);
2451    StubCodeMark mark(this, "StubRoutines", name);
2452    address start = __ pc();
2453
2454    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
2455
2456    if (entry != NULL) {
2457      *entry = __ pc();
2458      // caller can pass a 64-bit byte count here
2459      BLOCK_COMMENT("Entry:");
2460    }
2461
2462    // save arguments for barrier generation
2463    __ mov(to, G1);
2464    __ mov(count, G5);
2465    gen_write_ref_array_pre_barrier(G1, G5, dest_uninitialized);
2466  #ifdef _LP64
2467    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
2468    if (UseCompressedOops) {
2469      generate_disjoint_int_copy_core(aligned);
2470    } else {
2471      generate_disjoint_long_copy_core(aligned);
2472    }
2473  #else
2474    generate_disjoint_int_copy_core(aligned);
2475  #endif
2476    // O0 is used as temp register
2477    gen_write_ref_array_post_barrier(G1, G5, O0);
2478
2479    // O3, O4 are used as temp registers
2480    inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4);
2481    __ retl();
2482    __ delayed()->mov(G0, O0); // return 0
2483    return start;
2484  }
2485
2486  //  Generate stub for conjoint oop copy.  If "aligned" is true, the
2487  //  "from" and "to" addresses are assumed to be heapword aligned.
2488  //
2489  // Arguments for generated stub:
2490  //      from:  O0
2491  //      to:    O1
2492  //      count: O2 treated as signed
2493  //
2494  address generate_conjoint_oop_copy(bool aligned, address nooverlap_target,
2495                                     address *entry, const char *name,
2496                                     bool dest_uninitialized = false) {
2497
2498    const Register from  = O0;  // source array address
2499    const Register to    = O1;  // destination array address
2500    const Register count = O2;  // elements count
2501
2502    __ align(CodeEntryAlignment);
2503    StubCodeMark mark(this, "StubRoutines", name);
2504    address start = __ pc();
2505
2506    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
2507
2508    if (entry != NULL) {
2509      *entry = __ pc();
2510      // caller can pass a 64-bit byte count here
2511      BLOCK_COMMENT("Entry:");
2512    }
2513
2514    array_overlap_test(nooverlap_target, LogBytesPerHeapOop);
2515
2516    // save arguments for barrier generation
2517    __ mov(to, G1);
2518    __ mov(count, G5);
2519    gen_write_ref_array_pre_barrier(G1, G5, dest_uninitialized);
2520
2521  #ifdef _LP64
2522    if (UseCompressedOops) {
2523      generate_conjoint_int_copy_core(aligned);
2524    } else {
2525      generate_conjoint_long_copy_core(aligned);
2526    }
2527  #else
2528    generate_conjoint_int_copy_core(aligned);
2529  #endif
2530
2531    // O0 is used as temp register
2532    gen_write_ref_array_post_barrier(G1, G5, O0);
2533
2534    // O3, O4 are used as temp registers
2535    inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4);
2536    __ retl();
2537    __ delayed()->mov(G0, O0); // return 0
2538    return start;
2539  }
2540
2541
2542  // Helper for generating a dynamic type check.
2543  // Smashes only the given temp registers.
2544  void generate_type_check(Register sub_klass,
2545                           Register super_check_offset,
2546                           Register super_klass,
2547                           Register temp,
2548                           Label& L_success) {
2549    assert_different_registers(sub_klass, super_check_offset, super_klass, temp);
2550
2551    BLOCK_COMMENT("type_check:");
2552
2553    Label L_miss, L_pop_to_miss;
2554
2555    assert_clean_int(super_check_offset, temp);
2556
2557    __ check_klass_subtype_fast_path(sub_klass, super_klass, temp, noreg,
2558                                     &L_success, &L_miss, NULL,
2559                                     super_check_offset);
2560
2561    BLOCK_COMMENT("type_check_slow_path:");
2562    __ save_frame(0);
2563    __ check_klass_subtype_slow_path(sub_klass->after_save(),
2564                                     super_klass->after_save(),
2565                                     L0, L1, L2, L4,
2566                                     NULL, &L_pop_to_miss);
2567    __ ba(L_success);
2568    __ delayed()->restore();
2569
2570    __ bind(L_pop_to_miss);
2571    __ restore();
2572
2573    // Fall through on failure!
2574    __ BIND(L_miss);
2575  }
2576
2577
2578  //  Generate stub for checked oop copy.
2579  //
2580  // Arguments for generated stub:
2581  //      from:  O0
2582  //      to:    O1
2583  //      count: O2 treated as signed
2584  //      ckoff: O3 (super_check_offset)
2585  //      ckval: O4 (super_klass)
2586  //      ret:   O0 zero for success; (-1^K) where K is partial transfer count
2587  //
2588  address generate_checkcast_copy(const char *name, address *entry, bool dest_uninitialized = false) {
2589
2590    const Register O0_from   = O0;      // source array address
2591    const Register O1_to     = O1;      // destination array address
2592    const Register O2_count  = O2;      // elements count
2593    const Register O3_ckoff  = O3;      // super_check_offset
2594    const Register O4_ckval  = O4;      // super_klass
2595
2596    const Register O5_offset = O5;      // loop var, with stride wordSize
2597    const Register G1_remain = G1;      // loop var, with stride -1
2598    const Register G3_oop    = G3;      // actual oop copied
2599    const Register G4_klass  = G4;      // oop._klass
2600    const Register G5_super  = G5;      // oop._klass._primary_supers[ckval]
2601
2602    __ align(CodeEntryAlignment);
2603    StubCodeMark mark(this, "StubRoutines", name);
2604    address start = __ pc();
2605
2606#ifdef ASSERT
2607    // We sometimes save a frame (see generate_type_check below).
2608    // If this will cause trouble, let's fail now instead of later.
2609    __ save_frame(0);
2610    __ restore();
2611#endif
2612
2613    assert_clean_int(O2_count, G1);     // Make sure 'count' is clean int.
2614
2615#ifdef ASSERT
2616    // caller guarantees that the arrays really are different
2617    // otherwise, we would have to make conjoint checks
2618    { Label L;
2619      __ mov(O3, G1);           // spill: overlap test smashes O3
2620      __ mov(O4, G4);           // spill: overlap test smashes O4
2621      array_overlap_test(L, LogBytesPerHeapOop);
2622      __ stop("checkcast_copy within a single array");
2623      __ bind(L);
2624      __ mov(G1, O3);
2625      __ mov(G4, O4);
2626    }
2627#endif //ASSERT
2628
2629    if (entry != NULL) {
2630      *entry = __ pc();
2631      // caller can pass a 64-bit byte count here (from generic stub)
2632      BLOCK_COMMENT("Entry:");
2633    }
2634    gen_write_ref_array_pre_barrier(O1_to, O2_count, dest_uninitialized);
2635
2636    Label load_element, store_element, do_card_marks, fail, done;
2637    __ addcc(O2_count, 0, G1_remain);   // initialize loop index, and test it
2638    __ brx(Assembler::notZero, false, Assembler::pt, load_element);
2639    __ delayed()->mov(G0, O5_offset);   // offset from start of arrays
2640
2641    // Empty array:  Nothing to do.
2642    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4);
2643    __ retl();
2644    __ delayed()->set(0, O0);           // return 0 on (trivial) success
2645
2646    // ======== begin loop ========
2647    // (Loop is rotated; its entry is load_element.)
2648    // Loop variables:
2649    //   (O5 = 0; ; O5 += wordSize) --- offset from src, dest arrays
2650    //   (O2 = len; O2 != 0; O2--) --- number of oops *remaining*
2651    //   G3, G4, G5 --- current oop, oop.klass, oop.klass.super
2652    __ align(OptoLoopAlignment);
2653
2654    __ BIND(store_element);
2655    __ deccc(G1_remain);                // decrement the count
2656    __ store_heap_oop(G3_oop, O1_to, O5_offset); // store the oop
2657    __ inc(O5_offset, heapOopSize);     // step to next offset
2658    __ brx(Assembler::zero, true, Assembler::pt, do_card_marks);
2659    __ delayed()->set(0, O0);           // return -1 on success
2660
2661    // ======== loop entry is here ========
2662    __ BIND(load_element);
2663    __ load_heap_oop(O0_from, O5_offset, G3_oop);  // load the oop
2664    __ br_null_short(G3_oop, Assembler::pt, store_element);
2665
2666    __ load_klass(G3_oop, G4_klass); // query the object klass
2667
2668    generate_type_check(G4_klass, O3_ckoff, O4_ckval, G5_super,
2669                        // branch to this on success:
2670                        store_element);
2671    // ======== end loop ========
2672
2673    // It was a real error; we must depend on the caller to finish the job.
2674    // Register G1 has number of *remaining* oops, O2 number of *total* oops.
2675    // Emit GC store barriers for the oops we have copied (O2 minus G1),
2676    // and report their number to the caller.
2677    __ BIND(fail);
2678    __ subcc(O2_count, G1_remain, O2_count);
2679    __ brx(Assembler::zero, false, Assembler::pt, done);
2680    __ delayed()->not1(O2_count, O0);   // report (-1^K) to caller
2681
2682    __ BIND(do_card_marks);
2683    gen_write_ref_array_post_barrier(O1_to, O2_count, O3);   // store check on O1[0..O2]
2684
2685    __ BIND(done);
2686    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4);
2687    __ retl();
2688    __ delayed()->nop();             // return value in 00
2689
2690    return start;
2691  }
2692
2693
2694  //  Generate 'unsafe' array copy stub
2695  //  Though just as safe as the other stubs, it takes an unscaled
2696  //  size_t argument instead of an element count.
2697  //
2698  // Arguments for generated stub:
2699  //      from:  O0
2700  //      to:    O1
2701  //      count: O2 byte count, treated as ssize_t, can be zero
2702  //
2703  // Examines the alignment of the operands and dispatches
2704  // to a long, int, short, or byte copy loop.
2705  //
2706  address generate_unsafe_copy(const char* name,
2707                               address byte_copy_entry,
2708                               address short_copy_entry,
2709                               address int_copy_entry,
2710                               address long_copy_entry) {
2711
2712    const Register O0_from   = O0;      // source array address
2713    const Register O1_to     = O1;      // destination array address
2714    const Register O2_count  = O2;      // elements count
2715
2716    const Register G1_bits   = G1;      // test copy of low bits
2717
2718    __ align(CodeEntryAlignment);
2719    StubCodeMark mark(this, "StubRoutines", name);
2720    address start = __ pc();
2721
2722    // bump this on entry, not on exit:
2723    inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, G1, G3);
2724
2725    __ or3(O0_from, O1_to, G1_bits);
2726    __ or3(O2_count,       G1_bits, G1_bits);
2727
2728    __ btst(BytesPerLong-1, G1_bits);
2729    __ br(Assembler::zero, true, Assembler::pt,
2730          long_copy_entry, relocInfo::runtime_call_type);
2731    // scale the count on the way out:
2732    __ delayed()->srax(O2_count, LogBytesPerLong, O2_count);
2733
2734    __ btst(BytesPerInt-1, G1_bits);
2735    __ br(Assembler::zero, true, Assembler::pt,
2736          int_copy_entry, relocInfo::runtime_call_type);
2737    // scale the count on the way out:
2738    __ delayed()->srax(O2_count, LogBytesPerInt, O2_count);
2739
2740    __ btst(BytesPerShort-1, G1_bits);
2741    __ br(Assembler::zero, true, Assembler::pt,
2742          short_copy_entry, relocInfo::runtime_call_type);
2743    // scale the count on the way out:
2744    __ delayed()->srax(O2_count, LogBytesPerShort, O2_count);
2745
2746    __ br(Assembler::always, false, Assembler::pt,
2747          byte_copy_entry, relocInfo::runtime_call_type);
2748    __ delayed()->nop();
2749
2750    return start;
2751  }
2752
2753
2754  // Perform range checks on the proposed arraycopy.
2755  // Kills the two temps, but nothing else.
2756  // Also, clean the sign bits of src_pos and dst_pos.
2757  void arraycopy_range_checks(Register src,     // source array oop (O0)
2758                              Register src_pos, // source position (O1)
2759                              Register dst,     // destination array oo (O2)
2760                              Register dst_pos, // destination position (O3)
2761                              Register length,  // length of copy (O4)
2762                              Register temp1, Register temp2,
2763                              Label& L_failed) {
2764    BLOCK_COMMENT("arraycopy_range_checks:");
2765
2766    //  if (src_pos + length > arrayOop(src)->length() ) FAIL;
2767
2768    const Register array_length = temp1;  // scratch
2769    const Register end_pos      = temp2;  // scratch
2770
2771    // Note:  This next instruction may be in the delay slot of a branch:
2772    __ add(length, src_pos, end_pos);  // src_pos + length
2773    __ lduw(src, arrayOopDesc::length_offset_in_bytes(), array_length);
2774    __ cmp(end_pos, array_length);
2775    __ br(Assembler::greater, false, Assembler::pn, L_failed);
2776
2777    //  if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
2778    __ delayed()->add(length, dst_pos, end_pos); // dst_pos + length
2779    __ lduw(dst, arrayOopDesc::length_offset_in_bytes(), array_length);
2780    __ cmp(end_pos, array_length);
2781    __ br(Assembler::greater, false, Assembler::pn, L_failed);
2782
2783    // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
2784    // Move with sign extension can be used since they are positive.
2785    __ delayed()->signx(src_pos, src_pos);
2786    __ signx(dst_pos, dst_pos);
2787
2788    BLOCK_COMMENT("arraycopy_range_checks done");
2789  }
2790
2791
2792  //
2793  //  Generate generic array copy stubs
2794  //
2795  //  Input:
2796  //    O0    -  src oop
2797  //    O1    -  src_pos
2798  //    O2    -  dst oop
2799  //    O3    -  dst_pos
2800  //    O4    -  element count
2801  //
2802  //  Output:
2803  //    O0 ==  0  -  success
2804  //    O0 == -1  -  need to call System.arraycopy
2805  //
2806  address generate_generic_copy(const char *name,
2807                                address entry_jbyte_arraycopy,
2808                                address entry_jshort_arraycopy,
2809                                address entry_jint_arraycopy,
2810                                address entry_oop_arraycopy,
2811                                address entry_jlong_arraycopy,
2812                                address entry_checkcast_arraycopy) {
2813    Label L_failed, L_objArray;
2814
2815    // Input registers
2816    const Register src      = O0;  // source array oop
2817    const Register src_pos  = O1;  // source position
2818    const Register dst      = O2;  // destination array oop
2819    const Register dst_pos  = O3;  // destination position
2820    const Register length   = O4;  // elements count
2821
2822    // registers used as temp
2823    const Register G3_src_klass = G3; // source array klass
2824    const Register G4_dst_klass = G4; // destination array klass
2825    const Register G5_lh        = G5; // layout handler
2826    const Register O5_temp      = O5;
2827
2828    __ align(CodeEntryAlignment);
2829    StubCodeMark mark(this, "StubRoutines", name);
2830    address start = __ pc();
2831
2832    // bump this on entry, not on exit:
2833    inc_counter_np(SharedRuntime::_generic_array_copy_ctr, G1, G3);
2834
2835    // In principle, the int arguments could be dirty.
2836    //assert_clean_int(src_pos, G1);
2837    //assert_clean_int(dst_pos, G1);
2838    //assert_clean_int(length, G1);
2839
2840    //-----------------------------------------------------------------------
2841    // Assembler stubs will be used for this call to arraycopy
2842    // if the following conditions are met:
2843    //
2844    // (1) src and dst must not be null.
2845    // (2) src_pos must not be negative.
2846    // (3) dst_pos must not be negative.
2847    // (4) length  must not be negative.
2848    // (5) src klass and dst klass should be the same and not NULL.
2849    // (6) src and dst should be arrays.
2850    // (7) src_pos + length must not exceed length of src.
2851    // (8) dst_pos + length must not exceed length of dst.
2852    BLOCK_COMMENT("arraycopy initial argument checks");
2853
2854    //  if (src == NULL) return -1;
2855    __ br_null(src, false, Assembler::pn, L_failed);
2856
2857    //  if (src_pos < 0) return -1;
2858    __ delayed()->tst(src_pos);
2859    __ br(Assembler::negative, false, Assembler::pn, L_failed);
2860    __ delayed()->nop();
2861
2862    //  if (dst == NULL) return -1;
2863    __ br_null(dst, false, Assembler::pn, L_failed);
2864
2865    //  if (dst_pos < 0) return -1;
2866    __ delayed()->tst(dst_pos);
2867    __ br(Assembler::negative, false, Assembler::pn, L_failed);
2868
2869    //  if (length < 0) return -1;
2870    __ delayed()->tst(length);
2871    __ br(Assembler::negative, false, Assembler::pn, L_failed);
2872
2873    BLOCK_COMMENT("arraycopy argument klass checks");
2874    //  get src->klass()
2875    if (UseCompressedClassPointers) {
2876      __ delayed()->nop(); // ??? not good
2877      __ load_klass(src, G3_src_klass);
2878    } else {
2879      __ delayed()->ld_ptr(src, oopDesc::klass_offset_in_bytes(), G3_src_klass);
2880    }
2881
2882#ifdef ASSERT
2883    //  assert(src->klass() != NULL);
2884    BLOCK_COMMENT("assert klasses not null");
2885    { Label L_a, L_b;
2886      __ br_notnull_short(G3_src_klass, Assembler::pt, L_b); // it is broken if klass is NULL
2887      __ bind(L_a);
2888      __ stop("broken null klass");
2889      __ bind(L_b);
2890      __ load_klass(dst, G4_dst_klass);
2891      __ br_null(G4_dst_klass, false, Assembler::pn, L_a); // this would be broken also
2892      __ delayed()->mov(G0, G4_dst_klass);      // scribble the temp
2893      BLOCK_COMMENT("assert done");
2894    }
2895#endif
2896
2897    // Load layout helper
2898    //
2899    //  |array_tag|     | header_size | element_type |     |log2_element_size|
2900    // 32        30    24            16              8     2                 0
2901    //
2902    //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2903    //
2904
2905    int lh_offset = in_bytes(Klass::layout_helper_offset());
2906
2907    // Load 32-bits signed value. Use br() instruction with it to check icc.
2908    __ lduw(G3_src_klass, lh_offset, G5_lh);
2909
2910    if (UseCompressedClassPointers) {
2911      __ load_klass(dst, G4_dst_klass);
2912    }
2913    // Handle objArrays completely differently...
2914    juint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2915    __ set(objArray_lh, O5_temp);
2916    __ cmp(G5_lh,       O5_temp);
2917    __ br(Assembler::equal, false, Assembler::pt, L_objArray);
2918    if (UseCompressedClassPointers) {
2919      __ delayed()->nop();
2920    } else {
2921      __ delayed()->ld_ptr(dst, oopDesc::klass_offset_in_bytes(), G4_dst_klass);
2922    }
2923
2924    //  if (src->klass() != dst->klass()) return -1;
2925    __ cmp_and_brx_short(G3_src_klass, G4_dst_klass, Assembler::notEqual, Assembler::pn, L_failed);
2926
2927    //  if (!src->is_Array()) return -1;
2928    __ cmp(G5_lh, Klass::_lh_neutral_value); // < 0
2929    __ br(Assembler::greaterEqual, false, Assembler::pn, L_failed);
2930
2931    // At this point, it is known to be a typeArray (array_tag 0x3).
2932#ifdef ASSERT
2933    __ delayed()->nop();
2934    { Label L;
2935      jint lh_prim_tag_in_place = (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2936      __ set(lh_prim_tag_in_place, O5_temp);
2937      __ cmp(G5_lh,                O5_temp);
2938      __ br(Assembler::greaterEqual, false, Assembler::pt, L);
2939      __ delayed()->nop();
2940      __ stop("must be a primitive array");
2941      __ bind(L);
2942    }
2943#else
2944    __ delayed();                               // match next insn to prev branch
2945#endif
2946
2947    arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2948                           O5_temp, G4_dst_klass, L_failed);
2949
2950    // TypeArrayKlass
2951    //
2952    // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2953    // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2954    //
2955
2956    const Register G4_offset = G4_dst_klass;    // array offset
2957    const Register G3_elsize = G3_src_klass;    // log2 element size
2958
2959    __ srl(G5_lh, Klass::_lh_header_size_shift, G4_offset);
2960    __ and3(G4_offset, Klass::_lh_header_size_mask, G4_offset); // array_offset
2961    __ add(src, G4_offset, src);       // src array offset
2962    __ add(dst, G4_offset, dst);       // dst array offset
2963    __ and3(G5_lh, Klass::_lh_log2_element_size_mask, G3_elsize); // log2 element size
2964
2965    // next registers should be set before the jump to corresponding stub
2966    const Register from     = O0;  // source array address
2967    const Register to       = O1;  // destination array address
2968    const Register count    = O2;  // elements count
2969
2970    // 'from', 'to', 'count' registers should be set in this order
2971    // since they are the same as 'src', 'src_pos', 'dst'.
2972
2973    BLOCK_COMMENT("scale indexes to element size");
2974    __ sll_ptr(src_pos, G3_elsize, src_pos);
2975    __ sll_ptr(dst_pos, G3_elsize, dst_pos);
2976    __ add(src, src_pos, from);       // src_addr
2977    __ add(dst, dst_pos, to);         // dst_addr
2978
2979    BLOCK_COMMENT("choose copy loop based on element size");
2980    __ cmp(G3_elsize, 0);
2981    __ br(Assembler::equal, true, Assembler::pt, entry_jbyte_arraycopy);
2982    __ delayed()->signx(length, count); // length
2983
2984    __ cmp(G3_elsize, LogBytesPerShort);
2985    __ br(Assembler::equal, true, Assembler::pt, entry_jshort_arraycopy);
2986    __ delayed()->signx(length, count); // length
2987
2988    __ cmp(G3_elsize, LogBytesPerInt);
2989    __ br(Assembler::equal, true, Assembler::pt, entry_jint_arraycopy);
2990    __ delayed()->signx(length, count); // length
2991#ifdef ASSERT
2992    { Label L;
2993      __ cmp_and_br_short(G3_elsize, LogBytesPerLong, Assembler::equal, Assembler::pt, L);
2994      __ stop("must be long copy, but elsize is wrong");
2995      __ bind(L);
2996    }
2997#endif
2998    __ br(Assembler::always, false, Assembler::pt, entry_jlong_arraycopy);
2999    __ delayed()->signx(length, count); // length
3000
3001    // ObjArrayKlass
3002  __ BIND(L_objArray);
3003    // live at this point:  G3_src_klass, G4_dst_klass, src[_pos], dst[_pos], length
3004
3005    Label L_plain_copy, L_checkcast_copy;
3006    //  test array classes for subtyping
3007    __ cmp(G3_src_klass, G4_dst_klass);         // usual case is exact equality
3008    __ brx(Assembler::notEqual, true, Assembler::pn, L_checkcast_copy);
3009    __ delayed()->lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted from below
3010
3011    // Identically typed arrays can be copied without element-wise checks.
3012    arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
3013                           O5_temp, G5_lh, L_failed);
3014
3015    __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset
3016    __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset
3017    __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos);
3018    __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos);
3019    __ add(src, src_pos, from);       // src_addr
3020    __ add(dst, dst_pos, to);         // dst_addr
3021  __ BIND(L_plain_copy);
3022    __ br(Assembler::always, false, Assembler::pt, entry_oop_arraycopy);
3023    __ delayed()->signx(length, count); // length
3024
3025  __ BIND(L_checkcast_copy);
3026    // live at this point:  G3_src_klass, G4_dst_klass
3027    {
3028      // Before looking at dst.length, make sure dst is also an objArray.
3029      // lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted to delay slot
3030      __ cmp(G5_lh,                    O5_temp);
3031      __ br(Assembler::notEqual, false, Assembler::pn, L_failed);
3032
3033      // It is safe to examine both src.length and dst.length.
3034      __ delayed();                             // match next insn to prev branch
3035      arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
3036                             O5_temp, G5_lh, L_failed);
3037
3038      // Marshal the base address arguments now, freeing registers.
3039      __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset
3040      __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset
3041      __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos);
3042      __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos);
3043      __ add(src, src_pos, from);               // src_addr
3044      __ add(dst, dst_pos, to);                 // dst_addr
3045      __ signx(length, count);                  // length (reloaded)
3046
3047      Register sco_temp = O3;                   // this register is free now
3048      assert_different_registers(from, to, count, sco_temp,
3049                                 G4_dst_klass, G3_src_klass);
3050
3051      // Generate the type check.
3052      int sco_offset = in_bytes(Klass::super_check_offset_offset());
3053      __ lduw(G4_dst_klass, sco_offset, sco_temp);
3054      generate_type_check(G3_src_klass, sco_temp, G4_dst_klass,
3055                          O5_temp, L_plain_copy);
3056
3057      // Fetch destination element klass from the ObjArrayKlass header.
3058      int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
3059
3060      // the checkcast_copy loop needs two extra arguments:
3061      __ ld_ptr(G4_dst_klass, ek_offset, O4);   // dest elem klass
3062      // lduw(O4, sco_offset, O3);              // sco of elem klass
3063
3064      __ br(Assembler::always, false, Assembler::pt, entry_checkcast_arraycopy);
3065      __ delayed()->lduw(O4, sco_offset, O3);
3066    }
3067
3068  __ BIND(L_failed);
3069    __ retl();
3070    __ delayed()->sub(G0, 1, O0); // return -1
3071    return start;
3072  }
3073
3074  //
3075  //  Generate stub for heap zeroing.
3076  //  "to" address is aligned to jlong (8 bytes).
3077  //
3078  // Arguments for generated stub:
3079  //      to:    O0
3080  //      count: O1 treated as signed (count of HeapWord)
3081  //             count could be 0
3082  //
3083  address generate_zero_aligned_words(const char* name) {
3084    __ align(CodeEntryAlignment);
3085    StubCodeMark mark(this, "StubRoutines", name);
3086    address start = __ pc();
3087
3088    const Register to    = O0;   // source array address
3089    const Register count = O1;   // HeapWords count
3090    const Register temp  = O2;   // scratch
3091
3092    Label Ldone;
3093    __ sllx(count, LogHeapWordSize, count); // to bytes count
3094    // Use BIS for zeroing
3095    __ bis_zeroing(to, count, temp, Ldone);
3096    __ bind(Ldone);
3097    __ retl();
3098    __ delayed()->nop();
3099    return start;
3100}
3101
3102  void generate_arraycopy_stubs() {
3103    address entry;
3104    address entry_jbyte_arraycopy;
3105    address entry_jshort_arraycopy;
3106    address entry_jint_arraycopy;
3107    address entry_oop_arraycopy;
3108    address entry_jlong_arraycopy;
3109    address entry_checkcast_arraycopy;
3110
3111    //*** jbyte
3112    // Always need aligned and unaligned versions
3113    StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
3114                                                                                  "jbyte_disjoint_arraycopy");
3115    StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
3116                                                                                  &entry_jbyte_arraycopy,
3117                                                                                  "jbyte_arraycopy");
3118    StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
3119                                                                                  "arrayof_jbyte_disjoint_arraycopy");
3120    StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
3121                                                                                  "arrayof_jbyte_arraycopy");
3122
3123    //*** jshort
3124    // Always need aligned and unaligned versions
3125    StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
3126                                                                                    "jshort_disjoint_arraycopy");
3127    StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
3128                                                                                    &entry_jshort_arraycopy,
3129                                                                                    "jshort_arraycopy");
3130    StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
3131                                                                                    "arrayof_jshort_disjoint_arraycopy");
3132    StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
3133                                                                                    "arrayof_jshort_arraycopy");
3134
3135    //*** jint
3136    // Aligned versions
3137    StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
3138                                                                                "arrayof_jint_disjoint_arraycopy");
3139    StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
3140                                                                                "arrayof_jint_arraycopy");
3141#ifdef _LP64
3142    // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
3143    // entry_jint_arraycopy always points to the unaligned version (notice that we overwrite it).
3144    StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
3145                                                                                "jint_disjoint_arraycopy");
3146    StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
3147                                                                                &entry_jint_arraycopy,
3148                                                                                "jint_arraycopy");
3149#else
3150    // In 32 bit jints are always HeapWordSize aligned, so always use the aligned version
3151    // (in fact in 32bit we always have a pre-loop part even in the aligned version,
3152    //  because it uses 64-bit loads/stores, so the aligned flag is actually ignored).
3153    StubRoutines::_jint_disjoint_arraycopy = StubRoutines::_arrayof_jint_disjoint_arraycopy;
3154    StubRoutines::_jint_arraycopy          = StubRoutines::_arrayof_jint_arraycopy;
3155#endif
3156
3157
3158    //*** jlong
3159    // It is always aligned
3160    StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
3161                                                                                  "arrayof_jlong_disjoint_arraycopy");
3162    StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
3163                                                                                  "arrayof_jlong_arraycopy");
3164    StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
3165    StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
3166
3167
3168    //*** oops
3169    // Aligned versions
3170    StubRoutines::_arrayof_oop_disjoint_arraycopy        = generate_disjoint_oop_copy(true, &entry,
3171                                                                                      "arrayof_oop_disjoint_arraycopy");
3172    StubRoutines::_arrayof_oop_arraycopy                 = generate_conjoint_oop_copy(true, entry, &entry_oop_arraycopy,
3173                                                                                      "arrayof_oop_arraycopy");
3174    // Aligned versions without pre-barriers
3175    StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(true, &entry,
3176                                                                                      "arrayof_oop_disjoint_arraycopy_uninit",
3177                                                                                      /*dest_uninitialized*/true);
3178    StubRoutines::_arrayof_oop_arraycopy_uninit          = generate_conjoint_oop_copy(true, entry, NULL,
3179                                                                                      "arrayof_oop_arraycopy_uninit",
3180                                                                                      /*dest_uninitialized*/true);
3181#ifdef _LP64
3182    if (UseCompressedOops) {
3183      // With compressed oops we need unaligned versions, notice that we overwrite entry_oop_arraycopy.
3184      StubRoutines::_oop_disjoint_arraycopy            = generate_disjoint_oop_copy(false, &entry,
3185                                                                                    "oop_disjoint_arraycopy");
3186      StubRoutines::_oop_arraycopy                     = generate_conjoint_oop_copy(false, entry, &entry_oop_arraycopy,
3187                                                                                    "oop_arraycopy");
3188      // Unaligned versions without pre-barriers
3189      StubRoutines::_oop_disjoint_arraycopy_uninit     = generate_disjoint_oop_copy(false, &entry,
3190                                                                                    "oop_disjoint_arraycopy_uninit",
3191                                                                                    /*dest_uninitialized*/true);
3192      StubRoutines::_oop_arraycopy_uninit              = generate_conjoint_oop_copy(false, entry, NULL,
3193                                                                                    "oop_arraycopy_uninit",
3194                                                                                    /*dest_uninitialized*/true);
3195    } else
3196#endif
3197    {
3198      // oop arraycopy is always aligned on 32bit and 64bit without compressed oops
3199      StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
3200      StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
3201      StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
3202      StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
3203    }
3204
3205    StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
3206    StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
3207                                                                        /*dest_uninitialized*/true);
3208
3209    StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
3210                                                              entry_jbyte_arraycopy,
3211                                                              entry_jshort_arraycopy,
3212                                                              entry_jint_arraycopy,
3213                                                              entry_jlong_arraycopy);
3214    StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
3215                                                               entry_jbyte_arraycopy,
3216                                                               entry_jshort_arraycopy,
3217                                                               entry_jint_arraycopy,
3218                                                               entry_oop_arraycopy,
3219                                                               entry_jlong_arraycopy,
3220                                                               entry_checkcast_arraycopy);
3221
3222    StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
3223    StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
3224    StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
3225    StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
3226    StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
3227    StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
3228
3229    if (UseBlockZeroing) {
3230      StubRoutines::_zero_aligned_words = generate_zero_aligned_words("zero_aligned_words");
3231    }
3232  }
3233
3234  address generate_aescrypt_encryptBlock() {
3235    // required since we read expanded key 'int' array starting first element without alignment considerations
3236    assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
3237           "the following code assumes that first element of an int array is aligned to 8 bytes");
3238    __ align(CodeEntryAlignment);
3239    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
3240    Label L_load_misaligned_input, L_load_expanded_key, L_doLast128bit, L_storeOutput, L_store_misaligned_output;
3241    address start = __ pc();
3242    Register from = O0; // source byte array
3243    Register to = O1;   // destination byte array
3244    Register key = O2;  // expanded key array
3245    const Register keylen = O4; //reg for storing expanded key array length
3246
3247    // read expanded key length
3248    __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
3249
3250    // Method to address arbitrary alignment for load instructions:
3251    // Check last 3 bits of 'from' address to see if it is aligned to 8-byte boundary
3252    // If zero/aligned then continue with double FP load instructions
3253    // If not zero/mis-aligned then alignaddr will set GSR.align with number of bytes to skip during faligndata
3254    // alignaddr will also convert arbitrary aligned 'from' address to nearest 8-byte aligned address
3255    // load 3 * 8-byte components (to read 16 bytes input) in 3 different FP regs starting at this aligned address
3256    // faligndata will then extract (based on GSR.align value) the appropriate 8 bytes from the 2 source regs
3257
3258    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3259    __ andcc(from, 7, G0);
3260    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input);
3261    __ delayed()->alignaddr(from, G0, from);
3262
3263    // aligned case: load input into F54-F56
3264    __ ldf(FloatRegisterImpl::D, from, 0, F54);
3265    __ ldf(FloatRegisterImpl::D, from, 8, F56);
3266    __ ba_short(L_load_expanded_key);
3267
3268    __ BIND(L_load_misaligned_input);
3269    __ ldf(FloatRegisterImpl::D, from, 0, F54);
3270    __ ldf(FloatRegisterImpl::D, from, 8, F56);
3271    __ ldf(FloatRegisterImpl::D, from, 16, F58);
3272    __ faligndata(F54, F56, F54);
3273    __ faligndata(F56, F58, F56);
3274
3275    __ BIND(L_load_expanded_key);
3276    // Since we load expanded key buffers starting first element, 8-byte alignment is guaranteed
3277    for ( int i = 0;  i <= 38; i += 2 ) {
3278      __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i));
3279    }
3280
3281    // perform cipher transformation
3282    __ fxor(FloatRegisterImpl::D, F0, F54, F54);
3283    __ fxor(FloatRegisterImpl::D, F2, F56, F56);
3284    // rounds 1 through 8
3285    for ( int i = 4;  i <= 28; i += 8 ) {
3286      __ aes_eround01(as_FloatRegister(i), F54, F56, F58);
3287      __ aes_eround23(as_FloatRegister(i+2), F54, F56, F60);
3288      __ aes_eround01(as_FloatRegister(i+4), F58, F60, F54);
3289      __ aes_eround23(as_FloatRegister(i+6), F58, F60, F56);
3290    }
3291    __ aes_eround01(F36, F54, F56, F58); //round 9
3292    __ aes_eround23(F38, F54, F56, F60);
3293
3294    // 128-bit original key size
3295    __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pt, L_doLast128bit);
3296
3297    for ( int i = 40;  i <= 50; i += 2 ) {
3298      __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i) );
3299    }
3300    __ aes_eround01(F40, F58, F60, F54); //round 10
3301    __ aes_eround23(F42, F58, F60, F56);
3302    __ aes_eround01(F44, F54, F56, F58); //round 11
3303    __ aes_eround23(F46, F54, F56, F60);
3304
3305    // 192-bit original key size
3306    __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pt, L_storeOutput);
3307
3308    __ ldf(FloatRegisterImpl::D, key, 208, F52);
3309    __ aes_eround01(F48, F58, F60, F54); //round 12
3310    __ aes_eround23(F50, F58, F60, F56);
3311    __ ldf(FloatRegisterImpl::D, key, 216, F46);
3312    __ ldf(FloatRegisterImpl::D, key, 224, F48);
3313    __ ldf(FloatRegisterImpl::D, key, 232, F50);
3314    __ aes_eround01(F52, F54, F56, F58); //round 13
3315    __ aes_eround23(F46, F54, F56, F60);
3316    __ ba_short(L_storeOutput);
3317
3318    __ BIND(L_doLast128bit);
3319    __ ldf(FloatRegisterImpl::D, key, 160, F48);
3320    __ ldf(FloatRegisterImpl::D, key, 168, F50);
3321
3322    __ BIND(L_storeOutput);
3323    // perform last round of encryption common for all key sizes
3324    __ aes_eround01_l(F48, F58, F60, F54); //last round
3325    __ aes_eround23_l(F50, F58, F60, F56);
3326
3327    // Method to address arbitrary alignment for store instructions:
3328    // Check last 3 bits of 'dest' address to see if it is aligned to 8-byte boundary
3329    // If zero/aligned then continue with double FP store instructions
3330    // If not zero/mis-aligned then edge8n will generate edge mask in result reg (O3 in below case)
3331    // Example: If dest address is 0x07 and nearest 8-byte aligned address is 0x00 then edge mask will be 00000001
3332    // Compute (8-n) where n is # of bytes skipped by partial store(stpartialf) inst from edge mask, n=7 in this case
3333    // We get the value of n from the andcc that checks 'dest' alignment. n is available in O5 in below case.
3334    // Set GSR.align to (8-n) using alignaddr
3335    // Circular byte shift store values by n places so that the original bytes are at correct position for stpartialf
3336    // Set the arbitrarily aligned 'dest' address to nearest 8-byte aligned address
3337    // Store (partial) the original first (8-n) bytes starting at the original 'dest' address
3338    // Negate the edge mask so that the subsequent stpartialf can store the original (8-n-1)th through 8th bytes at appropriate address
3339    // We need to execute this process for both the 8-byte result values
3340
3341    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3342    __ andcc(to, 7, O5);
3343    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output);
3344    __ delayed()->edge8n(to, G0, O3);
3345
3346    // aligned case: store output into the destination array
3347    __ stf(FloatRegisterImpl::D, F54, to, 0);
3348    __ retl();
3349    __ delayed()->stf(FloatRegisterImpl::D, F56, to, 8);
3350
3351    __ BIND(L_store_misaligned_output);
3352    __ add(to, 8, O4);
3353    __ mov(8, O2);
3354    __ sub(O2, O5, O2);
3355    __ alignaddr(O2, G0, O2);
3356    __ faligndata(F54, F54, F54);
3357    __ faligndata(F56, F56, F56);
3358    __ and3(to, -8, to);
3359    __ and3(O4, -8, O4);
3360    __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY);
3361    __ stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY);
3362    __ add(to, 8, to);
3363    __ add(O4, 8, O4);
3364    __ orn(G0, O3, O3);
3365    __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY);
3366    __ retl();
3367    __ delayed()->stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY);
3368
3369    return start;
3370  }
3371
3372  address generate_aescrypt_decryptBlock() {
3373    assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
3374           "the following code assumes that first element of an int array is aligned to 8 bytes");
3375    // required since we read original key 'byte' array as well in the decryption stubs
3376    assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
3377           "the following code assumes that first element of a byte array is aligned to 8 bytes");
3378    __ align(CodeEntryAlignment);
3379    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
3380    address start = __ pc();
3381    Label L_load_misaligned_input, L_load_original_key, L_expand192bit, L_expand256bit, L_reload_misaligned_input;
3382    Label L_256bit_transform, L_common_transform, L_store_misaligned_output;
3383    Register from = O0; // source byte array
3384    Register to = O1;   // destination byte array
3385    Register key = O2;  // expanded key array
3386    Register original_key = O3;  // original key array only required during decryption
3387    const Register keylen = O4;  // reg for storing expanded key array length
3388
3389    // read expanded key array length
3390    __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
3391
3392    // save 'from' since we may need to recheck alignment in case of 256-bit decryption
3393    __ mov(from, G1);
3394
3395    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3396    __ andcc(from, 7, G0);
3397    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input);
3398    __ delayed()->alignaddr(from, G0, from);
3399
3400    // aligned case: load input into F52-F54
3401    __ ldf(FloatRegisterImpl::D, from, 0, F52);
3402    __ ldf(FloatRegisterImpl::D, from, 8, F54);
3403    __ ba_short(L_load_original_key);
3404
3405    __ BIND(L_load_misaligned_input);
3406    __ ldf(FloatRegisterImpl::D, from, 0, F52);
3407    __ ldf(FloatRegisterImpl::D, from, 8, F54);
3408    __ ldf(FloatRegisterImpl::D, from, 16, F56);
3409    __ faligndata(F52, F54, F52);
3410    __ faligndata(F54, F56, F54);
3411
3412    __ BIND(L_load_original_key);
3413    // load original key from SunJCE expanded decryption key
3414    // Since we load original key buffer starting first element, 8-byte alignment is guaranteed
3415    for ( int i = 0;  i <= 3; i++ ) {
3416      __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
3417    }
3418
3419    // 256-bit original key size
3420    __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_expand256bit);
3421
3422    // 192-bit original key size
3423    __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_expand192bit);
3424
3425    // 128-bit original key size
3426    // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
3427    for ( int i = 0;  i <= 36; i += 4 ) {
3428      __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+2), i/4, as_FloatRegister(i+4));
3429      __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+4), as_FloatRegister(i+6));
3430    }
3431
3432    // perform 128-bit key specific inverse cipher transformation
3433    __ fxor(FloatRegisterImpl::D, F42, F54, F54);
3434    __ fxor(FloatRegisterImpl::D, F40, F52, F52);
3435    __ ba_short(L_common_transform);
3436
3437    __ BIND(L_expand192bit);
3438
3439    // start loading rest of the 192-bit key
3440    __ ldf(FloatRegisterImpl::S, original_key, 16, F4);
3441    __ ldf(FloatRegisterImpl::S, original_key, 20, F5);
3442
3443    // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
3444    for ( int i = 0;  i <= 36; i += 6 ) {
3445      __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+4), i/6, as_FloatRegister(i+6));
3446      __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+6), as_FloatRegister(i+8));
3447      __ aes_kexpand2(as_FloatRegister(i+4), as_FloatRegister(i+8), as_FloatRegister(i+10));
3448    }
3449    __ aes_kexpand1(F42, F46, 7, F48);
3450    __ aes_kexpand2(F44, F48, F50);
3451
3452    // perform 192-bit key specific inverse cipher transformation
3453    __ fxor(FloatRegisterImpl::D, F50, F54, F54);
3454    __ fxor(FloatRegisterImpl::D, F48, F52, F52);
3455    __ aes_dround23(F46, F52, F54, F58);
3456    __ aes_dround01(F44, F52, F54, F56);
3457    __ aes_dround23(F42, F56, F58, F54);
3458    __ aes_dround01(F40, F56, F58, F52);
3459    __ ba_short(L_common_transform);
3460
3461    __ BIND(L_expand256bit);
3462
3463    // load rest of the 256-bit key
3464    for ( int i = 4;  i <= 7; i++ ) {
3465      __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
3466    }
3467
3468    // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
3469    for ( int i = 0;  i <= 40; i += 8 ) {
3470      __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+6), i/8, as_FloatRegister(i+8));
3471      __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+8), as_FloatRegister(i+10));
3472      __ aes_kexpand0(as_FloatRegister(i+4), as_FloatRegister(i+10), as_FloatRegister(i+12));
3473      __ aes_kexpand2(as_FloatRegister(i+6), as_FloatRegister(i+12), as_FloatRegister(i+14));
3474    }
3475    __ aes_kexpand1(F48, F54, 6, F56);
3476    __ aes_kexpand2(F50, F56, F58);
3477
3478    for ( int i = 0;  i <= 6; i += 2 ) {
3479      __ fsrc2(FloatRegisterImpl::D, as_FloatRegister(58-i), as_FloatRegister(i));
3480    }
3481
3482    // reload original 'from' address
3483    __ mov(G1, from);
3484
3485    // re-check 8-byte alignment
3486    __ andcc(from, 7, G0);
3487    __ br(Assembler::notZero, true, Assembler::pn, L_reload_misaligned_input);
3488    __ delayed()->alignaddr(from, G0, from);
3489
3490    // aligned case: load input into F52-F54
3491    __ ldf(FloatRegisterImpl::D, from, 0, F52);
3492    __ ldf(FloatRegisterImpl::D, from, 8, F54);
3493    __ ba_short(L_256bit_transform);
3494
3495    __ BIND(L_reload_misaligned_input);
3496    __ ldf(FloatRegisterImpl::D, from, 0, F52);
3497    __ ldf(FloatRegisterImpl::D, from, 8, F54);
3498    __ ldf(FloatRegisterImpl::D, from, 16, F56);
3499    __ faligndata(F52, F54, F52);
3500    __ faligndata(F54, F56, F54);
3501
3502    // perform 256-bit key specific inverse cipher transformation
3503    __ BIND(L_256bit_transform);
3504    __ fxor(FloatRegisterImpl::D, F0, F54, F54);
3505    __ fxor(FloatRegisterImpl::D, F2, F52, F52);
3506    __ aes_dround23(F4, F52, F54, F58);
3507    __ aes_dround01(F6, F52, F54, F56);
3508    __ aes_dround23(F50, F56, F58, F54);
3509    __ aes_dround01(F48, F56, F58, F52);
3510    __ aes_dround23(F46, F52, F54, F58);
3511    __ aes_dround01(F44, F52, F54, F56);
3512    __ aes_dround23(F42, F56, F58, F54);
3513    __ aes_dround01(F40, F56, F58, F52);
3514
3515    for ( int i = 0;  i <= 7; i++ ) {
3516      __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
3517    }
3518
3519    // perform inverse cipher transformations common for all key sizes
3520    __ BIND(L_common_transform);
3521    for ( int i = 38;  i >= 6; i -= 8 ) {
3522      __ aes_dround23(as_FloatRegister(i), F52, F54, F58);
3523      __ aes_dround01(as_FloatRegister(i-2), F52, F54, F56);
3524      if ( i != 6) {
3525        __ aes_dround23(as_FloatRegister(i-4), F56, F58, F54);
3526        __ aes_dround01(as_FloatRegister(i-6), F56, F58, F52);
3527      } else {
3528        __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F54);
3529        __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F52);
3530      }
3531    }
3532
3533    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3534    __ andcc(to, 7, O5);
3535    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output);
3536    __ delayed()->edge8n(to, G0, O3);
3537
3538    // aligned case: store output into the destination array
3539    __ stf(FloatRegisterImpl::D, F52, to, 0);
3540    __ retl();
3541    __ delayed()->stf(FloatRegisterImpl::D, F54, to, 8);
3542
3543    __ BIND(L_store_misaligned_output);
3544    __ add(to, 8, O4);
3545    __ mov(8, O2);
3546    __ sub(O2, O5, O2);
3547    __ alignaddr(O2, G0, O2);
3548    __ faligndata(F52, F52, F52);
3549    __ faligndata(F54, F54, F54);
3550    __ and3(to, -8, to);
3551    __ and3(O4, -8, O4);
3552    __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY);
3553    __ stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY);
3554    __ add(to, 8, to);
3555    __ add(O4, 8, O4);
3556    __ orn(G0, O3, O3);
3557    __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY);
3558    __ retl();
3559    __ delayed()->stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY);
3560
3561    return start;
3562  }
3563
3564  address generate_cipherBlockChaining_encryptAESCrypt() {
3565    assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
3566           "the following code assumes that first element of an int array is aligned to 8 bytes");
3567    assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
3568           "the following code assumes that first element of a byte array is aligned to 8 bytes");
3569    __ align(CodeEntryAlignment);
3570    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
3571    Label L_cbcenc128, L_load_misaligned_input_128bit, L_128bit_transform, L_store_misaligned_output_128bit;
3572    Label L_check_loop_end_128bit, L_cbcenc192, L_load_misaligned_input_192bit, L_192bit_transform;
3573    Label L_store_misaligned_output_192bit, L_check_loop_end_192bit, L_cbcenc256, L_load_misaligned_input_256bit;
3574    Label L_256bit_transform, L_store_misaligned_output_256bit, L_check_loop_end_256bit;
3575    address start = __ pc();
3576    Register from = I0; // source byte array
3577    Register to = I1;   // destination byte array
3578    Register key = I2;  // expanded key array
3579    Register rvec = I3; // init vector
3580    const Register len_reg = I4; // cipher length
3581    const Register keylen = I5;  // reg for storing expanded key array length
3582
3583    __ save_frame(0);
3584    // save cipher len to return in the end
3585    __ mov(len_reg, L0);
3586
3587    // read expanded key length
3588    __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
3589
3590    // load initial vector, 8-byte alignment is guranteed
3591    __ ldf(FloatRegisterImpl::D, rvec, 0, F60);
3592    __ ldf(FloatRegisterImpl::D, rvec, 8, F62);
3593    // load key, 8-byte alignment is guranteed
3594    __ ldx(key,0,G1);
3595    __ ldx(key,8,G5);
3596
3597    // start loading expanded key, 8-byte alignment is guranteed
3598    for ( int i = 0, j = 16;  i <= 38; i += 2, j += 8 ) {
3599      __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i));
3600    }
3601
3602    // 128-bit original key size
3603    __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pt, L_cbcenc128);
3604
3605    for ( int i = 40, j = 176;  i <= 46; i += 2, j += 8 ) {
3606      __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i));
3607    }
3608
3609    // 192-bit original key size
3610    __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pt, L_cbcenc192);
3611
3612    for ( int i = 48, j = 208;  i <= 54; i += 2, j += 8 ) {
3613      __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i));
3614    }
3615
3616    // 256-bit original key size
3617    __ ba_short(L_cbcenc256);
3618
3619    __ align(OptoLoopAlignment);
3620    __ BIND(L_cbcenc128);
3621    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3622    __ andcc(from, 7, G0);
3623    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_128bit);
3624    __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
3625
3626    // aligned case: load input into G3 and G4
3627    __ ldx(from,0,G3);
3628    __ ldx(from,8,G4);
3629    __ ba_short(L_128bit_transform);
3630
3631    __ BIND(L_load_misaligned_input_128bit);
3632    // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption
3633    __ alignaddr(from, G0, from);
3634    __ ldf(FloatRegisterImpl::D, from, 0, F48);
3635    __ ldf(FloatRegisterImpl::D, from, 8, F50);
3636    __ ldf(FloatRegisterImpl::D, from, 16, F52);
3637    __ faligndata(F48, F50, F48);
3638    __ faligndata(F50, F52, F50);
3639    __ movdtox(F48, G3);
3640    __ movdtox(F50, G4);
3641    __ mov(L1, from);
3642
3643    __ BIND(L_128bit_transform);
3644    __ xor3(G1,G3,G3);
3645    __ xor3(G5,G4,G4);
3646    __ movxtod(G3,F56);
3647    __ movxtod(G4,F58);
3648    __ fxor(FloatRegisterImpl::D, F60, F56, F60);
3649    __ fxor(FloatRegisterImpl::D, F62, F58, F62);
3650
3651    // TEN_EROUNDS
3652    for ( int i = 0;  i <= 32; i += 8 ) {
3653      __ aes_eround01(as_FloatRegister(i), F60, F62, F56);
3654      __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58);
3655      if (i != 32 ) {
3656        __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60);
3657        __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62);
3658      } else {
3659        __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60);
3660        __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62);
3661      }
3662    }
3663
3664    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3665    __ andcc(to, 7, L1);
3666    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_128bit);
3667    __ delayed()->edge8n(to, G0, L2);
3668
3669    // aligned case: store output into the destination array
3670    __ stf(FloatRegisterImpl::D, F60, to, 0);
3671    __ stf(FloatRegisterImpl::D, F62, to, 8);
3672    __ ba_short(L_check_loop_end_128bit);
3673
3674    __ BIND(L_store_misaligned_output_128bit);
3675    __ add(to, 8, L3);
3676    __ mov(8, L4);
3677    __ sub(L4, L1, L4);
3678    __ alignaddr(L4, G0, L4);
3679    // save cipher text before circular right shift
3680    // as it needs to be stored as iv for next block (see code before next retl)
3681    __ movdtox(F60, L6);
3682    __ movdtox(F62, L7);
3683    __ faligndata(F60, F60, F60);
3684    __ faligndata(F62, F62, F62);
3685    __ mov(to, L5);
3686    __ and3(to, -8, to);
3687    __ and3(L3, -8, L3);
3688    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3689    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3690    __ add(to, 8, to);
3691    __ add(L3, 8, L3);
3692    __ orn(G0, L2, L2);
3693    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3694    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3695    __ mov(L5, to);
3696    __ movxtod(L6, F60);
3697    __ movxtod(L7, F62);
3698
3699    __ BIND(L_check_loop_end_128bit);
3700    __ add(from, 16, from);
3701    __ add(to, 16, to);
3702    __ subcc(len_reg, 16, len_reg);
3703    __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc128);
3704    __ delayed()->nop();
3705    // re-init intial vector for next block, 8-byte alignment is guaranteed
3706    __ stf(FloatRegisterImpl::D, F60, rvec, 0);
3707    __ stf(FloatRegisterImpl::D, F62, rvec, 8);
3708    __ mov(L0, I0);
3709    __ ret();
3710    __ delayed()->restore();
3711
3712    __ align(OptoLoopAlignment);
3713    __ BIND(L_cbcenc192);
3714    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3715    __ andcc(from, 7, G0);
3716    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_192bit);
3717    __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
3718
3719    // aligned case: load input into G3 and G4
3720    __ ldx(from,0,G3);
3721    __ ldx(from,8,G4);
3722    __ ba_short(L_192bit_transform);
3723
3724    __ BIND(L_load_misaligned_input_192bit);
3725    // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption
3726    __ alignaddr(from, G0, from);
3727    __ ldf(FloatRegisterImpl::D, from, 0, F48);
3728    __ ldf(FloatRegisterImpl::D, from, 8, F50);
3729    __ ldf(FloatRegisterImpl::D, from, 16, F52);
3730    __ faligndata(F48, F50, F48);
3731    __ faligndata(F50, F52, F50);
3732    __ movdtox(F48, G3);
3733    __ movdtox(F50, G4);
3734    __ mov(L1, from);
3735
3736    __ BIND(L_192bit_transform);
3737    __ xor3(G1,G3,G3);
3738    __ xor3(G5,G4,G4);
3739    __ movxtod(G3,F56);
3740    __ movxtod(G4,F58);
3741    __ fxor(FloatRegisterImpl::D, F60, F56, F60);
3742    __ fxor(FloatRegisterImpl::D, F62, F58, F62);
3743
3744    // TWELEVE_EROUNDS
3745    for ( int i = 0;  i <= 40; i += 8 ) {
3746      __ aes_eround01(as_FloatRegister(i), F60, F62, F56);
3747      __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58);
3748      if (i != 40 ) {
3749        __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60);
3750        __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62);
3751      } else {
3752        __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60);
3753        __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62);
3754      }
3755    }
3756
3757    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3758    __ andcc(to, 7, L1);
3759    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_192bit);
3760    __ delayed()->edge8n(to, G0, L2);
3761
3762    // aligned case: store output into the destination array
3763    __ stf(FloatRegisterImpl::D, F60, to, 0);
3764    __ stf(FloatRegisterImpl::D, F62, to, 8);
3765    __ ba_short(L_check_loop_end_192bit);
3766
3767    __ BIND(L_store_misaligned_output_192bit);
3768    __ add(to, 8, L3);
3769    __ mov(8, L4);
3770    __ sub(L4, L1, L4);
3771    __ alignaddr(L4, G0, L4);
3772    __ movdtox(F60, L6);
3773    __ movdtox(F62, L7);
3774    __ faligndata(F60, F60, F60);
3775    __ faligndata(F62, F62, F62);
3776    __ mov(to, L5);
3777    __ and3(to, -8, to);
3778    __ and3(L3, -8, L3);
3779    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3780    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3781    __ add(to, 8, to);
3782    __ add(L3, 8, L3);
3783    __ orn(G0, L2, L2);
3784    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3785    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3786    __ mov(L5, to);
3787    __ movxtod(L6, F60);
3788    __ movxtod(L7, F62);
3789
3790    __ BIND(L_check_loop_end_192bit);
3791    __ add(from, 16, from);
3792    __ subcc(len_reg, 16, len_reg);
3793    __ add(to, 16, to);
3794    __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc192);
3795    __ delayed()->nop();
3796    // re-init intial vector for next block, 8-byte alignment is guaranteed
3797    __ stf(FloatRegisterImpl::D, F60, rvec, 0);
3798    __ stf(FloatRegisterImpl::D, F62, rvec, 8);
3799    __ mov(L0, I0);
3800    __ ret();
3801    __ delayed()->restore();
3802
3803    __ align(OptoLoopAlignment);
3804    __ BIND(L_cbcenc256);
3805    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3806    __ andcc(from, 7, G0);
3807    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_256bit);
3808    __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
3809
3810    // aligned case: load input into G3 and G4
3811    __ ldx(from,0,G3);
3812    __ ldx(from,8,G4);
3813    __ ba_short(L_256bit_transform);
3814
3815    __ BIND(L_load_misaligned_input_256bit);
3816    // cannot clobber F48, F50 and F52. F56, F58 can be used though
3817    __ alignaddr(from, G0, from);
3818    __ movdtox(F60, L2); // save F60 before overwriting
3819    __ ldf(FloatRegisterImpl::D, from, 0, F56);
3820    __ ldf(FloatRegisterImpl::D, from, 8, F58);
3821    __ ldf(FloatRegisterImpl::D, from, 16, F60);
3822    __ faligndata(F56, F58, F56);
3823    __ faligndata(F58, F60, F58);
3824    __ movdtox(F56, G3);
3825    __ movdtox(F58, G4);
3826    __ mov(L1, from);
3827    __ movxtod(L2, F60);
3828
3829    __ BIND(L_256bit_transform);
3830    __ xor3(G1,G3,G3);
3831    __ xor3(G5,G4,G4);
3832    __ movxtod(G3,F56);
3833    __ movxtod(G4,F58);
3834    __ fxor(FloatRegisterImpl::D, F60, F56, F60);
3835    __ fxor(FloatRegisterImpl::D, F62, F58, F62);
3836
3837    // FOURTEEN_EROUNDS
3838    for ( int i = 0;  i <= 48; i += 8 ) {
3839      __ aes_eround01(as_FloatRegister(i), F60, F62, F56);
3840      __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58);
3841      if (i != 48 ) {
3842        __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60);
3843        __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62);
3844      } else {
3845        __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60);
3846        __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62);
3847      }
3848    }
3849
3850    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3851    __ andcc(to, 7, L1);
3852    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_256bit);
3853    __ delayed()->edge8n(to, G0, L2);
3854
3855    // aligned case: store output into the destination array
3856    __ stf(FloatRegisterImpl::D, F60, to, 0);
3857    __ stf(FloatRegisterImpl::D, F62, to, 8);
3858    __ ba_short(L_check_loop_end_256bit);
3859
3860    __ BIND(L_store_misaligned_output_256bit);
3861    __ add(to, 8, L3);
3862    __ mov(8, L4);
3863    __ sub(L4, L1, L4);
3864    __ alignaddr(L4, G0, L4);
3865    __ movdtox(F60, L6);
3866    __ movdtox(F62, L7);
3867    __ faligndata(F60, F60, F60);
3868    __ faligndata(F62, F62, F62);
3869    __ mov(to, L5);
3870    __ and3(to, -8, to);
3871    __ and3(L3, -8, L3);
3872    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3873    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3874    __ add(to, 8, to);
3875    __ add(L3, 8, L3);
3876    __ orn(G0, L2, L2);
3877    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3878    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3879    __ mov(L5, to);
3880    __ movxtod(L6, F60);
3881    __ movxtod(L7, F62);
3882
3883    __ BIND(L_check_loop_end_256bit);
3884    __ add(from, 16, from);
3885    __ subcc(len_reg, 16, len_reg);
3886    __ add(to, 16, to);
3887    __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc256);
3888    __ delayed()->nop();
3889    // re-init intial vector for next block, 8-byte alignment is guaranteed
3890    __ stf(FloatRegisterImpl::D, F60, rvec, 0);
3891    __ stf(FloatRegisterImpl::D, F62, rvec, 8);
3892    __ mov(L0, I0);
3893    __ ret();
3894    __ delayed()->restore();
3895
3896    return start;
3897  }
3898
3899  address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
3900    assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
3901           "the following code assumes that first element of an int array is aligned to 8 bytes");
3902    assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
3903           "the following code assumes that first element of a byte array is aligned to 8 bytes");
3904    __ align(CodeEntryAlignment);
3905    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
3906    Label L_cbcdec_end, L_expand192bit, L_expand256bit, L_dec_first_block_start;
3907    Label L_dec_first_block128, L_dec_first_block192, L_dec_next2_blocks128, L_dec_next2_blocks192, L_dec_next2_blocks256;
3908    Label L_load_misaligned_input_first_block, L_transform_first_block, L_load_misaligned_next2_blocks128, L_transform_next2_blocks128;
3909    Label L_load_misaligned_next2_blocks192, L_transform_next2_blocks192, L_load_misaligned_next2_blocks256, L_transform_next2_blocks256;
3910    Label L_store_misaligned_output_first_block, L_check_decrypt_end, L_store_misaligned_output_next2_blocks128;
3911    Label L_check_decrypt_loop_end128, L_store_misaligned_output_next2_blocks192, L_check_decrypt_loop_end192;
3912    Label L_store_misaligned_output_next2_blocks256, L_check_decrypt_loop_end256;
3913    address start = __ pc();
3914    Register from = I0; // source byte array
3915    Register to = I1;   // destination byte array
3916    Register key = I2;  // expanded key array
3917    Register rvec = I3; // init vector
3918    const Register len_reg = I4; // cipher length
3919    const Register original_key = I5;  // original key array only required during decryption
3920    const Register keylen = L6;  // reg for storing expanded key array length
3921
3922    __ save_frame(0); //args are read from I* registers since we save the frame in the beginning
3923    // save cipher len to return in the end
3924    __ mov(len_reg, L7);
3925
3926    // load original key from SunJCE expanded decryption key
3927    // Since we load original key buffer starting first element, 8-byte alignment is guaranteed
3928    for ( int i = 0;  i <= 3; i++ ) {
3929      __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
3930    }
3931
3932    // load initial vector, 8-byte alignment is guaranteed
3933    __ ldx(rvec,0,L0);
3934    __ ldx(rvec,8,L1);
3935
3936    // read expanded key array length
3937    __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
3938
3939    // 256-bit original key size
3940    __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_expand256bit);
3941
3942    // 192-bit original key size
3943    __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_expand192bit);
3944
3945    // 128-bit original key size
3946    // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
3947    for ( int i = 0;  i <= 36; i += 4 ) {
3948      __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+2), i/4, as_FloatRegister(i+4));
3949      __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+4), as_FloatRegister(i+6));
3950    }
3951
3952    // load expanded key[last-1] and key[last] elements
3953    __ movdtox(F40,L2);
3954    __ movdtox(F42,L3);
3955
3956    __ and3(len_reg, 16, L4);
3957    __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks128);
3958    __ nop();
3959
3960    __ ba_short(L_dec_first_block_start);
3961
3962    __ BIND(L_expand192bit);
3963    // load rest of the 192-bit key
3964    __ ldf(FloatRegisterImpl::S, original_key, 16, F4);
3965    __ ldf(FloatRegisterImpl::S, original_key, 20, F5);
3966
3967    // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
3968    for ( int i = 0;  i <= 36; i += 6 ) {
3969      __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+4), i/6, as_FloatRegister(i+6));
3970      __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+6), as_FloatRegister(i+8));
3971      __ aes_kexpand2(as_FloatRegister(i+4), as_FloatRegister(i+8), as_FloatRegister(i+10));
3972    }
3973    __ aes_kexpand1(F42, F46, 7, F48);
3974    __ aes_kexpand2(F44, F48, F50);
3975
3976    // load expanded key[last-1] and key[last] elements
3977    __ movdtox(F48,L2);
3978    __ movdtox(F50,L3);
3979
3980    __ and3(len_reg, 16, L4);
3981    __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks192);
3982    __ nop();
3983
3984    __ ba_short(L_dec_first_block_start);
3985
3986    __ BIND(L_expand256bit);
3987    // load rest of the 256-bit key
3988    for ( int i = 4;  i <= 7; i++ ) {
3989      __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
3990    }
3991
3992    // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
3993    for ( int i = 0;  i <= 40; i += 8 ) {
3994      __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+6), i/8, as_FloatRegister(i+8));
3995      __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+8), as_FloatRegister(i+10));
3996      __ aes_kexpand0(as_FloatRegister(i+4), as_FloatRegister(i+10), as_FloatRegister(i+12));
3997      __ aes_kexpand2(as_FloatRegister(i+6), as_FloatRegister(i+12), as_FloatRegister(i+14));
3998    }
3999    __ aes_kexpand1(F48, F54, 6, F56);
4000    __ aes_kexpand2(F50, F56, F58);
4001
4002    // load expanded key[last-1] and key[last] elements
4003    __ movdtox(F56,L2);
4004    __ movdtox(F58,L3);
4005
4006    __ and3(len_reg, 16, L4);
4007    __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks256);
4008
4009    __ BIND(L_dec_first_block_start);
4010    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
4011    __ andcc(from, 7, G0);
4012    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_first_block);
4013    __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
4014
4015    // aligned case: load input into L4 and L5
4016    __ ldx(from,0,L4);
4017    __ ldx(from,8,L5);
4018    __ ba_short(L_transform_first_block);
4019
4020    __ BIND(L_load_misaligned_input_first_block);
4021    __ alignaddr(from, G0, from);
4022    // F58, F60, F62 can be clobbered
4023    __ ldf(FloatRegisterImpl::D, from, 0, F58);
4024    __ ldf(FloatRegisterImpl::D, from, 8, F60);
4025    __ ldf(FloatRegisterImpl::D, from, 16, F62);
4026    __ faligndata(F58, F60, F58);
4027    __ faligndata(F60, F62, F60);
4028    __ movdtox(F58, L4);
4029    __ movdtox(F60, L5);
4030    __ mov(G1, from);
4031
4032    __ BIND(L_transform_first_block);
4033    __ xor3(L2,L4,G1);
4034    __ movxtod(G1,F60);
4035    __ xor3(L3,L5,G1);
4036    __ movxtod(G1,F62);
4037
4038    // 128-bit original key size
4039    __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pn, L_dec_first_block128);
4040
4041    // 192-bit original key size
4042    __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_dec_first_block192);
4043
4044    __ aes_dround23(F54, F60, F62, F58);
4045    __ aes_dround01(F52, F60, F62, F56);
4046    __ aes_dround23(F50, F56, F58, F62);
4047    __ aes_dround01(F48, F56, F58, F60);
4048
4049    __ BIND(L_dec_first_block192);
4050    __ aes_dround23(F46, F60, F62, F58);
4051    __ aes_dround01(F44, F60, F62, F56);
4052    __ aes_dround23(F42, F56, F58, F62);
4053    __ aes_dround01(F40, F56, F58, F60);
4054
4055    __ BIND(L_dec_first_block128);
4056    for ( int i = 38;  i >= 6; i -= 8 ) {
4057      __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
4058      __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
4059      if ( i != 6) {
4060        __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
4061        __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
4062      } else {
4063        __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62);
4064        __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60);
4065      }
4066    }
4067
4068    __ movxtod(L0,F56);
4069    __ movxtod(L1,F58);
4070    __ mov(L4,L0);
4071    __ mov(L5,L1);
4072    __ fxor(FloatRegisterImpl::D, F56, F60, F60);
4073    __ fxor(FloatRegisterImpl::D, F58, F62, F62);
4074
4075    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
4076    __ andcc(to, 7, G1);
4077    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_first_block);
4078    __ delayed()->edge8n(to, G0, G2);
4079
4080    // aligned case: store output into the destination array
4081    __ stf(FloatRegisterImpl::D, F60, to, 0);
4082    __ stf(FloatRegisterImpl::D, F62, to, 8);
4083    __ ba_short(L_check_decrypt_end);
4084
4085    __ BIND(L_store_misaligned_output_first_block);
4086    __ add(to, 8, G3);
4087    __ mov(8, G4);
4088    __ sub(G4, G1, G4);
4089    __ alignaddr(G4, G0, G4);
4090    __ faligndata(F60, F60, F60);
4091    __ faligndata(F62, F62, F62);
4092    __ mov(to, G1);
4093    __ and3(to, -8, to);
4094    __ and3(G3, -8, G3);
4095    __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY);
4096    __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY);
4097    __ add(to, 8, to);
4098    __ add(G3, 8, G3);
4099    __ orn(G0, G2, G2);
4100    __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY);
4101    __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY);
4102    __ mov(G1, to);
4103
4104    __ BIND(L_check_decrypt_end);
4105    __ add(from, 16, from);
4106    __ add(to, 16, to);
4107    __ subcc(len_reg, 16, len_reg);
4108    __ br(Assembler::equal, false, Assembler::pt, L_cbcdec_end);
4109    __ delayed()->nop();
4110
4111    // 256-bit original key size
4112    __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_dec_next2_blocks256);
4113
4114    // 192-bit original key size
4115    __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_dec_next2_blocks192);
4116
4117    __ align(OptoLoopAlignment);
4118    __ BIND(L_dec_next2_blocks128);
4119    __ nop();
4120
4121    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
4122    __ andcc(from, 7, G0);
4123    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks128);
4124    __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
4125
4126    // aligned case: load input into G4, G5, L4 and L5
4127    __ ldx(from,0,G4);
4128    __ ldx(from,8,G5);
4129    __ ldx(from,16,L4);
4130    __ ldx(from,24,L5);
4131    __ ba_short(L_transform_next2_blocks128);
4132
4133    __ BIND(L_load_misaligned_next2_blocks128);
4134    __ alignaddr(from, G0, from);
4135    // F40, F42, F58, F60, F62 can be clobbered
4136    __ ldf(FloatRegisterImpl::D, from, 0, F40);
4137    __ ldf(FloatRegisterImpl::D, from, 8, F42);
4138    __ ldf(FloatRegisterImpl::D, from, 16, F60);
4139    __ ldf(FloatRegisterImpl::D, from, 24, F62);
4140    __ ldf(FloatRegisterImpl::D, from, 32, F58);
4141    __ faligndata(F40, F42, F40);
4142    __ faligndata(F42, F60, F42);
4143    __ faligndata(F60, F62, F60);
4144    __ faligndata(F62, F58, F62);
4145    __ movdtox(F40, G4);
4146    __ movdtox(F42, G5);
4147    __ movdtox(F60, L4);
4148    __ movdtox(F62, L5);
4149    __ mov(G1, from);
4150
4151    __ BIND(L_transform_next2_blocks128);
4152    // F40:F42 used for first 16-bytes
4153    __ xor3(L2,G4,G1);
4154    __ movxtod(G1,F40);
4155    __ xor3(L3,G5,G1);
4156    __ movxtod(G1,F42);
4157
4158    // F60:F62 used for next 16-bytes
4159    __ xor3(L2,L4,G1);
4160    __ movxtod(G1,F60);
4161    __ xor3(L3,L5,G1);
4162    __ movxtod(G1,F62);
4163
4164    for ( int i = 38;  i >= 6; i -= 8 ) {
4165      __ aes_dround23(as_FloatRegister(i), F40, F42, F44);
4166      __ aes_dround01(as_FloatRegister(i-2), F40, F42, F46);
4167      __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
4168      __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
4169      if (i != 6 ) {
4170        __ aes_dround23(as_FloatRegister(i-4), F46, F44, F42);
4171        __ aes_dround01(as_FloatRegister(i-6), F46, F44, F40);
4172        __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
4173        __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
4174      } else {
4175        __ aes_dround23_l(as_FloatRegister(i-4), F46, F44, F42);
4176        __ aes_dround01_l(as_FloatRegister(i-6), F46, F44, F40);
4177        __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62);
4178        __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60);
4179      }
4180    }
4181
4182    __ movxtod(L0,F46);
4183    __ movxtod(L1,F44);
4184    __ fxor(FloatRegisterImpl::D, F46, F40, F40);
4185    __ fxor(FloatRegisterImpl::D, F44, F42, F42);
4186
4187    __ movxtod(G4,F56);
4188    __ movxtod(G5,F58);
4189    __ mov(L4,L0);
4190    __ mov(L5,L1);
4191    __ fxor(FloatRegisterImpl::D, F56, F60, F60);
4192    __ fxor(FloatRegisterImpl::D, F58, F62, F62);
4193
4194    // For mis-aligned store of 32 bytes of result we can do:
4195    // Circular right-shift all 4 FP registers so that 'head' and 'tail'
4196    // parts that need to be stored starting at mis-aligned address are in a FP reg
4197    // the other 3 FP regs can thus be stored using regular store
4198    // we then use the edge + partial-store mechanism to store the 'head' and 'tail' parts
4199
4200    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
4201    __ andcc(to, 7, G1);
4202    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks128);
4203    __ delayed()->edge8n(to, G0, G2);
4204
4205    // aligned case: store output into the destination array
4206    __ stf(FloatRegisterImpl::D, F40, to, 0);
4207    __ stf(FloatRegisterImpl::D, F42, to, 8);
4208    __ stf(FloatRegisterImpl::D, F60, to, 16);
4209    __ stf(FloatRegisterImpl::D, F62, to, 24);
4210    __ ba_short(L_check_decrypt_loop_end128);
4211
4212    __ BIND(L_store_misaligned_output_next2_blocks128);
4213    __ mov(8, G4);
4214    __ sub(G4, G1, G4);
4215    __ alignaddr(G4, G0, G4);
4216    __ faligndata(F40, F42, F56); // F56 can be clobbered
4217    __ faligndata(F42, F60, F42);
4218    __ faligndata(F60, F62, F60);
4219    __ faligndata(F62, F40, F40);
4220    __ mov(to, G1);
4221    __ and3(to, -8, to);
4222    __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY);
4223    __ stf(FloatRegisterImpl::D, F56, to, 8);
4224    __ stf(FloatRegisterImpl::D, F42, to, 16);
4225    __ stf(FloatRegisterImpl::D, F60, to, 24);
4226    __ add(to, 32, to);
4227    __ orn(G0, G2, G2);
4228    __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY);
4229    __ mov(G1, to);
4230
4231    __ BIND(L_check_decrypt_loop_end128);
4232    __ add(from, 32, from);
4233    __ add(to, 32, to);
4234    __ subcc(len_reg, 32, len_reg);
4235    __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks128);
4236    __ delayed()->nop();
4237    __ ba_short(L_cbcdec_end);
4238
4239    __ align(OptoLoopAlignment);
4240    __ BIND(L_dec_next2_blocks192);
4241    __ nop();
4242
4243    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
4244    __ andcc(from, 7, G0);
4245    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks192);
4246    __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
4247
4248    // aligned case: load input into G4, G5, L4 and L5
4249    __ ldx(from,0,G4);
4250    __ ldx(from,8,G5);
4251    __ ldx(from,16,L4);
4252    __ ldx(from,24,L5);
4253    __ ba_short(L_transform_next2_blocks192);
4254
4255    __ BIND(L_load_misaligned_next2_blocks192);
4256    __ alignaddr(from, G0, from);
4257    // F48, F50, F52, F60, F62 can be clobbered
4258    __ ldf(FloatRegisterImpl::D, from, 0, F48);
4259    __ ldf(FloatRegisterImpl::D, from, 8, F50);
4260    __ ldf(FloatRegisterImpl::D, from, 16, F60);
4261    __ ldf(FloatRegisterImpl::D, from, 24, F62);
4262    __ ldf(FloatRegisterImpl::D, from, 32, F52);
4263    __ faligndata(F48, F50, F48);
4264    __ faligndata(F50, F60, F50);
4265    __ faligndata(F60, F62, F60);
4266    __ faligndata(F62, F52, F62);
4267    __ movdtox(F48, G4);
4268    __ movdtox(F50, G5);
4269    __ movdtox(F60, L4);
4270    __ movdtox(F62, L5);
4271    __ mov(G1, from);
4272
4273    __ BIND(L_transform_next2_blocks192);
4274    // F48:F50 used for first 16-bytes
4275    __ xor3(L2,G4,G1);
4276    __ movxtod(G1,F48);
4277    __ xor3(L3,G5,G1);
4278    __ movxtod(G1,F50);
4279
4280    // F60:F62 used for next 16-bytes
4281    __ xor3(L2,L4,G1);
4282    __ movxtod(G1,F60);
4283    __ xor3(L3,L5,G1);
4284    __ movxtod(G1,F62);
4285
4286    for ( int i = 46;  i >= 6; i -= 8 ) {
4287      __ aes_dround23(as_FloatRegister(i), F48, F50, F52);
4288      __ aes_dround01(as_FloatRegister(i-2), F48, F50, F54);
4289      __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
4290      __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
4291      if (i != 6 ) {
4292        __ aes_dround23(as_FloatRegister(i-4), F54, F52, F50);
4293        __ aes_dround01(as_FloatRegister(i-6), F54, F52, F48);
4294        __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
4295        __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
4296      } else {
4297        __ aes_dround23_l(as_FloatRegister(i-4), F54, F52, F50);
4298        __ aes_dround01_l(as_FloatRegister(i-6), F54, F52, F48);
4299        __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62);
4300        __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60);
4301      }
4302    }
4303
4304    __ movxtod(L0,F54);
4305    __ movxtod(L1,F52);
4306    __ fxor(FloatRegisterImpl::D, F54, F48, F48);
4307    __ fxor(FloatRegisterImpl::D, F52, F50, F50);
4308
4309    __ movxtod(G4,F56);
4310    __ movxtod(G5,F58);
4311    __ mov(L4,L0);
4312    __ mov(L5,L1);
4313    __ fxor(FloatRegisterImpl::D, F56, F60, F60);
4314    __ fxor(FloatRegisterImpl::D, F58, F62, F62);
4315
4316    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
4317    __ andcc(to, 7, G1);
4318    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks192);
4319    __ delayed()->edge8n(to, G0, G2);
4320
4321    // aligned case: store output into the destination array
4322    __ stf(FloatRegisterImpl::D, F48, to, 0);
4323    __ stf(FloatRegisterImpl::D, F50, to, 8);
4324    __ stf(FloatRegisterImpl::D, F60, to, 16);
4325    __ stf(FloatRegisterImpl::D, F62, to, 24);
4326    __ ba_short(L_check_decrypt_loop_end192);
4327
4328    __ BIND(L_store_misaligned_output_next2_blocks192);
4329    __ mov(8, G4);
4330    __ sub(G4, G1, G4);
4331    __ alignaddr(G4, G0, G4);
4332    __ faligndata(F48, F50, F56); // F56 can be clobbered
4333    __ faligndata(F50, F60, F50);
4334    __ faligndata(F60, F62, F60);
4335    __ faligndata(F62, F48, F48);
4336    __ mov(to, G1);
4337    __ and3(to, -8, to);
4338    __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY);
4339    __ stf(FloatRegisterImpl::D, F56, to, 8);
4340    __ stf(FloatRegisterImpl::D, F50, to, 16);
4341    __ stf(FloatRegisterImpl::D, F60, to, 24);
4342    __ add(to, 32, to);
4343    __ orn(G0, G2, G2);
4344    __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY);
4345    __ mov(G1, to);
4346
4347    __ BIND(L_check_decrypt_loop_end192);
4348    __ add(from, 32, from);
4349    __ add(to, 32, to);
4350    __ subcc(len_reg, 32, len_reg);
4351    __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks192);
4352    __ delayed()->nop();
4353    __ ba_short(L_cbcdec_end);
4354
4355    __ align(OptoLoopAlignment);
4356    __ BIND(L_dec_next2_blocks256);
4357    __ nop();
4358
4359    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
4360    __ andcc(from, 7, G0);
4361    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks256);
4362    __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
4363
4364    // aligned case: load input into G4, G5, L4 and L5
4365    __ ldx(from,0,G4);
4366    __ ldx(from,8,G5);
4367    __ ldx(from,16,L4);
4368    __ ldx(from,24,L5);
4369    __ ba_short(L_transform_next2_blocks256);
4370
4371    __ BIND(L_load_misaligned_next2_blocks256);
4372    __ alignaddr(from, G0, from);
4373    // F0, F2, F4, F60, F62 can be clobbered
4374    __ ldf(FloatRegisterImpl::D, from, 0, F0);
4375    __ ldf(FloatRegisterImpl::D, from, 8, F2);
4376    __ ldf(FloatRegisterImpl::D, from, 16, F60);
4377    __ ldf(FloatRegisterImpl::D, from, 24, F62);
4378    __ ldf(FloatRegisterImpl::D, from, 32, F4);
4379    __ faligndata(F0, F2, F0);
4380    __ faligndata(F2, F60, F2);
4381    __ faligndata(F60, F62, F60);
4382    __ faligndata(F62, F4, F62);
4383    __ movdtox(F0, G4);
4384    __ movdtox(F2, G5);
4385    __ movdtox(F60, L4);
4386    __ movdtox(F62, L5);
4387    __ mov(G1, from);
4388
4389    __ BIND(L_transform_next2_blocks256);
4390    // F0:F2 used for first 16-bytes
4391    __ xor3(L2,G4,G1);
4392    __ movxtod(G1,F0);
4393    __ xor3(L3,G5,G1);
4394    __ movxtod(G1,F2);
4395
4396    // F60:F62 used for next 16-bytes
4397    __ xor3(L2,L4,G1);
4398    __ movxtod(G1,F60);
4399    __ xor3(L3,L5,G1);
4400    __ movxtod(G1,F62);
4401
4402    __ aes_dround23(F54, F0, F2, F4);
4403    __ aes_dround01(F52, F0, F2, F6);
4404    __ aes_dround23(F54, F60, F62, F58);
4405    __ aes_dround01(F52, F60, F62, F56);
4406    __ aes_dround23(F50, F6, F4, F2);
4407    __ aes_dround01(F48, F6, F4, F0);
4408    __ aes_dround23(F50, F56, F58, F62);
4409    __ aes_dround01(F48, F56, F58, F60);
4410    // save F48:F54 in temp registers
4411    __ movdtox(F54,G2);
4412    __ movdtox(F52,G3);
4413    __ movdtox(F50,G6);
4414    __ movdtox(F48,G1);
4415    for ( int i = 46;  i >= 14; i -= 8 ) {
4416      __ aes_dround23(as_FloatRegister(i), F0, F2, F4);
4417      __ aes_dround01(as_FloatRegister(i-2), F0, F2, F6);
4418      __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
4419      __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
4420      __ aes_dround23(as_FloatRegister(i-4), F6, F4, F2);
4421      __ aes_dround01(as_FloatRegister(i-6), F6, F4, F0);
4422      __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
4423      __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
4424    }
4425    // init F48:F54 with F0:F6 values (original key)
4426    __ ldf(FloatRegisterImpl::D, original_key, 0, F48);
4427    __ ldf(FloatRegisterImpl::D, original_key, 8, F50);
4428    __ ldf(FloatRegisterImpl::D, original_key, 16, F52);
4429    __ ldf(FloatRegisterImpl::D, original_key, 24, F54);
4430    __ aes_dround23(F54, F0, F2, F4);
4431    __ aes_dround01(F52, F0, F2, F6);
4432    __ aes_dround23(F54, F60, F62, F58);
4433    __ aes_dround01(F52, F60, F62, F56);
4434    __ aes_dround23_l(F50, F6, F4, F2);
4435    __ aes_dround01_l(F48, F6, F4, F0);
4436    __ aes_dround23_l(F50, F56, F58, F62);
4437    __ aes_dround01_l(F48, F56, F58, F60);
4438    // re-init F48:F54 with their original values
4439    __ movxtod(G2,F54);
4440    __ movxtod(G3,F52);
4441    __ movxtod(G6,F50);
4442    __ movxtod(G1,F48);
4443
4444    __ movxtod(L0,F6);
4445    __ movxtod(L1,F4);
4446    __ fxor(FloatRegisterImpl::D, F6, F0, F0);
4447    __ fxor(FloatRegisterImpl::D, F4, F2, F2);
4448
4449    __ movxtod(G4,F56);
4450    __ movxtod(G5,F58);
4451    __ mov(L4,L0);
4452    __ mov(L5,L1);
4453    __ fxor(FloatRegisterImpl::D, F56, F60, F60);
4454    __ fxor(FloatRegisterImpl::D, F58, F62, F62);
4455
4456    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
4457    __ andcc(to, 7, G1);
4458    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks256);
4459    __ delayed()->edge8n(to, G0, G2);
4460
4461    // aligned case: store output into the destination array
4462    __ stf(FloatRegisterImpl::D, F0, to, 0);
4463    __ stf(FloatRegisterImpl::D, F2, to, 8);
4464    __ stf(FloatRegisterImpl::D, F60, to, 16);
4465    __ stf(FloatRegisterImpl::D, F62, to, 24);
4466    __ ba_short(L_check_decrypt_loop_end256);
4467
4468    __ BIND(L_store_misaligned_output_next2_blocks256);
4469    __ mov(8, G4);
4470    __ sub(G4, G1, G4);
4471    __ alignaddr(G4, G0, G4);
4472    __ faligndata(F0, F2, F56); // F56 can be clobbered
4473    __ faligndata(F2, F60, F2);
4474    __ faligndata(F60, F62, F60);
4475    __ faligndata(F62, F0, F0);
4476    __ mov(to, G1);
4477    __ and3(to, -8, to);
4478    __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY);
4479    __ stf(FloatRegisterImpl::D, F56, to, 8);
4480    __ stf(FloatRegisterImpl::D, F2, to, 16);
4481    __ stf(FloatRegisterImpl::D, F60, to, 24);
4482    __ add(to, 32, to);
4483    __ orn(G0, G2, G2);
4484    __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY);
4485    __ mov(G1, to);
4486
4487    __ BIND(L_check_decrypt_loop_end256);
4488    __ add(from, 32, from);
4489    __ add(to, 32, to);
4490    __ subcc(len_reg, 32, len_reg);
4491    __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks256);
4492    __ delayed()->nop();
4493
4494    __ BIND(L_cbcdec_end);
4495    // re-init intial vector for next block, 8-byte alignment is guaranteed
4496    __ stx(L0, rvec, 0);
4497    __ stx(L1, rvec, 8);
4498    __ mov(L7, I0);
4499    __ ret();
4500    __ delayed()->restore();
4501
4502    return start;
4503  }
4504
4505  address generate_sha1_implCompress(bool multi_block, const char *name) {
4506    __ align(CodeEntryAlignment);
4507    StubCodeMark mark(this, "StubRoutines", name);
4508    address start = __ pc();
4509
4510    Label L_sha1_loop, L_sha1_unaligned_input, L_sha1_unaligned_input_loop;
4511    int i;
4512
4513    Register buf   = O0; // byte[] source+offset
4514    Register state = O1; // int[]  SHA.state
4515    Register ofs   = O2; // int    offset
4516    Register limit = O3; // int    limit
4517
4518    // load state into F0-F4
4519    for (i = 0; i < 5; i++) {
4520      __ ldf(FloatRegisterImpl::S, state, i*4, as_FloatRegister(i));
4521    }
4522
4523    __ andcc(buf, 7, G0);
4524    __ br(Assembler::notZero, false, Assembler::pn, L_sha1_unaligned_input);
4525    __ delayed()->nop();
4526
4527    __ BIND(L_sha1_loop);
4528    // load buf into F8-F22
4529    for (i = 0; i < 8; i++) {
4530      __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8));
4531    }
4532    __ sha1();
4533    if (multi_block) {
4534      __ add(ofs, 64, ofs);
4535      __ add(buf, 64, buf);
4536      __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha1_loop);
4537      __ mov(ofs, O0); // to be returned
4538    }
4539
4540    // store F0-F4 into state and return
4541    for (i = 0; i < 4; i++) {
4542      __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4);
4543    }
4544    __ retl();
4545    __ delayed()->stf(FloatRegisterImpl::S, F4, state, 0x10);
4546
4547    __ BIND(L_sha1_unaligned_input);
4548    __ alignaddr(buf, G0, buf);
4549
4550    __ BIND(L_sha1_unaligned_input_loop);
4551    // load buf into F8-F22
4552    for (i = 0; i < 9; i++) {
4553      __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8));
4554    }
4555    for (i = 0; i < 8; i++) {
4556      __ faligndata(as_FloatRegister(i*2 + 8), as_FloatRegister(i*2 + 10), as_FloatRegister(i*2 + 8));
4557    }
4558    __ sha1();
4559    if (multi_block) {
4560      __ add(ofs, 64, ofs);
4561      __ add(buf, 64, buf);
4562      __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha1_unaligned_input_loop);
4563      __ mov(ofs, O0); // to be returned
4564    }
4565
4566    // store F0-F4 into state and return
4567    for (i = 0; i < 4; i++) {
4568      __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4);
4569    }
4570    __ retl();
4571    __ delayed()->stf(FloatRegisterImpl::S, F4, state, 0x10);
4572
4573    return start;
4574  }
4575
4576  address generate_sha256_implCompress(bool multi_block, const char *name) {
4577    __ align(CodeEntryAlignment);
4578    StubCodeMark mark(this, "StubRoutines", name);
4579    address start = __ pc();
4580
4581    Label L_sha256_loop, L_sha256_unaligned_input, L_sha256_unaligned_input_loop;
4582    int i;
4583
4584    Register buf   = O0; // byte[] source+offset
4585    Register state = O1; // int[]  SHA2.state
4586    Register ofs   = O2; // int    offset
4587    Register limit = O3; // int    limit
4588
4589    // load state into F0-F7
4590    for (i = 0; i < 8; i++) {
4591      __ ldf(FloatRegisterImpl::S, state, i*4, as_FloatRegister(i));
4592    }
4593
4594    __ andcc(buf, 7, G0);
4595    __ br(Assembler::notZero, false, Assembler::pn, L_sha256_unaligned_input);
4596    __ delayed()->nop();
4597
4598    __ BIND(L_sha256_loop);
4599    // load buf into F8-F22
4600    for (i = 0; i < 8; i++) {
4601      __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8));
4602    }
4603    __ sha256();
4604    if (multi_block) {
4605      __ add(ofs, 64, ofs);
4606      __ add(buf, 64, buf);
4607      __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha256_loop);
4608      __ mov(ofs, O0); // to be returned
4609    }
4610
4611    // store F0-F7 into state and return
4612    for (i = 0; i < 7; i++) {
4613      __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4);
4614    }
4615    __ retl();
4616    __ delayed()->stf(FloatRegisterImpl::S, F7, state, 0x1c);
4617
4618    __ BIND(L_sha256_unaligned_input);
4619    __ alignaddr(buf, G0, buf);
4620
4621    __ BIND(L_sha256_unaligned_input_loop);
4622    // load buf into F8-F22
4623    for (i = 0; i < 9; i++) {
4624      __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8));
4625    }
4626    for (i = 0; i < 8; i++) {
4627      __ faligndata(as_FloatRegister(i*2 + 8), as_FloatRegister(i*2 + 10), as_FloatRegister(i*2 + 8));
4628    }
4629    __ sha256();
4630    if (multi_block) {
4631      __ add(ofs, 64, ofs);
4632      __ add(buf, 64, buf);
4633      __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha256_unaligned_input_loop);
4634      __ mov(ofs, O0); // to be returned
4635    }
4636
4637    // store F0-F7 into state and return
4638    for (i = 0; i < 7; i++) {
4639      __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4);
4640    }
4641    __ retl();
4642    __ delayed()->stf(FloatRegisterImpl::S, F7, state, 0x1c);
4643
4644    return start;
4645  }
4646
4647  address generate_sha512_implCompress(bool multi_block, const char *name) {
4648    __ align(CodeEntryAlignment);
4649    StubCodeMark mark(this, "StubRoutines", name);
4650    address start = __ pc();
4651
4652    Label L_sha512_loop, L_sha512_unaligned_input, L_sha512_unaligned_input_loop;
4653    int i;
4654
4655    Register buf   = O0; // byte[] source+offset
4656    Register state = O1; // long[] SHA5.state
4657    Register ofs   = O2; // int    offset
4658    Register limit = O3; // int    limit
4659
4660    // load state into F0-F14
4661    for (i = 0; i < 8; i++) {
4662      __ ldf(FloatRegisterImpl::D, state, i*8, as_FloatRegister(i*2));
4663    }
4664
4665    __ andcc(buf, 7, G0);
4666    __ br(Assembler::notZero, false, Assembler::pn, L_sha512_unaligned_input);
4667    __ delayed()->nop();
4668
4669    __ BIND(L_sha512_loop);
4670    // load buf into F16-F46
4671    for (i = 0; i < 16; i++) {
4672      __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 16));
4673    }
4674    __ sha512();
4675    if (multi_block) {
4676      __ add(ofs, 128, ofs);
4677      __ add(buf, 128, buf);
4678      __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha512_loop);
4679      __ mov(ofs, O0); // to be returned
4680    }
4681
4682    // store F0-F14 into state and return
4683    for (i = 0; i < 7; i++) {
4684      __ stf(FloatRegisterImpl::D, as_FloatRegister(i*2), state, i*8);
4685    }
4686    __ retl();
4687    __ delayed()->stf(FloatRegisterImpl::D, F14, state, 0x38);
4688
4689    __ BIND(L_sha512_unaligned_input);
4690    __ alignaddr(buf, G0, buf);
4691
4692    __ BIND(L_sha512_unaligned_input_loop);
4693    // load buf into F16-F46
4694    for (i = 0; i < 17; i++) {
4695      __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 16));
4696    }
4697    for (i = 0; i < 16; i++) {
4698      __ faligndata(as_FloatRegister(i*2 + 16), as_FloatRegister(i*2 + 18), as_FloatRegister(i*2 + 16));
4699    }
4700    __ sha512();
4701    if (multi_block) {
4702      __ add(ofs, 128, ofs);
4703      __ add(buf, 128, buf);
4704      __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha512_unaligned_input_loop);
4705      __ mov(ofs, O0); // to be returned
4706    }
4707
4708    // store F0-F14 into state and return
4709    for (i = 0; i < 7; i++) {
4710      __ stf(FloatRegisterImpl::D, as_FloatRegister(i*2), state, i*8);
4711    }
4712    __ retl();
4713    __ delayed()->stf(FloatRegisterImpl::D, F14, state, 0x38);
4714
4715    return start;
4716  }
4717
4718  /* Single and multi-block ghash operations */
4719  address generate_ghash_processBlocks() {
4720      __ align(CodeEntryAlignment);
4721      Label L_ghash_loop, L_aligned, L_main;
4722      StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4723      address start = __ pc();
4724
4725      Register state = I0;
4726      Register subkeyH = I1;
4727      Register data = I2;
4728      Register len = I3;
4729
4730      __ save_frame(0);
4731
4732      __ ldx(state, 0, O0);
4733      __ ldx(state, 8, O1);
4734
4735      // Loop label for multiblock operations
4736      __ BIND(L_ghash_loop);
4737
4738      // Check if 'data' is unaligned
4739      __ andcc(data, 7, G1);
4740      __ br(Assembler::zero, false, Assembler::pt, L_aligned);
4741      __ delayed()->nop();
4742
4743      Register left_shift = L1;
4744      Register right_shift = L2;
4745      Register data_ptr = L3;
4746
4747      // Get left and right shift values in bits
4748      __ sll(G1, LogBitsPerByte, left_shift);
4749      __ mov(64, right_shift);
4750      __ sub(right_shift, left_shift, right_shift);
4751
4752      // Align to read 'data'
4753      __ sub(data, G1, data_ptr);
4754
4755      // Load first 8 bytes of 'data'
4756      __ ldx(data_ptr, 0, O4);
4757      __ sllx(O4, left_shift, O4);
4758      __ ldx(data_ptr, 8, O5);
4759      __ srlx(O5, right_shift, G4);
4760      __ bset(G4, O4);
4761
4762      // Load second 8 bytes of 'data'
4763      __ sllx(O5, left_shift, O5);
4764      __ ldx(data_ptr, 16, G4);
4765      __ srlx(G4, right_shift, G4);
4766      __ ba(L_main);
4767      __ delayed()->bset(G4, O5);
4768
4769      // If 'data' is aligned, load normally
4770      __ BIND(L_aligned);
4771      __ ldx(data, 0, O4);
4772      __ ldx(data, 8, O5);
4773
4774      __ BIND(L_main);
4775      __ ldx(subkeyH, 0, O2);
4776      __ ldx(subkeyH, 8, O3);
4777
4778      __ xor3(O0, O4, O0);
4779      __ xor3(O1, O5, O1);
4780
4781      __ xmulxhi(O0, O3, G3);
4782      __ xmulx(O0, O2, O5);
4783      __ xmulxhi(O1, O2, G4);
4784      __ xmulxhi(O1, O3, G5);
4785      __ xmulx(O0, O3, G1);
4786      __ xmulx(O1, O3, G2);
4787      __ xmulx(O1, O2, O3);
4788      __ xmulxhi(O0, O2, O4);
4789
4790      __ mov(0xE1, O0);
4791      __ sllx(O0, 56, O0);
4792
4793      __ xor3(O5, G3, O5);
4794      __ xor3(O5, G4, O5);
4795      __ xor3(G5, G1, G1);
4796      __ xor3(G1, O3, G1);
4797      __ srlx(G2, 63, O1);
4798      __ srlx(G1, 63, G3);
4799      __ sllx(G2, 63, O3);
4800      __ sllx(G2, 58, O2);
4801      __ xor3(O3, O2, O2);
4802
4803      __ sllx(G1, 1, G1);
4804      __ or3(G1, O1, G1);
4805
4806      __ xor3(G1, O2, G1);
4807
4808      __ sllx(G2, 1, G2);
4809
4810      __ xmulxhi(G1, O0, O1);
4811      __ xmulx(G1, O0, O2);
4812      __ xmulxhi(G2, O0, O3);
4813      __ xmulx(G2, O0, G1);
4814
4815      __ xor3(O4, O1, O4);
4816      __ xor3(O5, O2, O5);
4817      __ xor3(O5, O3, O5);
4818
4819      __ sllx(O4, 1, O2);
4820      __ srlx(O5, 63, O3);
4821
4822      __ or3(O2, O3, O0);
4823
4824      __ sllx(O5, 1, O1);
4825      __ srlx(G1, 63, O2);
4826      __ or3(O1, O2, O1);
4827      __ xor3(O1, G3, O1);
4828
4829      __ deccc(len);
4830      __ br(Assembler::notZero, true, Assembler::pt, L_ghash_loop);
4831      __ delayed()->add(data, 16, data);
4832
4833      __ stx(O0, I0, 0);
4834      __ stx(O1, I0, 8);
4835
4836      __ ret();
4837      __ delayed()->restore();
4838
4839      return start;
4840  }
4841
4842  /**
4843   *  Arguments:
4844   *
4845   * Inputs:
4846   *   O0   - int   crc
4847   *   O1   - byte* buf
4848   *   O2   - int   len
4849   *   O3   - int*  table
4850   *
4851   * Output:
4852   *   O0   - int crc result
4853   */
4854  address generate_updateBytesCRC32C() {
4855    assert(UseCRC32CIntrinsics, "need CRC32C instruction");
4856
4857    __ align(CodeEntryAlignment);
4858    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
4859    address start = __ pc();
4860
4861    const Register crc   = O0;  // crc
4862    const Register buf   = O1;  // source java byte array address
4863    const Register len   = O2;  // number of bytes
4864    const Register table = O3;  // byteTable
4865
4866    __ kernel_crc32c(crc, buf, len, table);
4867
4868    __ retl();
4869    __ delayed()->nop();
4870
4871    return start;
4872  }
4873
4874#define ADLER32_NUM_TEMPS 16
4875
4876  /**
4877   *  Arguments:
4878   *
4879   * Inputs:
4880   *   O0   - int   adler
4881   *   O1   - byte* buff
4882   *   O2   - int   len
4883   *
4884   * Output:
4885   *   O0   - int adler result
4886   */
4887  address generate_updateBytesAdler32() {
4888    __ align(CodeEntryAlignment);
4889    StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
4890    address start = __ pc();
4891
4892    Label L_cleanup_loop, L_cleanup_loop_check;
4893    Label L_main_loop_check, L_main_loop, L_inner_loop, L_inner_loop_check;
4894    Label L_nmax_check_done;
4895
4896    // Aliases
4897    Register s1     = O0;
4898    Register s2     = O3;
4899    Register buff   = O1;
4900    Register len    = O2;
4901    Register temp[ADLER32_NUM_TEMPS] = {L0, L1, L2, L3, L4, L5, L6, L7, I0, I1, I2, I3, I4, I5, G3, I7};
4902
4903    // Max number of bytes we can process before having to take the mod
4904    // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
4905    unsigned long NMAX = 0x15B0;
4906
4907    // Zero-out the upper bits of len
4908    __ clruwu(len);
4909
4910    // Create the mask 0xFFFF
4911    __ set64(0x00FFFF, O4, O5); // O5 is the temp register
4912
4913    // s1 is initialized to the lower 16 bits of adler
4914    // s2 is initialized to the upper 16 bits of adler
4915    __ srlx(O0, 16, O5); // adler >> 16
4916    __ and3(O0, O4, s1); // s1  = (adler & 0xFFFF)
4917    __ and3(O5, O4, s2); // s2  = ((adler >> 16) & 0xFFFF)
4918
4919    // The pipelined loop needs at least 16 elements for 1 iteration
4920    // It does check this, but it is more effective to skip to the cleanup loop
4921    // Setup the constant for cutoff checking
4922    __ mov(15, O4);
4923
4924    // Check if we are above the cutoff, if not go to the cleanup loop immediately
4925    __ cmp_and_br_short(len, O4, Assembler::lessEqualUnsigned, Assembler::pt, L_cleanup_loop_check);
4926
4927    // Free up some registers for our use
4928    for (int i = 0; i < ADLER32_NUM_TEMPS; i++) {
4929      __ movxtod(temp[i], as_FloatRegister(2*i));
4930    }
4931
4932    // Loop maintenance stuff is done at the end of the loop, so skip to there
4933    __ ba_short(L_main_loop_check);
4934
4935    __ BIND(L_main_loop);
4936
4937    // Prologue for inner loop
4938    __ ldub(buff, 0, L0);
4939    __ dec(O5);
4940
4941    for (int i = 1; i < 8; i++) {
4942      __ ldub(buff, i, temp[i]);
4943    }
4944
4945    __ inc(buff, 8);
4946
4947    // Inner loop processes 16 elements at a time, might never execute if only 16 elements
4948    // to be processed by the outter loop
4949    __ ba_short(L_inner_loop_check);
4950
4951    __ BIND(L_inner_loop);
4952
4953    for (int i = 0; i < 8; i++) {
4954      __ ldub(buff, (2*i), temp[(8+(2*i)) % ADLER32_NUM_TEMPS]);
4955      __ add(s1, temp[i], s1);
4956      __ ldub(buff, (2*i)+1, temp[(8+(2*i)+1) % ADLER32_NUM_TEMPS]);
4957      __ add(s2, s1, s2);
4958    }
4959
4960    // Original temp 0-7 used and new loads to temp 0-7 issued
4961    // temp 8-15 ready to be consumed
4962    __ add(s1, I0, s1);
4963    __ dec(O5);
4964    __ add(s2, s1, s2);
4965    __ add(s1, I1, s1);
4966    __ inc(buff, 16);
4967    __ add(s2, s1, s2);
4968
4969    for (int i = 0; i < 6; i++) {
4970      __ add(s1, temp[10+i], s1);
4971      __ add(s2, s1, s2);
4972    }
4973
4974    __ BIND(L_inner_loop_check);
4975    __ nop();
4976    __ cmp_and_br_short(O5, 0, Assembler::notEqual, Assembler::pt, L_inner_loop);
4977
4978    // Epilogue
4979    for (int i = 0; i < 4; i++) {
4980      __ ldub(buff, (2*i), temp[8+(2*i)]);
4981      __ add(s1, temp[i], s1);
4982      __ ldub(buff, (2*i)+1, temp[8+(2*i)+1]);
4983      __ add(s2, s1, s2);
4984    }
4985
4986    __ add(s1, temp[4], s1);
4987    __ inc(buff, 8);
4988
4989    for (int i = 0; i < 11; i++) {
4990      __ add(s2, s1, s2);
4991      __ add(s1, temp[5+i], s1);
4992    }
4993
4994    __ add(s2, s1, s2);
4995
4996    // Take the mod for s1 and s2
4997    __ set64(0xFFF1, L0, L1);
4998    __ udivx(s1, L0, L1);
4999    __ udivx(s2, L0, L2);
5000    __ mulx(L0, L1, L1);
5001    __ mulx(L0, L2, L2);
5002    __ sub(s1, L1, s1);
5003    __ sub(s2, L2, s2);
5004
5005    // Make sure there is something left to process
5006    __ BIND(L_main_loop_check);
5007    __ set64(NMAX, L0, L1);
5008    // k = len < NMAX ? len : NMAX
5009    __ cmp_and_br_short(len, L0, Assembler::greaterEqualUnsigned, Assembler::pt, L_nmax_check_done);
5010    __ andn(len, 0x0F, L0); // only loop a multiple of 16 times
5011    __ BIND(L_nmax_check_done);
5012    __ mov(L0, O5);
5013    __ sub(len, L0, len); // len -= k
5014
5015    __ srlx(O5, 4, O5); // multiplies of 16
5016    __ cmp_and_br_short(O5, 0, Assembler::notEqual, Assembler::pt, L_main_loop);
5017
5018    // Restore anything we used, take the mod one last time, combine and return
5019    // Restore any registers we saved
5020    for (int i = 0; i < ADLER32_NUM_TEMPS; i++) {
5021      __ movdtox(as_FloatRegister(2*i), temp[i]);
5022    }
5023
5024    // There might be nothing left to process
5025    __ ba_short(L_cleanup_loop_check);
5026
5027    __ BIND(L_cleanup_loop);
5028    __ ldub(buff, 0, O4); // load single byte form buffer
5029    __ inc(buff); // buff++
5030    __ add(s1, O4, s1); // s1 += *buff++;
5031    __ dec(len); // len--
5032    __ add(s1, s2, s2); // s2 += s1;
5033    __ BIND(L_cleanup_loop_check);
5034    __ nop();
5035    __ cmp_and_br_short(len, 0, Assembler::notEqual, Assembler::pt, L_cleanup_loop);
5036
5037    // Take the mod one last time
5038    __ set64(0xFFF1, O1, O2);
5039    __ udivx(s1, O1, O2);
5040    __ udivx(s2, O1, O5);
5041    __ mulx(O1, O2, O2);
5042    __ mulx(O1, O5, O5);
5043    __ sub(s1, O2, s1);
5044    __ sub(s2, O5, s2);
5045
5046    // Combine lower bits and higher bits
5047    __ sllx(s2, 16, s2); // s2 = s2 << 16
5048    __ or3(s1, s2, s1);  // adler = s2 | s1
5049    // Final return value is in O0
5050    __ retl();
5051    __ delayed()->nop();
5052
5053    return start;
5054  }
5055
5056/**
5057   *  Arguments:
5058   *
5059   * Inputs:
5060   *   O0   - int   crc
5061   *   O1   - byte* buf
5062   *   O2   - int   len
5063   *   O3   - int*  table
5064   *
5065   * Output:
5066   *   O0   - int crc result
5067   */
5068  address generate_updateBytesCRC32() {
5069    assert(UseCRC32Intrinsics, "need VIS3 instructions");
5070
5071    __ align(CodeEntryAlignment);
5072    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
5073    address start = __ pc();
5074
5075    const Register crc   = O0; // crc
5076    const Register buf   = O1; // source java byte array address
5077    const Register len   = O2; // length
5078    const Register table = O3; // crc_table address (reuse register)
5079
5080    __ kernel_crc32(crc, buf, len, table);
5081
5082    __ retl();
5083    __ delayed()->nop();
5084
5085    return start;
5086  }
5087
5088  void generate_initial() {
5089    // Generates all stubs and initializes the entry points
5090
5091    //------------------------------------------------------------------------------------------------------------------------
5092    // entry points that exist in all platforms
5093    // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller than
5094    //       the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp.
5095    StubRoutines::_forward_exception_entry                 = generate_forward_exception();
5096
5097    StubRoutines::_call_stub_entry                         = generate_call_stub(StubRoutines::_call_stub_return_address);
5098    StubRoutines::_catch_exception_entry                   = generate_catch_exception();
5099
5100    //------------------------------------------------------------------------------------------------------------------------
5101    // entry points that are platform specific
5102    StubRoutines::Sparc::_test_stop_entry                  = generate_test_stop();
5103
5104    StubRoutines::Sparc::_stop_subroutine_entry            = generate_stop_subroutine();
5105    StubRoutines::Sparc::_flush_callers_register_windows_entry = generate_flush_callers_register_windows();
5106
5107#if !defined(COMPILER2) && !defined(_LP64)
5108    StubRoutines::_atomic_xchg_entry         = generate_atomic_xchg();
5109    StubRoutines::_atomic_cmpxchg_entry      = generate_atomic_cmpxchg();
5110    StubRoutines::_atomic_add_entry          = generate_atomic_add();
5111    StubRoutines::_atomic_xchg_ptr_entry     = StubRoutines::_atomic_xchg_entry;
5112    StubRoutines::_atomic_cmpxchg_ptr_entry  = StubRoutines::_atomic_cmpxchg_entry;
5113    StubRoutines::_atomic_cmpxchg_byte_entry = ShouldNotCallThisStub();
5114    StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long();
5115    StubRoutines::_atomic_add_ptr_entry      = StubRoutines::_atomic_add_entry;
5116#endif  // COMPILER2 !=> _LP64
5117
5118    // Build this early so it's available for the interpreter.
5119    StubRoutines::_throw_StackOverflowError_entry =
5120            generate_throw_exception("StackOverflowError throw_exception",
5121            CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
5122    StubRoutines::_throw_delayed_StackOverflowError_entry =
5123            generate_throw_exception("delayed StackOverflowError throw_exception",
5124            CAST_FROM_FN_PTR(address, SharedRuntime::throw_delayed_StackOverflowError));
5125
5126    if (UseCRC32Intrinsics) {
5127      // set table address before stub generation which use it
5128      StubRoutines::_crc_table_adr = (address)StubRoutines::Sparc::_crc_table;
5129      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5130    }
5131
5132    if (UseCRC32CIntrinsics) {
5133      // set table address before stub generation which use it
5134      StubRoutines::_crc32c_table_addr = (address)StubRoutines::Sparc::_crc32c_table;
5135      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5136    }
5137  }
5138
5139
5140  void generate_all() {
5141    // Generates all stubs and initializes the entry points
5142
5143    // Generate partial_subtype_check first here since its code depends on
5144    // UseZeroBaseCompressedOops which is defined after heap initialization.
5145    StubRoutines::Sparc::_partial_subtype_check                = generate_partial_subtype_check();
5146    // These entry points require SharedInfo::stack0 to be set up in non-core builds
5147    StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError));
5148    StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError));
5149    StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call));
5150
5151    // support for verify_oop (must happen after universe_init)
5152    StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop_subroutine();
5153
5154    // arraycopy stubs used by compilers
5155    generate_arraycopy_stubs();
5156
5157    // Don't initialize the platform math functions since sparc
5158    // doesn't have intrinsics for these operations.
5159
5160    // Safefetch stubs.
5161    generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
5162                                                       &StubRoutines::_safefetch32_fault_pc,
5163                                                       &StubRoutines::_safefetch32_continuation_pc);
5164    generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
5165                                                       &StubRoutines::_safefetchN_fault_pc,
5166                                                       &StubRoutines::_safefetchN_continuation_pc);
5167
5168    // generate AES intrinsics code
5169    if (UseAESIntrinsics) {
5170      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
5171      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
5172      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
5173      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
5174    }
5175    // generate GHASH intrinsics code
5176    if (UseGHASHIntrinsics) {
5177      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
5178    }
5179
5180    // generate SHA1/SHA256/SHA512 intrinsics code
5181    if (UseSHA1Intrinsics) {
5182      StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
5183      StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
5184    }
5185    if (UseSHA256Intrinsics) {
5186      StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
5187      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
5188    }
5189    if (UseSHA512Intrinsics) {
5190      StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
5191      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
5192    }
5193    // generate Adler32 intrinsics code
5194    if (UseAdler32Intrinsics) {
5195      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
5196    }
5197  }
5198
5199
5200 public:
5201  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
5202    // replace the standard masm with a special one:
5203    _masm = new MacroAssembler(code);
5204
5205    _stub_count = !all ? 0x100 : 0x200;
5206    if (all) {
5207      generate_all();
5208    } else {
5209      generate_initial();
5210    }
5211
5212    // make sure this stub is available for all local calls
5213    if (_atomic_add_stub.is_unbound()) {
5214      // generate a second time, if necessary
5215      (void) generate_atomic_add();
5216    }
5217  }
5218
5219
5220 private:
5221  int _stub_count;
5222  void stub_prolog(StubCodeDesc* cdesc) {
5223    # ifdef ASSERT
5224      // put extra information in the stub code, to make it more readable
5225#ifdef _LP64
5226// Write the high part of the address
5227// [RGV] Check if there is a dependency on the size of this prolog
5228      __ emit_data((intptr_t)cdesc >> 32,    relocInfo::none);
5229#endif
5230      __ emit_data((intptr_t)cdesc,    relocInfo::none);
5231      __ emit_data(++_stub_count, relocInfo::none);
5232    # endif
5233    align(true);
5234  }
5235
5236  void align(bool at_header = false) {
5237    // %%%%% move this constant somewhere else
5238    // UltraSPARC cache line size is 8 instructions:
5239    const unsigned int icache_line_size = 32;
5240    const unsigned int icache_half_line_size = 16;
5241
5242    if (at_header) {
5243      while ((intptr_t)(__ pc()) % icache_line_size != 0) {
5244        __ emit_data(0, relocInfo::none);
5245      }
5246    } else {
5247      while ((intptr_t)(__ pc()) % icache_half_line_size != 0) {
5248        __ nop();
5249      }
5250    }
5251  }
5252
5253}; // end class declaration
5254
5255void StubGenerator_generate(CodeBuffer* code, bool all) {
5256  StubGenerator g(code, all);
5257}
5258