1/*
2 * Copyright (c) 1997, 2017, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25#include "precompiled.hpp"
26#include "asm/macroAssembler.inline.hpp"
27#include "interpreter/interpreter.hpp"
28#include "nativeInst_sparc.hpp"
29#include "oops/instanceOop.hpp"
30#include "oops/method.hpp"
31#include "oops/objArrayKlass.hpp"
32#include "oops/oop.inline.hpp"
33#include "prims/methodHandles.hpp"
34#include "runtime/frame.inline.hpp"
35#include "runtime/handles.inline.hpp"
36#include "runtime/sharedRuntime.hpp"
37#include "runtime/stubCodeGenerator.hpp"
38#include "runtime/stubRoutines.hpp"
39#include "runtime/thread.inline.hpp"
40#ifdef COMPILER2
41#include "opto/runtime.hpp"
42#endif
43
44// Declaration and definition of StubGenerator (no .hpp file).
45// For a more detailed description of the stub routine structure
46// see the comment in stubRoutines.hpp.
47
48#define __ _masm->
49
50#ifdef PRODUCT
51#define BLOCK_COMMENT(str) /* nothing */
52#else
53#define BLOCK_COMMENT(str) __ block_comment(str)
54#endif
55
56#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
57
58// Note:  The register L7 is used as L7_thread_cache, and may not be used
59//        any other way within this module.
60
61
62static const Register& Lstub_temp = L2;
63
64// -------------------------------------------------------------------------------------------------------------------------
65// Stub Code definitions
66
67class StubGenerator: public StubCodeGenerator {
68 private:
69
70#ifdef PRODUCT
71#define inc_counter_np(a,b,c)
72#else
73#define inc_counter_np(counter, t1, t2) \
74  BLOCK_COMMENT("inc_counter " #counter); \
75  __ inc_counter(&counter, t1, t2);
76#endif
77
78  //----------------------------------------------------------------------------------------------------
79  // Call stubs are used to call Java from C
80
81  address generate_call_stub(address& return_pc) {
82    StubCodeMark mark(this, "StubRoutines", "call_stub");
83    address start = __ pc();
84
85    // Incoming arguments:
86    //
87    // o0         : call wrapper address
88    // o1         : result (address)
89    // o2         : result type
90    // o3         : method
91    // o4         : (interpreter) entry point
92    // o5         : parameters (address)
93    // [sp + 0x5c]: parameter size (in words)
94    // [sp + 0x60]: thread
95    //
96    // +---------------+ <--- sp + 0
97    // |               |
98    // . reg save area .
99    // |               |
100    // +---------------+ <--- sp + 0x40
101    // |               |
102    // . extra 7 slots .
103    // |               |
104    // +---------------+ <--- sp + 0x5c
105    // |  param. size  |
106    // +---------------+ <--- sp + 0x60
107    // |    thread     |
108    // +---------------+
109    // |               |
110
111    // note: if the link argument position changes, adjust
112    //       the code in frame::entry_frame_call_wrapper()
113
114    const Argument link           = Argument(0, false); // used only for GC
115    const Argument result         = Argument(1, false);
116    const Argument result_type    = Argument(2, false);
117    const Argument method         = Argument(3, false);
118    const Argument entry_point    = Argument(4, false);
119    const Argument parameters     = Argument(5, false);
120    const Argument parameter_size = Argument(6, false);
121    const Argument thread         = Argument(7, false);
122
123    // setup thread register
124    __ ld_ptr(thread.as_address(), G2_thread);
125    __ reinit_heapbase();
126
127#ifdef ASSERT
128    // make sure we have no pending exceptions
129    { const Register t = G3_scratch;
130      Label L;
131      __ ld_ptr(G2_thread, in_bytes(Thread::pending_exception_offset()), t);
132      __ br_null_short(t, Assembler::pt, L);
133      __ stop("StubRoutines::call_stub: entered with pending exception");
134      __ bind(L);
135    }
136#endif
137
138    // create activation frame & allocate space for parameters
139    { const Register t = G3_scratch;
140      __ ld_ptr(parameter_size.as_address(), t);                // get parameter size (in words)
141      __ add(t, frame::memory_parameter_word_sp_offset, t);     // add space for save area (in words)
142      __ round_to(t, WordsPerLong);                             // make sure it is multiple of 2 (in words)
143      __ sll(t, Interpreter::logStackElementSize, t);           // compute number of bytes
144      __ neg(t);                                                // negate so it can be used with save
145      __ save(SP, t, SP);                                       // setup new frame
146    }
147
148    // +---------------+ <--- sp + 0
149    // |               |
150    // . reg save area .
151    // |               |
152    // +---------------+ <--- sp + 0x40
153    // |               |
154    // . extra 7 slots .
155    // |               |
156    // +---------------+ <--- sp + 0x5c
    // |  empty slot   |      (only if parameter size is odd)
158    // +---------------+
159    // |               |
160    // .  parameters   .
161    // |               |
162    // +---------------+ <--- fp + 0
163    // |               |
164    // . reg save area .
165    // |               |
166    // +---------------+ <--- fp + 0x40
167    // |               |
168    // . extra 7 slots .
169    // |               |
170    // +---------------+ <--- fp + 0x5c
171    // |  param. size  |
172    // +---------------+ <--- fp + 0x60
173    // |    thread     |
174    // +---------------+
175    // |               |
176
177    // pass parameters if any
178    BLOCK_COMMENT("pass parameters if any");
179    { const Register src = parameters.as_in().as_register();
180      const Register dst = Lentry_args;
181      const Register tmp = G3_scratch;
182      const Register cnt = G4_scratch;
183
184      // test if any parameters & setup of Lentry_args
185      Label exit;
186      __ ld_ptr(parameter_size.as_in().as_address(), cnt);      // parameter counter
187      __ add( FP, STACK_BIAS, dst );
188      __ cmp_zero_and_br(Assembler::zero, cnt, exit);
189      __ delayed()->sub(dst, BytesPerWord, dst);                 // setup Lentry_args
190
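      // Parameters are read from the C argument array in ascending address order
      // and stored into the interpreter's argument area, growing down from FP.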
191      // copy parameters if any
192      Label loop;
193      __ BIND(loop);
194      // Store parameter value
195      __ ld_ptr(src, 0, tmp);
196      __ add(src, BytesPerWord, src);
197      __ st_ptr(tmp, dst, 0);
198      __ deccc(cnt);
199      __ br(Assembler::greater, false, Assembler::pt, loop);
200      __ delayed()->sub(dst, Interpreter::stackElementSize, dst);
201
202      // done
203      __ BIND(exit);
204    }
205
206    // setup parameters, method & call Java function
207#ifdef ASSERT
    // layout_activation_impl checks its notion of saved SP against
    // this register, so if this changes, update it as well.
210    const Register saved_SP = Lscratch;
211    __ mov(SP, saved_SP);                               // keep track of SP before call
212#endif
213
214    // setup parameters
215    const Register t = G3_scratch;
216    __ ld_ptr(parameter_size.as_in().as_address(), t); // get parameter size (in words)
217    __ sll(t, Interpreter::logStackElementSize, t);    // compute number of bytes
218    __ sub(FP, t, Gargs);                              // setup parameter pointer
219    __ add( Gargs, STACK_BIAS, Gargs );                // Account for LP64 stack bias
220    __ mov(SP, O5_savedSP);
221
222
223    // do the call
224    //
    // the following registers must be set up:
226    //
227    // G2_thread
228    // G5_method
229    // Gargs
230    BLOCK_COMMENT("call Java function");
231    __ jmpl(entry_point.as_in().as_register(), G0, O7);
232    __ delayed()->mov(method.as_in().as_register(), G5_method);   // setup method
233
234    BLOCK_COMMENT("call_stub_return_address:");
235    return_pc = __ pc();
236
    // The callee, if it wasn't interpreted, can return with SP changed so
    // we can no longer make any assertions about how SP changed.
239
240    // store result depending on type
241    // (everything that is not T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE
242    //  is treated as T_INT)
243    { const Register addr = result     .as_in().as_register();
244      const Register type = result_type.as_in().as_register();
245      Label is_long, is_float, is_double, is_object, exit;
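      // Dispatch on the result type; each branch's delay slot performs the next
      // compare, so only the final branch needs an explicit nop in its delay slot.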
246      __            cmp(type, T_OBJECT);  __ br(Assembler::equal, false, Assembler::pn, is_object);
247      __ delayed()->cmp(type, T_FLOAT);   __ br(Assembler::equal, false, Assembler::pn, is_float);
248      __ delayed()->cmp(type, T_DOUBLE);  __ br(Assembler::equal, false, Assembler::pn, is_double);
249      __ delayed()->cmp(type, T_LONG);    __ br(Assembler::equal, false, Assembler::pn, is_long);
250      __ delayed()->nop();
251
252      // store int result
253      __ st(O0, addr, G0);
254
255      __ BIND(exit);
256      __ ret();
257      __ delayed()->restore();
258
259      __ BIND(is_object);
260      __ ba(exit);
261      __ delayed()->st_ptr(O0, addr, G0);
262
263      __ BIND(is_float);
264      __ ba(exit);
265      __ delayed()->stf(FloatRegisterImpl::S, F0, addr, G0);
266
267      __ BIND(is_double);
268      __ ba(exit);
269      __ delayed()->stf(FloatRegisterImpl::D, F0, addr, G0);
270
271      __ BIND(is_long);
272      __ ba(exit);
273      __ delayed()->st_long(O0, addr, G0);      // store entire long
274     }
275     return start;
276  }
277
278
279  //----------------------------------------------------------------------------------------------------
280  // Return point for a Java call if there's an exception thrown in Java code.
281  // The exception is caught and transformed into a pending exception stored in
282  // JavaThread that can be tested from within the VM.
283  //
284  // Oexception: exception oop
285
286  address generate_catch_exception() {
287    StubCodeMark mark(this, "StubRoutines", "catch_exception");
288
289    address start = __ pc();
290    // verify that thread corresponds
291    __ verify_thread();
292
293    const Register& temp_reg = Gtemp;
294    Address pending_exception_addr    (G2_thread, Thread::pending_exception_offset());
295    Address exception_file_offset_addr(G2_thread, Thread::exception_file_offset   ());
296    Address exception_line_offset_addr(G2_thread, Thread::exception_line_offset   ());
297
298    // set pending exception
299    __ verify_oop(Oexception);
300    __ st_ptr(Oexception, pending_exception_addr);
301    __ set((intptr_t)__FILE__, temp_reg);
302    __ st_ptr(temp_reg, exception_file_offset_addr);
303    __ set((intptr_t)__LINE__, temp_reg);
304    __ st(temp_reg, exception_line_offset_addr);
305
306    // complete return to VM
307    assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");
308
309    AddressLiteral stub_ret(StubRoutines::_call_stub_return_address);
310    __ jump_to(stub_ret, temp_reg);
311    __ delayed()->nop();
312
313    return start;
314  }
315
316
317  //----------------------------------------------------------------------------------------------------
318  // Continuation point for runtime calls returning with a pending exception
319  // The pending exception check happened in the runtime or native call stub
320  // The pending exception in Thread is converted into a Java-level exception
321  //
322  // Contract with Java-level exception handler: O0 = exception
323  //                                             O1 = throwing pc
324
325  address generate_forward_exception() {
326    StubCodeMark mark(this, "StubRoutines", "forward_exception");
327    address start = __ pc();
328
329    // Upon entry, O7 has the return address returning into Java
330    // (interpreted or compiled) code; i.e. the return address
331    // becomes the throwing pc.
332
333    const Register& handler_reg = Gtemp;
334
335    Address exception_addr(G2_thread, Thread::pending_exception_offset());
336
337#ifdef ASSERT
338    // make sure that this code is only executed if there is a pending exception
339    { Label L;
340      __ ld_ptr(exception_addr, Gtemp);
341      __ br_notnull_short(Gtemp, Assembler::pt, L);
342      __ stop("StubRoutines::forward exception: no pending exception (1)");
343      __ bind(L);
344    }
345#endif
346
347    // compute exception handler into handler_reg
348    __ get_thread();
349    __ ld_ptr(exception_addr, Oexception);
350    __ verify_oop(Oexception);
351    __ save_frame(0);             // compensates for compiler weakness
352    __ add(O7->after_save(), frame::pc_return_offset, Lscratch); // save the issuing PC
353    BLOCK_COMMENT("call exception_handler_for_return_address");
354    __ call_VM_leaf(L7_thread_cache, CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), G2_thread, Lscratch);
355    __ mov(O0, handler_reg);
356    __ restore();                 // compensates for compiler weakness
357
358    __ ld_ptr(exception_addr, Oexception);
359    __ add(O7, frame::pc_return_offset, Oissuing_pc); // save the issuing PC
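    // Oexception (O0) and Oissuing_pc (O1) now satisfy the handler contract stated above.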
360
361#ifdef ASSERT
362    // make sure exception is set
363    { Label L;
364      __ br_notnull_short(Oexception, Assembler::pt, L);
365      __ stop("StubRoutines::forward exception: no pending exception (2)");
366      __ bind(L);
367    }
368#endif
369    // jump to exception handler
370    __ jmp(handler_reg, 0);
371    // clear pending exception
372    __ delayed()->st_ptr(G0, exception_addr);
373
374    return start;
375  }
376
377  // Safefetch stubs.
378  void generate_safefetch(const char* name, int size, address* entry,
379                          address* fault_pc, address* continuation_pc) {
380    // safefetch signatures:
381    //   int      SafeFetch32(int*      adr, int      errValue);
382    //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
383    //
384    // arguments:
385    //   o0 = adr
386    //   o1 = errValue
387    //
388    // result:
389    //   o0  = *adr or errValue
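    //
    // If the load at *fault_pc faults, the signal handler resumes execution at
    // *continuation_pc; O0 still holds errValue, which becomes the result.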
390
391    StubCodeMark mark(this, "StubRoutines", name);
392
393    // Entry point, pc or function descriptor.
394    __ align(CodeEntryAlignment);
395    *entry = __ pc();
396
397    __ mov(O0, G1);  // g1 = o0
398    __ mov(O1, O0);  // o0 = o1
    // Load *adr into O0; this load may fault.
400    *fault_pc = __ pc();
401    switch (size) {
402      case 4:
403        // int32_t
404        __ ldsw(G1, 0, O0);  // o0 = [g1]
405        break;
406      case 8:
407        // int64_t
408        __ ldx(G1, 0, O0);   // o0 = [g1]
409        break;
410      default:
411        ShouldNotReachHere();
412    }
413
414    // return errValue or *adr
415    *continuation_pc = __ pc();
416    // By convention with the trap handler we ensure there is a non-CTI
417    // instruction in the trap shadow.
418    __ nop();
419    __ retl();
420    __ delayed()->nop();
421  }
422
423  //------------------------------------------------------------------------------------------------------------------------
424  // Continuation point for throwing of implicit exceptions that are not handled in
425  // the current activation. Fabricates an exception oop and initiates normal
426  // exception dispatching in this frame. Only callee-saved registers are preserved
427  // (through the normal register window / RegisterMap handling).
428  // If the compiler needs all registers to be preserved between the fault
429  // point and the exception handler then it must assume responsibility for that in
430  // AbstractCompiler::continuation_for_implicit_null_exception or
431  // continuation_for_implicit_division_by_zero_exception. All other implicit
  // exceptions (e.g., NullPointerException or AbstractMethodError on entry) occur
  // either at call sites or otherwise assume that stack unwinding will be initiated,
  // so caller-saved registers are assumed volatile in the compiler.
435
436  // Note that we generate only this stub into a RuntimeStub, because it needs to be
437  // properly traversed and ignored during GC, so we change the meaning of the "__"
438  // macro within this method.
439#undef __
440#define __ masm->
441
442  address generate_throw_exception(const char* name, address runtime_entry,
443                                   Register arg1 = noreg, Register arg2 = noreg) {
444#ifdef ASSERT
445    int insts_size = VerifyThread ? 1 * K : 600;
446#else
447    int insts_size = VerifyThread ? 1 * K : 256;
448#endif /* ASSERT */
449    int locs_size  = 32;
450
451    CodeBuffer      code(name, insts_size, locs_size);
452    MacroAssembler* masm = new MacroAssembler(&code);
453
454    __ verify_thread();
455
456    // This is an inlined and slightly modified version of call_VM
457    // which has the ability to fetch the return PC out of thread-local storage
458    __ assert_not_delayed();
459
460    // Note that we always push a frame because on the SPARC
461    // architecture, for all of our implicit exception kinds at call
462    // sites, the implicit exception is taken before the callee frame
463    // is pushed.
464    __ save_frame(0);
465
466    int frame_complete = __ offset();
467
468    // Note that we always have a runtime stub frame on the top of stack by this point
469    Register last_java_sp = SP;
470    // 64-bit last_java_sp is biased!
471    __ set_last_Java_frame(last_java_sp, G0);
472    if (VerifyThread)  __ mov(G2_thread, O0); // about to be smashed; pass early
473    __ save_thread(noreg);
474    if (arg1 != noreg) {
475      assert(arg2 != O1, "clobbered");
476      __ mov(arg1, O1);
477    }
478    if (arg2 != noreg) {
479      __ mov(arg2, O2);
480    }
481    // do the call
482    BLOCK_COMMENT("call runtime_entry");
483    __ call(runtime_entry, relocInfo::runtime_call_type);
484    if (!VerifyThread)
485      __ delayed()->mov(G2_thread, O0);  // pass thread as first argument
486    else
487      __ delayed()->nop();             // (thread already passed)
488    __ restore_thread(noreg);
489    __ reset_last_Java_frame();
490
491    // check for pending exceptions. use Gtemp as scratch register.
492#ifdef ASSERT
493    Label L;
494
495    Address exception_addr(G2_thread, Thread::pending_exception_offset());
496    Register scratch_reg = Gtemp;
497    __ ld_ptr(exception_addr, scratch_reg);
498    __ br_notnull_short(scratch_reg, Assembler::pt, L);
499    __ should_not_reach_here();
500    __ bind(L);
501#endif // ASSERT
502    BLOCK_COMMENT("call forward_exception_entry");
503    __ call(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
504    // we use O7 linkage so that forward_exception_entry has the issuing PC
505    __ delayed()->restore();
506
507    RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, masm->total_frame_size_in_bytes(0), NULL, false);
508    return stub->entry_point();
509  }
510
511#undef __
512#define __ _masm->
513
514
515  // Generate a routine that sets all the registers so we
516  // can tell if the stop routine prints them correctly.
517  address generate_test_stop() {
518    StubCodeMark mark(this, "StubRoutines", "test_stop");
519    address start = __ pc();
520
521    int i;
522
523    __ save_frame(0);
524
525    static jfloat zero = 0.0, one = 1.0;
526
527    // put addr in L0, then load through L0 to F0
528    __ set((intptr_t)&zero, L0);  __ ldf( FloatRegisterImpl::S, L0, 0, F0);
529    __ set((intptr_t)&one,  L0);  __ ldf( FloatRegisterImpl::S, L0, 0, F1); // 1.0 to F1
530
531    // use add to put 2..18 in F2..F18
532    for ( i = 2;  i <= 18;  ++i ) {
533      __ fadd( FloatRegisterImpl::S, F1, as_FloatRegister(i-1),  as_FloatRegister(i));
534    }
535
536    // Now put double 2 in F16, double 18 in F18
537    __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F2, F16 );
538    __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F18, F18 );
539
540    // use add to put 20..32 in F20..F32
541    for (i = 20; i < 32; i += 2) {
542      __ fadd( FloatRegisterImpl::D, F16, as_FloatRegister(i-2),  as_FloatRegister(i));
543    }
544
545    // put 0..7 in i's, 8..15 in l's, 16..23 in o's, 24..31 in g's
546    for ( i = 0; i < 8; ++i ) {
547      if (i < 6) {
548        __ set(     i, as_iRegister(i));
549        __ set(16 + i, as_oRegister(i));
550        __ set(24 + i, as_gRegister(i));
551      }
552      __ set( 8 + i, as_lRegister(i));
553    }
554
555    __ stop("testing stop");
556
557
558    __ ret();
559    __ delayed()->restore();
560
561    return start;
562  }
563
564
565  address generate_stop_subroutine() {
566    StubCodeMark mark(this, "StubRoutines", "stop_subroutine");
567    address start = __ pc();
568
569    __ stop_subroutine();
570
571    return start;
572  }
573
574  address generate_flush_callers_register_windows() {
575    StubCodeMark mark(this, "StubRoutines", "flush_callers_register_windows");
576    address start = __ pc();
577
578    __ flushw();
579    __ retl(false);
580    __ delayed()->add( FP, STACK_BIAS, O0 );
581    // The returned value must be a stack pointer whose register save area
582    // is flushed, and will stay flushed while the caller executes.
583
584    return start;
585  }
586
587  // Support for jint Atomic::xchg(jint exchange_value, volatile jint* dest).
588  //
589  // Arguments:
590  //
591  //      exchange_value: O0
592  //      dest:           O1
593  //
594  // Results:
595  //
596  //     O0: the value previously stored in dest
597  //
598  address generate_atomic_xchg() {
599    StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
600    address start = __ pc();
601
602    if (UseCASForSwap) {
603      // Use CAS instead of swap, just in case the MP hardware
604      // prefers to work with just one kind of synch. instruction.
605      Label retry;
606      __ BIND(retry);
607      __ mov(O0, O3);       // scratch copy of exchange value
608      __ ld(O1, 0, O2);     // observe the previous value
609      // try to replace O2 with O3
610      __ cas(O1, O2, O3);
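      // After the cas, O3 holds the value that was actually in memory; if it differs
      // from the O2 we observed, another thread intervened, so retry.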
611      __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pn, retry);
612
613      __ retl(false);
614      __ delayed()->mov(O2, O0);  // report previous value to caller
615    } else {
616      __ retl(false);
617      __ delayed()->swap(O1, 0, O0);
618    }
619
620    return start;
621  }
622
623
624  // Support for jint Atomic::cmpxchg(jint exchange_value, volatile jint* dest, jint compare_value)
625  //
626  // Arguments:
627  //
628  //      exchange_value: O0
629  //      dest:           O1
630  //      compare_value:  O2
631  //
632  // Results:
633  //
634  //     O0: the value previously stored in dest
635  //
636  address generate_atomic_cmpxchg() {
637    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
638    address start = __ pc();
639
640    // cmpxchg(dest, compare_value, exchange_value)
641    __ cas(O1, O2, O0);
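    // cas leaves the old memory value in O0, which is exactly the required return value.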
642    __ retl(false);
643    __ delayed()->nop();
644
645    return start;
646  }
647
648  // Support for jlong Atomic::cmpxchg(jlong exchange_value, volatile jlong *dest, jlong compare_value)
649  //
650  // Arguments:
651  //
652  //      exchange_value: O1:O0
653  //      dest:           O2
654  //      compare_value:  O4:O3
655  //
656  // Results:
657  //
658  //     O1:O0: the value previously stored in dest
659  //
660  // Overwrites: G1,G2,G3
661  //
662  address generate_atomic_cmpxchg_long() {
663    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
664    address start = __ pc();
665
666    __ sllx(O0, 32, O0);
667    __ srl(O1, 0, O1);
    __ or3(O0,O1,O0);      // O0 holds the 64-bit exchange_value
669    __ sllx(O3, 32, O3);
670    __ srl(O4, 0, O4);
    __ or3(O3,O4,O3);     // O3 holds the 64-bit compare_value
672    __ casx(O2, O3, O0);
673    __ srl(O0, 0, O1);    // unpacked return value in O1:O0
674    __ retl(false);
675    __ delayed()->srlx(O0, 32, O0);
676
677    return start;
678  }
679
680
681  // Support for jint Atomic::add(jint add_value, volatile jint* dest).
682  //
683  // Arguments:
684  //
685  //      add_value: O0   (e.g., +1 or -1)
686  //      dest:      O1
687  //
688  // Results:
689  //
690  //     O0: the new value stored in dest
691  //
692  // Overwrites: O3
693  //
694  address generate_atomic_add() {
695    StubCodeMark mark(this, "StubRoutines", "atomic_add");
696    address start = __ pc();
697    __ BIND(_atomic_add_stub);
698
    Label retry;
700    __ BIND(retry);
701
702    __ lduw(O1, 0, O2);
703    __ add(O0, O2, O3);
704    __ cas(O1, O2, O3);
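    // O3 now holds the old memory value; retry unless it matches the O2 loaded above.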
705    __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pn, retry);
706    __ retl(false);
707    __ delayed()->add(O0, O2, O0); // note that cas made O2==O3
708
709    return start;
710  }
711  Label _atomic_add_stub;  // called from other stubs
712
713
714  // Support for uint StubRoutine::Sparc::partial_subtype_check( Klass sub, Klass super );
715  // Arguments :
716  //
717  //      ret  : O0, returned
718  //      icc/xcc: set as O0 (depending on wordSize)
719  //      sub  : O1, argument, not changed
720  //      super: O2, argument, not changed
721  //      raddr: O7, blown by call
722  address generate_partial_subtype_check() {
723    __ align(CodeEntryAlignment);
724    StubCodeMark mark(this, "StubRoutines", "partial_subtype_check");
725    address start = __ pc();
726    Label miss;
727
728    __ save_frame(0);
729    Register Rret   = I0;
730    Register Rsub   = I1;
731    Register Rsuper = I2;
732
733    Register L0_ary_len = L0;
734    Register L1_ary_ptr = L1;
735    Register L2_super   = L2;
736    Register L3_index   = L3;
737
738    __ check_klass_subtype_slow_path(Rsub, Rsuper,
739                                     L0, L1, L2, L3,
740                                     NULL, &miss);
741
742    // Match falls through here.
743    __ addcc(G0,0,Rret);        // set Z flags, Z result
744
745    __ ret();                   // Result in Rret is zero; flags set to Z
746    __ delayed()->restore();
747
748    __ BIND(miss);
749    __ addcc(G0,1,Rret);        // set NZ flags, NZ result
750
751    __ ret();                   // Result in Rret is != 0; flags set to NZ
752    __ delayed()->restore();
753
754    return start;
755  }
756
757
758  // Called from MacroAssembler::verify_oop
759  //
760  address generate_verify_oop_subroutine() {
761    StubCodeMark mark(this, "StubRoutines", "verify_oop_stub");
762
763    address start = __ pc();
764
765    __ verify_oop_subroutine();
766
767    return start;
768  }
769
770
771  //
  // Verify that a register contains a clean 32-bit positive value
  // (high 32 bits are 0) so it can be used in 64-bit shifts (sllx, srax).
774  //
775  //  Input:
776  //    Rint  -  32-bits value
777  //    Rtmp  -  scratch
778  //
779  void assert_clean_int(Register Rint, Register Rtmp) {
780  #if defined(ASSERT)
781    __ signx(Rint, Rtmp);
782    __ cmp(Rint, Rtmp);
783    __ breakpoint_trap(Assembler::notEqual, Assembler::xcc);
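    // Trap if sign-extending the low 32 bits changes the value, i.e. the high bits were not clean.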
784  #endif
785  }
786
787  //
788  //  Generate overlap test for array copy stubs
789  //
790  //  Input:
791  //    O0    -  array1
792  //    O1    -  array2
793  //    O2    -  element count
794  //
795  //  Kills temps:  O3, O4
796  //
797  void array_overlap_test(address no_overlap_target, int log2_elem_size) {
798    assert(no_overlap_target != NULL, "must be generated");
799    array_overlap_test(no_overlap_target, NULL, log2_elem_size);
800  }
801  void array_overlap_test(Label& L_no_overlap, int log2_elem_size) {
802    array_overlap_test(NULL, &L_no_overlap, log2_elem_size);
803  }
804  void array_overlap_test(address no_overlap_target, Label* NOLp, int log2_elem_size) {
805    const Register from       = O0;
806    const Register to         = O1;
807    const Register count      = O2;
808    const Register to_from    = O3; // to - from
809    const Register byte_count = O4; // count << log2_elem_size
810
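      // A forward copy is safe (no destructive overlap) when to <= from, or when
      // the distance (to - from) is at least byte_count bytes.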
811      __ subcc(to, from, to_from);
812      __ sll_ptr(count, log2_elem_size, byte_count);
813      if (NOLp == NULL)
814        __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, no_overlap_target);
815      else
816        __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, (*NOLp));
817      __ delayed()->cmp(to_from, byte_count);
818      if (NOLp == NULL)
819        __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, no_overlap_target);
820      else
821        __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, (*NOLp));
822      __ delayed()->nop();
823  }
824
825  //
826  //  Generate pre-write barrier for array.
827  //
828  //  Input:
829  //     addr     - register containing starting address
830  //     count    - register containing element count
831  //     tmp      - scratch register
832  //
833  //  The input registers are overwritten.
834  //
835  void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
836    BarrierSet* bs = Universe::heap()->barrier_set();
837    switch (bs->kind()) {
838      case BarrierSet::G1SATBCTLogging:
        // With G1, don't generate the call if we statically know that the target is uninitialized
840        if (!dest_uninitialized) {
841          __ save_frame(0);
842          // Save the necessary global regs... will be used after.
843          if (addr->is_global()) {
844            __ mov(addr, L0);
845          }
846          if (count->is_global()) {
847            __ mov(count, L1);
848          }
849          __ mov(addr->after_save(), O0);
850          // Get the count into O1
851          __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre));
852          __ delayed()->mov(count->after_save(), O1);
853          if (addr->is_global()) {
854            __ mov(L0, addr);
855          }
856          if (count->is_global()) {
857            __ mov(L1, count);
858          }
859          __ restore();
860        }
861        break;
862      case BarrierSet::CardTableForRS:
863      case BarrierSet::CardTableExtension:
864      case BarrierSet::ModRef:
865        break;
866      default:
867        ShouldNotReachHere();
868    }
869  }
870  //
871  //  Generate post-write barrier for array.
872  //
873  //  Input:
874  //     addr     - register containing starting address
875  //     count    - register containing element count
876  //     tmp      - scratch register
877  //
878  //  The input registers are overwritten.
879  //
880  void gen_write_ref_array_post_barrier(Register addr, Register count,
881                                        Register tmp) {
882    BarrierSet* bs = Universe::heap()->barrier_set();
883
884    switch (bs->kind()) {
885      case BarrierSet::G1SATBCTLogging:
886        {
887          // Get some new fresh output registers.
888          __ save_frame(0);
889          __ mov(addr->after_save(), O0);
890          __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post));
891          __ delayed()->mov(count->after_save(), O1);
892          __ restore();
893        }
894        break;
895      case BarrierSet::CardTableForRS:
896      case BarrierSet::CardTableExtension:
897        {
898          CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs);
899          assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
900          assert_different_registers(addr, count, tmp);
901
902          Label L_loop;
903
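          // Dirty every card spanned by the written oop range [addr, addr + count * BytesPerHeapOop).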
904          __ sll_ptr(count, LogBytesPerHeapOop, count);
905          __ sub(count, BytesPerHeapOop, count);
906          __ add(count, addr, count);
907          // Use two shifts to clear out those low order two bits! (Cannot opt. into 1.)
908          __ srl_ptr(addr, CardTableModRefBS::card_shift, addr);
909          __ srl_ptr(count, CardTableModRefBS::card_shift, count);
910          __ sub(count, addr, count);
911          AddressLiteral rs(ct->byte_map_base);
912          __ set(rs, tmp);
913        __ BIND(L_loop);
914          __ stb(G0, tmp, addr);
915          __ subcc(count, 1, count);
916          __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
917          __ delayed()->add(addr, 1, addr);
918        }
919        break;
920      case BarrierSet::ModRef:
921        break;
922      default:
923        ShouldNotReachHere();
924    }
925  }
926
927  //
928  // Generate main code for disjoint arraycopy
929  //
930  typedef void (StubGenerator::*CopyLoopFunc)(Register from, Register to, Register count, int count_dec,
931                                              Label& L_loop, bool use_prefetch, bool use_bis);
932
933  void disjoint_copy_core(Register from, Register to, Register count, int log2_elem_size,
934                          int iter_size, StubGenerator::CopyLoopFunc copy_loop_func) {
935    Label L_copy;
936
937    assert(log2_elem_size <= 3, "the following code should be changed");
938    int count_dec = 16>>log2_elem_size;
939
940    int prefetch_dist = MAX2(ArraycopySrcPrefetchDistance, ArraycopyDstPrefetchDistance);
941    assert(prefetch_dist < 4096, "invalid value");
942    prefetch_dist = (prefetch_dist + (iter_size-1)) & (-iter_size); // round up to one iteration copy size
943    int prefetch_count = (prefetch_dist >> log2_elem_size); // elements count
944
945    if (UseBlockCopy) {
946      Label L_block_copy, L_block_copy_prefetch, L_skip_block_copy;
947
948      // 64 bytes tail + bytes copied in one loop iteration
949      int tail_size = 64 + iter_size;
950      int block_copy_count = (MAX2(tail_size, (int)BlockCopyLowLimit)) >> log2_elem_size;
951      // Use BIS copy only for big arrays since it requires membar.
952      __ set(block_copy_count, O4);
953      __ cmp_and_br_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_skip_block_copy);
954      // This code is for disjoint source and destination:
955      //   to <= from || to >= from+count
956      // but BIS will stomp over 'from' if (to > from-tail_size && to <= from)
957      __ sub(from, to, O4);
      __ srax(O4, 4, O4); // divide by 16 since the following short branch has only 5 bits for the immediate
959      __ cmp_and_br_short(O4, (tail_size>>4), Assembler::lessEqualUnsigned, Assembler::pn, L_skip_block_copy);
960
961      __ wrasi(G0, Assembler::ASI_ST_BLKINIT_PRIMARY);
      // BIS should not be used to copy the tail (64 bytes + iter_size)
      // to avoid zeroing the values that follow.
964      __ sub(count, (tail_size>>log2_elem_size), count); // count is still positive >= 0
965
966      if (prefetch_count > 0) { // rounded up to one iteration count
967        // Do prefetching only if copy size is bigger
968        // than prefetch distance.
969        __ set(prefetch_count, O4);
970        __ cmp_and_brx_short(count, O4, Assembler::less, Assembler::pt, L_block_copy);
971        __ sub(count, O4, count);
972
973        (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy_prefetch, true, true);
974        __ set(prefetch_count, O4);
975        __ add(count, O4, count);
976
977      } // prefetch_count > 0
978
979      (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy, false, true);
980      __ add(count, (tail_size>>log2_elem_size), count); // restore count
981
982      __ wrasi(G0, Assembler::ASI_PRIMARY_NOFAULT);
983      // BIS needs membar.
984      __ membar(Assembler::StoreLoad);
985      // Copy tail
986      __ ba_short(L_copy);
987
988      __ BIND(L_skip_block_copy);
989    } // UseBlockCopy
990
991    if (prefetch_count > 0) { // rounded up to one iteration count
992      // Do prefetching only if copy size is bigger
993      // than prefetch distance.
994      __ set(prefetch_count, O4);
995      __ cmp_and_brx_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_copy);
996      __ sub(count, O4, count);
997
998      Label L_copy_prefetch;
999      (this->*copy_loop_func)(from, to, count, count_dec, L_copy_prefetch, true, false);
1000      __ set(prefetch_count, O4);
1001      __ add(count, O4, count);
1002
1003    } // prefetch_count > 0
1004
1005    (this->*copy_loop_func)(from, to, count, count_dec, L_copy, false, false);
1006  }
1007
1008
1009
1010  //
1011  // Helper methods for copy_16_bytes_forward_with_shift()
1012  //
1013  void copy_16_bytes_shift_loop(Register from, Register to, Register count, int count_dec,
1014                                Label& L_loop, bool use_prefetch, bool use_bis) {
1015
1016    const Register left_shift  = G1; // left  shift bit counter
1017    const Register right_shift = G5; // right shift bit counter
1018
1019    __ align(OptoLoopAlignment);
1020    __ BIND(L_loop);
1021    if (use_prefetch) {
1022      if (ArraycopySrcPrefetchDistance > 0) {
1023        __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
1024      }
1025      if (ArraycopyDstPrefetchDistance > 0) {
1026        __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
1027      }
1028    }
1029    __ ldx(from, 0, O4);
1030    __ ldx(from, 8, G4);
1031    __ inc(to, 16);
1032    __ inc(from, 16);
1033    __ deccc(count, count_dec); // Can we do next iteration after this one?
1034    __ srlx(O4, right_shift, G3);
1035    __ bset(G3, O3);
1036    __ sllx(O4, left_shift,  O4);
1037    __ srlx(G4, right_shift, G3);
1038    __ bset(G3, O4);
1039    if (use_bis) {
1040      __ stxa(O3, to, -16);
1041      __ stxa(O4, to, -8);
1042    } else {
1043      __ stx(O3, to, -16);
1044      __ stx(O4, to, -8);
1045    }
1046    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
1047    __ delayed()->sllx(G4, left_shift,  O3);
1048  }
1049
1050  // Copy big chunks forward with shift
1051  //
1052  // Inputs:
  //   from      - source array address
  //   to        - destination array address, aligned to 8 bytes
  //   count     - element count to copy, >= the count equivalent to 16 bytes
  //   count_dec - decrement of the element count equivalent to 16 bytes
1057  //   L_copy_bytes - copy exit label
1058  //
1059  void copy_16_bytes_forward_with_shift(Register from, Register to,
1060                     Register count, int log2_elem_size, Label& L_copy_bytes) {
1061    Label L_aligned_copy, L_copy_last_bytes;
1062    assert(log2_elem_size <= 3, "the following code should be changed");
1063    int count_dec = 16>>log2_elem_size;
1064
1065    // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
1066    __ andcc(from, 7, G1); // misaligned bytes
1067    __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
1068    __ delayed()->nop();
1069
1070    const Register left_shift  = G1; // left  shift bit counter
1071    const Register right_shift = G5; // right shift bit counter
1072
1073    __ sll(G1, LogBitsPerByte, left_shift);
1074    __ mov(64, right_shift);
1075    __ sub(right_shift, left_shift, right_shift);
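    // The misaligned source is read as aligned 8-byte chunks; pieces of two consecutive
    // chunks are shifted together to form each aligned 8-byte store.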
1076
1077    //
1078    // Load 2 aligned 8-bytes chunks and use one from previous iteration
1079    // to form 2 aligned 8-bytes chunks to store.
1080    //
1081    __ dec(count, count_dec);   // Pre-decrement 'count'
1082    __ andn(from, 7, from);     // Align address
1083    __ ldx(from, 0, O3);
1084    __ inc(from, 8);
1085    __ sllx(O3, left_shift,  O3);
1086
1087    disjoint_copy_core(from, to, count, log2_elem_size, 16, &StubGenerator::copy_16_bytes_shift_loop);
1088
1089    __ inccc(count, count_dec>>1 ); // + 8 bytes
1090    __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
1091    __ delayed()->inc(count, count_dec>>1); // restore 'count'
1092
1093    // copy 8 bytes, part of them already loaded in O3
1094    __ ldx(from, 0, O4);
1095    __ inc(to, 8);
1096    __ inc(from, 8);
1097    __ srlx(O4, right_shift, G3);
1098    __ bset(O3, G3);
1099    __ stx(G3, to, -8);
1100
1101    __ BIND(L_copy_last_bytes);
1102    __ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes
1103    __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
1104    __ delayed()->sub(from, right_shift, from);       // restore address
1105
1106    __ BIND(L_aligned_copy);
1107  }
1108
1109  // Copy big chunks backward with shift
1110  //
1111  // Inputs:
  //   end_from  - source array end address
  //   end_to    - destination array end address, aligned to 8 bytes
  //   count     - element count to copy, >= the count equivalent to 16 bytes
  //   count_dec - decrement of the element count equivalent to 16 bytes
1116  //   L_aligned_copy - aligned copy exit label
1117  //   L_copy_bytes   - copy exit label
1118  //
1119  void copy_16_bytes_backward_with_shift(Register end_from, Register end_to,
1120                     Register count, int count_dec,
1121                     Label& L_aligned_copy, Label& L_copy_bytes) {
1122    Label L_loop, L_copy_last_bytes;
1123
1124    // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
1125      __ andcc(end_from, 7, G1); // misaligned bytes
1126      __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
1127      __ delayed()->deccc(count, count_dec); // Pre-decrement 'count'
1128
1129    const Register left_shift  = G1; // left  shift bit counter
1130    const Register right_shift = G5; // right shift bit counter
1131
1132      __ sll(G1, LogBitsPerByte, left_shift);
1133      __ mov(64, right_shift);
1134      __ sub(right_shift, left_shift, right_shift);
1135
1136    //
1137    // Load 2 aligned 8-bytes chunks and use one from previous iteration
1138    // to form 2 aligned 8-bytes chunks to store.
1139    //
1140      __ andn(end_from, 7, end_from);     // Align address
1141      __ ldx(end_from, 0, O3);
1142      __ align(OptoLoopAlignment);
1143    __ BIND(L_loop);
1144      __ ldx(end_from, -8, O4);
1145      __ deccc(count, count_dec); // Can we do next iteration after this one?
1146      __ ldx(end_from, -16, G4);
1147      __ dec(end_to, 16);
1148      __ dec(end_from, 16);
1149      __ srlx(O3, right_shift, O3);
1150      __ sllx(O4, left_shift,  G3);
1151      __ bset(G3, O3);
1152      __ stx(O3, end_to, 8);
1153      __ srlx(O4, right_shift, O4);
1154      __ sllx(G4, left_shift,  G3);
1155      __ bset(G3, O4);
1156      __ stx(O4, end_to, 0);
1157      __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
1158      __ delayed()->mov(G4, O3);
1159
1160      __ inccc(count, count_dec>>1 ); // + 8 bytes
1161      __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
1162      __ delayed()->inc(count, count_dec>>1); // restore 'count'
1163
1164      // copy 8 bytes, part of them already loaded in O3
1165      __ ldx(end_from, -8, O4);
1166      __ dec(end_to, 8);
1167      __ dec(end_from, 8);
1168      __ srlx(O3, right_shift, O3);
1169      __ sllx(O4, left_shift,  G3);
1170      __ bset(O3, G3);
1171      __ stx(G3, end_to, 0);
1172
1173    __ BIND(L_copy_last_bytes);
1174      __ srl(left_shift, LogBitsPerByte, left_shift);    // misaligned bytes
1175      __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
1176      __ delayed()->add(end_from, left_shift, end_from); // restore address
1177  }
1178
1179  //
1180  //  Generate stub for disjoint byte copy.  If "aligned" is true, the
1181  //  "from" and "to" addresses are assumed to be heapword aligned.
1182  //
1183  // Arguments for generated stub:
1184  //      from:  O0
1185  //      to:    O1
1186  //      count: O2 treated as signed
1187  //
1188  address generate_disjoint_byte_copy(bool aligned, address *entry, const char *name) {
1189    __ align(CodeEntryAlignment);
1190    StubCodeMark mark(this, "StubRoutines", name);
1191    address start = __ pc();
1192
1193    Label L_skip_alignment, L_align;
1194    Label L_copy_byte, L_copy_byte_loop, L_exit;
1195
1196    const Register from      = O0;   // source array address
1197    const Register to        = O1;   // destination array address
1198    const Register count     = O2;   // elements count
1199    const Register offset    = O5;   // offset from start of arrays
1200    // O3, O4, G3, G4 are used as temp registers
1201
1202    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1203
1204    if (entry != NULL) {
1205      *entry = __ pc();
1206      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1207      BLOCK_COMMENT("Entry:");
1208    }
1209
1210    // for short arrays, just do single element copy
1211    __ cmp(count, 23); // 16 + 7
1212    __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
1213    __ delayed()->mov(G0, offset);
1214
1215    if (aligned) {
1216      // 'aligned' == true when it is known statically during compilation
1217      // of this arraycopy call site that both 'from' and 'to' addresses
1218      // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
1219      //
      // Aligned arrays have 4-byte alignment in a 32-bit VM
      // and 8-byte alignment in a 64-bit VM, so this only matters for a 32-bit VM.
1222      //
1223    } else {
1224      // copy bytes to align 'to' on 8 byte boundary
1225      __ andcc(to, 7, G1); // misaligned bytes
1226      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1227      __ delayed()->neg(G1);
      __ inc(G1, 8);       // bytes needed to reach the next 8-byte alignment
1229      __ sub(count, G1, count);
1230    __ BIND(L_align);
1231      __ ldub(from, 0, O3);
1232      __ deccc(G1);
1233      __ inc(from);
1234      __ stb(O3, to, 0);
1235      __ br(Assembler::notZero, false, Assembler::pt, L_align);
1236      __ delayed()->inc(to);
1237    __ BIND(L_skip_alignment);
1238    }
1239    if (!aligned) {
1240      // Copy with shift 16 bytes per iteration if arrays do not have
1241      // the same alignment mod 8, otherwise fall through to the next
1242      // code for aligned copy.
      // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
1244      // Also jump over aligned copy after the copy with shift completed.
1245
1246      copy_16_bytes_forward_with_shift(from, to, count, 0, L_copy_byte);
1247    }
1248
    // Both arrays are 8-byte aligned; copy 16 bytes at a time
1250      __ and3(count, 7, G4); // Save count
1251      __ srl(count, 3, count);
1252     generate_disjoint_long_copy_core(aligned);
1253      __ mov(G4, count);     // Restore count
1254
    // copy trailing bytes
1256    __ BIND(L_copy_byte);
1257      __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1258      __ align(OptoLoopAlignment);
1259    __ BIND(L_copy_byte_loop);
1260      __ ldub(from, offset, O3);
1261      __ deccc(count);
1262      __ stb(O3, to, offset);
1263      __ brx(Assembler::notZero, false, Assembler::pt, L_copy_byte_loop);
1264      __ delayed()->inc(offset);
1265
1266    __ BIND(L_exit);
1267      // O3, O4 are used as temp registers
1268      inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
1269      __ retl();
1270      __ delayed()->mov(G0, O0); // return 0
1271    return start;
1272  }
1273
1274  //
1275  //  Generate stub for conjoint byte copy.  If "aligned" is true, the
1276  //  "from" and "to" addresses are assumed to be heapword aligned.
1277  //
1278  // Arguments for generated stub:
1279  //      from:  O0
1280  //      to:    O1
1281  //      count: O2 treated as signed
1282  //
1283  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1284                                      address *entry, const char *name) {
1285    // Do reverse copy.
1286
1287    __ align(CodeEntryAlignment);
1288    StubCodeMark mark(this, "StubRoutines", name);
1289    address start = __ pc();
1290
1291    Label L_skip_alignment, L_align, L_aligned_copy;
1292    Label L_copy_byte, L_copy_byte_loop, L_exit;
1293
1294    const Register from      = O0;   // source array address
1295    const Register to        = O1;   // destination array address
1296    const Register count     = O2;   // elements count
1297    const Register end_from  = from; // source array end address
1298    const Register end_to    = to;   // destination array end address
1299
1300    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1301
1302    if (entry != NULL) {
1303      *entry = __ pc();
1304      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1305      BLOCK_COMMENT("Entry:");
1306    }
1307
1308    array_overlap_test(nooverlap_target, 0);
1309
1310    __ add(to, count, end_to);       // offset after last copied element
1311
1312    // for short arrays, just do single element copy
1313    __ cmp(count, 23); // 16 + 7
1314    __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
1315    __ delayed()->add(from, count, end_from);
1316
1317    {
      // Align the end of the arrays since they could be unaligned even
      // when the arrays themselves are aligned.
1320
1321      // copy bytes to align 'end_to' on 8 byte boundary
1322      __ andcc(end_to, 7, G1); // misaligned bytes
1323      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1324      __ delayed()->nop();
1325      __ sub(count, G1, count);
1326    __ BIND(L_align);
1327      __ dec(end_from);
1328      __ dec(end_to);
1329      __ ldub(end_from, 0, O3);
1330      __ deccc(G1);
1331      __ brx(Assembler::notZero, false, Assembler::pt, L_align);
1332      __ delayed()->stb(O3, end_to, 0);
1333    __ BIND(L_skip_alignment);
1334    }
1335    if (aligned) {
1336      // Both arrays are aligned to 8-bytes in 64-bits VM.
1337      // The 'count' is decremented in copy_16_bytes_backward_with_shift()
1338      // in unaligned case.
1339      __ dec(count, 16);
1340    } else {
1341      // Copy with shift 16 bytes per iteration if arrays do not have
1342      // the same alignment mod 8, otherwise jump to the next
      // code for aligned copy (and subtracting 16 from 'count' before the jump).
      // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
1345      // Also jump over aligned copy after the copy with shift completed.
1346
1347      copy_16_bytes_backward_with_shift(end_from, end_to, count, 16,
1348                                        L_aligned_copy, L_copy_byte);
1349    }
    // copy 16 elements (16 bytes) at a time
1351      __ align(OptoLoopAlignment);
1352    __ BIND(L_aligned_copy);
1353      __ dec(end_from, 16);
1354      __ ldx(end_from, 8, O3);
1355      __ ldx(end_from, 0, O4);
1356      __ dec(end_to, 16);
1357      __ deccc(count, 16);
1358      __ stx(O3, end_to, 8);
1359      __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
1360      __ delayed()->stx(O4, end_to, 0);
1361      __ inc(count, 16);
1362
    // copy 1 element (1 byte) at a time
1364    __ BIND(L_copy_byte);
1365      __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1366      __ align(OptoLoopAlignment);
1367    __ BIND(L_copy_byte_loop);
1368      __ dec(end_from);
1369      __ dec(end_to);
1370      __ ldub(end_from, 0, O4);
1371      __ deccc(count);
1372      __ brx(Assembler::greater, false, Assembler::pt, L_copy_byte_loop);
1373      __ delayed()->stb(O4, end_to, 0);
1374
1375    __ BIND(L_exit);
1376    // O3, O4 are used as temp registers
1377    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
1378    __ retl();
1379    __ delayed()->mov(G0, O0); // return 0
1380    return start;
1381  }
1382
1383  //
1384  //  Generate stub for disjoint short copy.  If "aligned" is true, the
1385  //  "from" and "to" addresses are assumed to be heapword aligned.
1386  //
1387  // Arguments for generated stub:
1388  //      from:  O0
1389  //      to:    O1
1390  //      count: O2 treated as signed
1391  //
1392  address generate_disjoint_short_copy(bool aligned, address *entry, const char * name) {
1393    __ align(CodeEntryAlignment);
1394    StubCodeMark mark(this, "StubRoutines", name);
1395    address start = __ pc();
1396
1397    Label L_skip_alignment, L_skip_alignment2;
1398    Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit;
1399
1400    const Register from      = O0;   // source array address
1401    const Register to        = O1;   // destination array address
1402    const Register count     = O2;   // elements count
1403    const Register offset    = O5;   // offset from start of arrays
1404    // O3, O4, G3, G4 are used as temp registers
1405
1406    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1407
1408    if (entry != NULL) {
1409      *entry = __ pc();
1410      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1411      BLOCK_COMMENT("Entry:");
1412    }
1413
1414    // for short arrays, just do single element copy
1415    __ cmp(count, 11); // 8 + 3  (22 bytes)
1416    __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes);
1417    __ delayed()->mov(G0, offset);
1418
1419    if (aligned) {
1420      // 'aligned' == true when it is known statically during compilation
1421      // of this arraycopy call site that both 'from' and 'to' addresses
1422      // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
1423      //
      // Aligned arrays have 4-byte alignment in a 32-bit VM
      // and 8-byte alignment in a 64-bit VM.
1426      //
1427    } else {
      // copy 1 element if necessary to align 'to' on a 4-byte boundary
1429      __ andcc(to, 3, G0);
1430      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1431      __ delayed()->lduh(from, 0, O3);
1432      __ inc(from, 2);
1433      __ inc(to, 2);
1434      __ dec(count);
1435      __ sth(O3, to, -2);
1436    __ BIND(L_skip_alignment);
1437
1438      // copy 2 elements to align 'to' on an 8 byte boundary
1439      __ andcc(to, 7, G0);
1440      __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
1441      __ delayed()->lduh(from, 0, O3);
1442      __ dec(count, 2);
1443      __ lduh(from, 2, O4);
1444      __ inc(from, 4);
1445      __ inc(to, 4);
1446      __ sth(O3, to, -4);
1447      __ sth(O4, to, -2);
1448    __ BIND(L_skip_alignment2);
1449    }
1450    if (!aligned) {
1451      // Copy with shift 16 bytes per iteration if arrays do not have
1452      // the same alignment mod 8, otherwise fall through to the next
1453      // code for aligned copy.
      // The compare above (count >= 11) guarantees 'count' >= 16 bytes.
1455      // Also jump over aligned copy after the copy with shift completed.
1456
1457      copy_16_bytes_forward_with_shift(from, to, count, 1, L_copy_2_bytes);
1458    }
1459
    // Both arrays are 8-byte aligned; copy 16 bytes at a time
1461      __ and3(count, 3, G4); // Save
1462      __ srl(count, 2, count);
1463     generate_disjoint_long_copy_core(aligned);
1464      __ mov(G4, count); // restore
1465
1466    // copy 1 element at a time
1467    __ BIND(L_copy_2_bytes);
1468      __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1469      __ align(OptoLoopAlignment);
1470    __ BIND(L_copy_2_bytes_loop);
1471      __ lduh(from, offset, O3);
1472      __ deccc(count);
1473      __ sth(O3, to, offset);
1474      __ brx(Assembler::notZero, false, Assembler::pt, L_copy_2_bytes_loop);
1475      __ delayed()->inc(offset, 2);
1476
1477    __ BIND(L_exit);
1478      // O3, O4 are used as temp registers
1479      inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4);
1480      __ retl();
1481      __ delayed()->mov(G0, O0); // return 0
1482    return start;
1483  }
1484
1485  //
  //  Generate stub for array fill.  If "aligned" is true, the
1487  //  "to" address is assumed to be heapword aligned.
1488  //
1489  // Arguments for generated stub:
1490  //      to:    O0
1491  //      value: O1
1492  //      count: O2 treated as signed
1493  //
1494  address generate_fill(BasicType t, bool aligned, const char* name) {
1495    __ align(CodeEntryAlignment);
1496    StubCodeMark mark(this, "StubRoutines", name);
1497    address start = __ pc();
1498
    const Register to        = O0;   // destination array address
1500    const Register value     = O1;   // fill value
1501    const Register count     = O2;   // elements count
1502    // O3 is used as a temp register
1503
1504    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1505
1506    Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
1507    Label L_fill_2_bytes, L_fill_elements, L_fill_32_bytes;
1508
1509    int shift = -1;
1510    switch (t) {
1511      case T_BYTE:
1512        shift = 2;
1513        break;
1514      case T_SHORT:
1515        shift = 1;
1516        break;
1517      case T_INT:
1518        shift = 0;
1519        break;
1520      default: ShouldNotReachHere();
1521    }
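        // Note on 'shift': it is log2 of the number of elements per 4 bytes
        // (T_BYTE -> 2, T_SHORT -> 1, T_INT -> 0). Below, (1 << shift) is the
        // element count covered by a 4-byte store, (2 << shift) by an 8-byte
        // store, and (8 << shift) by a 32-byte chunk.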
1522
1523    BLOCK_COMMENT("Entry:");
1524
1525    if (t == T_BYTE) {
1526      // Zero extend value and duplicate the byte into a halfword
1527      __ and3(value, 0xff, value);
1528      __ sllx(value, 8, O3);
1529      __ or3(value, O3, value);
1530    }
1531    if (t == T_SHORT) {
1532      // Zero extend value
1533      __ sllx(value, 48, value);
1534      __ srlx(value, 48, value);
1535    }
1536    if (t == T_BYTE || t == T_SHORT) {
1537      __ sllx(value, 16, O3);
1538      __ or3(value, O3, value);
1539    }
1540
1541    __ cmp(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
1542    __ brx(Assembler::lessUnsigned, false, Assembler::pn, L_fill_elements); // use unsigned cmp
1543    __ delayed()->andcc(count, 1, G0);
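        // The andcc in the delay slot above computes (count & 1). When the
        // branch to L_fill_elements is taken, control goes straight to that
        // label, so the condition codes are still live there and select the
        // odd trailing element (see the "in delay slot" comments below).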
1544
1545    if (!aligned && (t == T_BYTE || t == T_SHORT)) {
1546      // align destination address on a 4-byte boundary
1547      if (t == T_BYTE) {
1548        // One byte misalignment happens only for byte arrays
1549        __ andcc(to, 1, G0);
1550        __ br(Assembler::zero, false, Assembler::pt, L_skip_align1);
1551        __ delayed()->nop();
1552        __ stb(value, to, 0);
1553        __ inc(to, 1);
1554        __ dec(count, 1);
1555        __ BIND(L_skip_align1);
1556      }
1557      // Two bytes misalignment happens only for byte and short (char) arrays
1558      __ andcc(to, 2, G0);
1559      __ br(Assembler::zero, false, Assembler::pt, L_skip_align2);
1560      __ delayed()->nop();
1561      __ sth(value, to, 0);
1562      __ inc(to, 2);
1563      __ dec(count, 1 << (shift - 1));
1564      __ BIND(L_skip_align2);
1565    }
1566    if (!aligned) {
1567      // align to 8 bytes; we know we are 4-byte aligned to start
1568      __ andcc(to, 7, G0);
1569      __ br(Assembler::zero, false, Assembler::pt, L_fill_32_bytes);
1570      __ delayed()->nop();
1571      __ stw(value, to, 0);
1572      __ inc(to, 4);
1573      __ dec(count, 1 << shift);
1574      __ BIND(L_fill_32_bytes);
1575    }
1576
1577    if (t == T_INT) {
1578      // Zero extend value
1579      __ srl(value, 0, value);
1580    }
1581    if (t == T_BYTE || t == T_SHORT || t == T_INT) {
1582      __ sllx(value, 32, O3);
1583      __ or3(value, O3, value);
1584    }
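        // 'value' now holds the fill pattern replicated across all 64 bits.
        // For example, assuming T_BYTE and an input value of 0xAB:
        //   and3/sllx(8)/or3 above -> 0x000000000000ABAB
        //   sllx(16)/or3           -> 0x00000000ABABABAB
        //   sllx(32)/or3           -> 0xABABABABABABABAB
        // so each 8-byte store below writes eight filled bytes at once.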
1585
1586    Label L_check_fill_8_bytes;
1587    // Fill 32-byte chunks
1588    __ subcc(count, 8 << shift, count);
1589    __ brx(Assembler::less, false, Assembler::pt, L_check_fill_8_bytes);
1590    __ delayed()->nop();
1591
1592    Label L_fill_32_bytes_loop, L_fill_4_bytes;
1593    __ align(16);
1594    __ BIND(L_fill_32_bytes_loop);
1595
1596    __ stx(value, to, 0);
1597    __ stx(value, to, 8);
1598    __ stx(value, to, 16);
1599    __ stx(value, to, 24);
1600
1601    __ subcc(count, 8 << shift, count);
1602    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_fill_32_bytes_loop);
1603    __ delayed()->add(to, 32, to);
1604
1605    __ BIND(L_check_fill_8_bytes);
1606    __ addcc(count, 8 << shift, count);
1607    __ brx(Assembler::zero, false, Assembler::pn, L_exit);
1608    __ delayed()->subcc(count, 1 << (shift + 1), count);
1609    __ brx(Assembler::less, false, Assembler::pn, L_fill_4_bytes);
1610    __ delayed()->andcc(count, 1<<shift, G0);
1611
1612    //
1613    // length is too short, just fill 8 bytes at a time
1614    //
1615    Label L_fill_8_bytes_loop;
1616    __ BIND(L_fill_8_bytes_loop);
1617    __ stx(value, to, 0);
1618    __ subcc(count, 1 << (shift + 1), count);
1619    __ brx(Assembler::greaterEqual, false, Assembler::pn, L_fill_8_bytes_loop);
1620    __ delayed()->add(to, 8, to);
1621
1622    // fill trailing 4 bytes
1623    __ andcc(count, 1<<shift, G0);  // in delay slot of branches
1624    if (t == T_INT) {
1625      __ BIND(L_fill_elements);
1626    }
1627    __ BIND(L_fill_4_bytes);
1628    __ brx(Assembler::zero, false, Assembler::pt, L_fill_2_bytes);
1629    if (t == T_BYTE || t == T_SHORT) {
1630      __ delayed()->andcc(count, 1<<(shift-1), G0);
1631    } else {
1632      __ delayed()->nop();
1633    }
1634    __ stw(value, to, 0);
1635    if (t == T_BYTE || t == T_SHORT) {
1636      __ inc(to, 4);
1637      // fill trailing 2 bytes
1638      __ andcc(count, 1<<(shift-1), G0); // in delay slot of branches
1639      __ BIND(L_fill_2_bytes);
1640      __ brx(Assembler::zero, false, Assembler::pt, L_fill_byte);
1641      __ delayed()->andcc(count, 1, count);
1642      __ sth(value, to, 0);
1643      if (t == T_BYTE) {
1644        __ inc(to, 2);
1645        // fill trailing byte
1646        __ andcc(count, 1, count);  // in delay slot of branches
1647        __ BIND(L_fill_byte);
1648        __ brx(Assembler::zero, false, Assembler::pt, L_exit);
1649        __ delayed()->nop();
1650        __ stb(value, to, 0);
1651      } else {
1652        __ BIND(L_fill_byte);
1653      }
1654    } else {
1655      __ BIND(L_fill_2_bytes);
1656    }
1657    __ BIND(L_exit);
1658    __ retl();
1659    __ delayed()->nop();
1660
1661    // Handle fills of less than 8 bytes.  Int is handled elsewhere.
1662    if (t == T_BYTE) {
1663      __ BIND(L_fill_elements);
1664      Label L_fill_2, L_fill_4;
1665      // in delay slot __ andcc(count, 1, G0);
1666      __ brx(Assembler::zero, false, Assembler::pt, L_fill_2);
1667      __ delayed()->andcc(count, 2, G0);
1668      __ stb(value, to, 0);
1669      __ inc(to, 1);
1670      __ BIND(L_fill_2);
1671      __ brx(Assembler::zero, false, Assembler::pt, L_fill_4);
1672      __ delayed()->andcc(count, 4, G0);
1673      __ stb(value, to, 0);
1674      __ stb(value, to, 1);
1675      __ inc(to, 2);
1676      __ BIND(L_fill_4);
1677      __ brx(Assembler::zero, false, Assembler::pt, L_exit);
1678      __ delayed()->nop();
1679      __ stb(value, to, 0);
1680      __ stb(value, to, 1);
1681      __ stb(value, to, 2);
1682      __ retl();
1683      __ delayed()->stb(value, to, 3);
1684    }
1685
1686    if (t == T_SHORT) {
1687      Label L_fill_2;
1688      __ BIND(L_fill_elements);
1689      // in delay slot __ andcc(count, 1, G0);
1690      __ brx(Assembler::zero, false, Assembler::pt, L_fill_2);
1691      __ delayed()->andcc(count, 2, G0);
1692      __ sth(value, to, 0);
1693      __ inc(to, 2);
1694      __ BIND(L_fill_2);
1695      __ brx(Assembler::zero, false, Assembler::pt, L_exit);
1696      __ delayed()->nop();
1697      __ sth(value, to, 0);
1698      __ retl();
1699      __ delayed()->sth(value, to, 2);
1700    }
1701    return start;
1702  }
1703
1704  //
1705  //  Generate stub for conjoint short copy.  If "aligned" is true, the
1706  //  "from" and "to" addresses are assumed to be heapword aligned.
1707  //
1708  // Arguments for generated stub:
1709  //      from:  O0
1710  //      to:    O1
1711  //      count: O2 treated as signed
1712  //
1713  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1714                                       address *entry, const char *name) {
1715    // Do reverse copy.
1716
1717    __ align(CodeEntryAlignment);
1718    StubCodeMark mark(this, "StubRoutines", name);
1719    address start = __ pc();
1720
1721    Label L_skip_alignment, L_skip_alignment2, L_aligned_copy;
1722    Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit;
1723
1724    const Register from      = O0;   // source array address
1725    const Register to        = O1;   // destination array address
1726    const Register count     = O2;   // elements count
1727    const Register end_from  = from; // source array end address
1728    const Register end_to    = to;   // destination array end address
1729
1730    const Register byte_count = O3;  // bytes count to copy
1731
1732    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1733
1734    if (entry != NULL) {
1735      *entry = __ pc();
1736      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1737      BLOCK_COMMENT("Entry:");
1738    }
1739
1740    array_overlap_test(nooverlap_target, 1);
1741
1742    __ sllx(count, LogBytesPerShort, byte_count);
1743    __ add(to, byte_count, end_to);  // offset after last copied element
1744
1745    // for short arrays, just do single element copy
1746    __ cmp(count, 11); // 8 + 3  (22 bytes)
1747    __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes);
1748    __ delayed()->add(from, byte_count, end_from);
1749
1750    {
1751      // Align the ends of the arrays since they may not be aligned even
1752      // when the arrays themselves are aligned.
1753
1754      // copy 1 element if necessary to align 'end_to' on a 4-byte boundary
1755      __ andcc(end_to, 3, G0);
1756      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1757      __ delayed()->lduh(end_from, -2, O3);
1758      __ dec(end_from, 2);
1759      __ dec(end_to, 2);
1760      __ dec(count);
1761      __ sth(O3, end_to, 0);
1762    __ BIND(L_skip_alignment);
1763
1764      // copy 2 elements to align 'end_to' on an 8 byte boundary
1765      __ andcc(end_to, 7, G0);
1766      __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
1767      __ delayed()->lduh(end_from, -2, O3);
1768      __ dec(count, 2);
1769      __ lduh(end_from, -4, O4);
1770      __ dec(end_from, 4);
1771      __ dec(end_to, 4);
1772      __ sth(O3, end_to, 2);
1773      __ sth(O4, end_to, 0);
1774    __ BIND(L_skip_alignment2);
1775    }
1776    if (aligned) {
1777      // Both arrays are aligned to 8 bytes in the 64-bit VM.
1778      // The 'count' is decremented in copy_16_bytes_backward_with_shift()
1779      // in unaligned case.
1780      __ dec(count, 8);
1781    } else {
1782      // Copy with shift 16 bytes per iteration if the arrays do not have
1783      // the same alignment mod 8, otherwise jump to the aligned-copy code
1784      // below (subtracting 8 from 'count' before the jump).
1785      // The compare above (count >= 11) guarantees at least 16 bytes to copy.
1786      // The aligned copy is also jumped over once the copy with shift completes.
1787
1788      copy_16_bytes_backward_with_shift(end_from, end_to, count, 8,
1789                                        L_aligned_copy, L_copy_2_bytes);
1790    }
1791    // copy 4 elements (16 bytes) at a time
1792      __ align(OptoLoopAlignment);
1793    __ BIND(L_aligned_copy);
1794      __ dec(end_from, 16);
1795      __ ldx(end_from, 8, O3);
1796      __ ldx(end_from, 0, O4);
1797      __ dec(end_to, 16);
1798      __ deccc(count, 8);
1799      __ stx(O3, end_to, 8);
1800      __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
1801      __ delayed()->stx(O4, end_to, 0);
1802      __ inc(count, 8);
1803
1804    // copy 1 element (2 bytes) at a time
1805    __ BIND(L_copy_2_bytes);
1806      __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1807    __ BIND(L_copy_2_bytes_loop);
1808      __ dec(end_from, 2);
1809      __ dec(end_to, 2);
1810      __ lduh(end_from, 0, O4);
1811      __ deccc(count);
1812      __ brx(Assembler::greater, false, Assembler::pt, L_copy_2_bytes_loop);
1813      __ delayed()->sth(O4, end_to, 0);
1814
1815    __ BIND(L_exit);
1816    // O3, O4 are used as temp registers
1817    inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4);
1818    __ retl();
1819    __ delayed()->mov(G0, O0); // return 0
1820    return start;
1821  }
1822
1823  //
1824  // Helper methods for generate_disjoint_int_copy_core()
1825  //
1826  void copy_16_bytes_loop(Register from, Register to, Register count, int count_dec,
1827                          Label& L_loop, bool use_prefetch, bool use_bis) {
1828
1829    __ align(OptoLoopAlignment);
1830    __ BIND(L_loop);
1831    if (use_prefetch) {
1832      if (ArraycopySrcPrefetchDistance > 0) {
1833        __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
1834      }
1835      if (ArraycopyDstPrefetchDistance > 0) {
1836        __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
1837      }
1838    }
1839    __ ldx(from, 4, O4);
1840    __ ldx(from, 12, G4);
1841    __ inc(to, 16);
1842    __ inc(from, 16);
1843    __ deccc(count, 4); // Can we do next iteration after this one?
1844
1845    __ srlx(O4, 32, G3);
1846    __ bset(G3, O3);
1847    __ sllx(O4, 32, O4);
1848    __ srlx(G4, 32, G3);
1849    __ bset(G3, O4);
1850    if (use_bis) {
1851      __ stxa(O3, to, -16);
1852      __ stxa(O4, to, -8);
1853    } else {
1854      __ stx(O3, to, -16);
1855      __ stx(O4, to, -8);
1856    }
1857    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
1858    __ delayed()->sllx(G4, 32,  O3);
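        // Each iteration above consumes 16 source bytes: 'from' is expected to
        // be 4 (mod 8) aligned here (the caller only uses this loop when the
        // arrays differ in alignment mod 8), so the ldx at offsets 4 and 12 are
        // aligned 8-byte loads. Every aligned 8-byte store combines the 4 bytes
        // left over from the previous load (kept in the upper half of O3) with
        // the upper half of the current load; this is the "copy with shift"
        // scheme described in generate_disjoint_int_copy_core().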
1859
1860  }
1861
1862  //
1863  //  Generate core code for disjoint int copy (and oop copy on 32-bit).
1864  //  If "aligned" is true, the "from" and "to" addresses are assumed
1865  //  to be heapword aligned.
1866  //
1867  // Arguments:
1868  //      from:  O0
1869  //      to:    O1
1870  //      count: O2 treated as signed
1871  //
1872  void generate_disjoint_int_copy_core(bool aligned) {
1873
1874    Label L_skip_alignment, L_aligned_copy;
1875    Label L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
1876
1877    const Register from      = O0;   // source array address
1878    const Register to        = O1;   // destination array address
1879    const Register count     = O2;   // elements count
1880    const Register offset    = O5;   // offset from start of arrays
1881    // O3, O4, G3, G4 are used as temp registers
1882
1883    // 'aligned' == true when it is known statically during compilation
1884    // of this arraycopy call site that both 'from' and 'to' addresses
1885    // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
1886    //
1887    // Aligned arrays have 4-byte alignment in the 32-bit VM
1888    // and 8-byte alignment in the 64-bit VM.
1889    //
1890    if (!aligned) {
1891      // The next check could be put under 'ifndef' since the code in
1892      // generate_disjoint_long_copy_core() has its own checks and sets 'offset'.
1893
1894      // for short arrays, just do single element copy
1895      __ cmp(count, 5); // 4 + 1 (20 bytes)
1896      __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes);
1897      __ delayed()->mov(G0, offset);
1898
1899      // copy 1 element to align 'to' on an 8 byte boundary
1900      __ andcc(to, 7, G0);
1901      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1902      __ delayed()->ld(from, 0, O3);
1903      __ inc(from, 4);
1904      __ inc(to, 4);
1905      __ dec(count);
1906      __ st(O3, to, -4);
1907    __ BIND(L_skip_alignment);
1908
1909    // if the arrays have the same alignment mod 8, do a 4-element copy
1910      __ andcc(from, 7, G0);
1911      __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
1912      __ delayed()->ld(from, 0, O3);
1913
1914    //
1915    // Load 2 aligned 8-byte chunks and use one from the previous iteration
1916    // to form 2 aligned 8-byte chunks to store.
1917    //
1918    // copy_16_bytes_forward_with_shift() is not used here since this
1919    // code is better tuned for this case.
1920
1921    // copy with shift 4 elements (16 bytes) at a time
1922      __ dec(count, 4);   // The cmp at the beginning guarantees count >= 4
1923      __ sllx(O3, 32,  O3);
1924
1925      disjoint_copy_core(from, to, count, 2, 16, &StubGenerator::copy_16_bytes_loop);
1926
1927      __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
1928      __ delayed()->inc(count, 4); // restore 'count'
1929
1930    __ BIND(L_aligned_copy);
1931    } // !aligned
1932
1933    // copy 4 elements (16 bytes) at a time
1934      __ and3(count, 1, G4); // Save
1935      __ srl(count, 1, count);
1936     generate_disjoint_long_copy_core(aligned);
1937      __ mov(G4, count);     // Restore
1938
1939    // copy 1 element at a time
1940    __ BIND(L_copy_4_bytes);
1941      __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1942    __ BIND(L_copy_4_bytes_loop);
1943      __ ld(from, offset, O3);
1944      __ deccc(count);
1945      __ st(O3, to, offset);
1946      __ brx(Assembler::notZero, false, Assembler::pt, L_copy_4_bytes_loop);
1947      __ delayed()->inc(offset, 4);
1948    __ BIND(L_exit);
1949  }
1950
1951  //
1952  //  Generate stub for disjoint int copy.  If "aligned" is true, the
1953  //  "from" and "to" addresses are assumed to be heapword aligned.
1954  //
1955  // Arguments for generated stub:
1956  //      from:  O0
1957  //      to:    O1
1958  //      count: O2 treated as signed
1959  //
1960  address generate_disjoint_int_copy(bool aligned, address *entry, const char *name) {
1961    __ align(CodeEntryAlignment);
1962    StubCodeMark mark(this, "StubRoutines", name);
1963    address start = __ pc();
1964
1965    const Register count = O2;
1966    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1967
1968    if (entry != NULL) {
1969      *entry = __ pc();
1970      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1971      BLOCK_COMMENT("Entry:");
1972    }
1973
1974    generate_disjoint_int_copy_core(aligned);
1975
1976    // O3, O4 are used as temp registers
1977    inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4);
1978    __ retl();
1979    __ delayed()->mov(G0, O0); // return 0
1980    return start;
1981  }
1982
1983  //
1984  //  Generate core code for conjoint int copy (and oop copy on 32-bit).
1985  //  If "aligned" is true, the "from" and "to" addresses are assumed
1986  //  to be heapword aligned.
1987  //
1988  // Arguments:
1989  //      from:  O0
1990  //      to:    O1
1991  //      count: O2 treated as signed
1992  //
1993  void generate_conjoint_int_copy_core(bool aligned) {
1994    // Do reverse copy.
1995
1996    Label L_skip_alignment, L_aligned_copy;
1997    Label L_copy_16_bytes,  L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
1998
1999    const Register from      = O0;   // source array address
2000    const Register to        = O1;   // destination array address
2001    const Register count     = O2;   // elements count
2002    const Register end_from  = from; // source array end address
2003    const Register end_to    = to;   // destination array end address
2004    // O3, O4, O5, G3 are used as temp registers
2005
2006    const Register byte_count = O3;  // bytes count to copy
2007
2008      __ sllx(count, LogBytesPerInt, byte_count);
2009      __ add(to, byte_count, end_to); // offset after last copied element
2010
2011      __ cmp(count, 5); // for short arrays, just do single element copy
2012      __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes);
2013      __ delayed()->add(from, byte_count, end_from);
2014
2015    // copy 1 element to align 'to' on an 8 byte boundary
2016      __ andcc(end_to, 7, G0);
2017      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
2018      __ delayed()->nop();
2019      __ dec(count);
2020      __ dec(end_from, 4);
2021      __ dec(end_to,   4);
2022      __ ld(end_from, 0, O4);
2023      __ st(O4, end_to, 0);
2024    __ BIND(L_skip_alignment);
2025
2026    // Check if 'end_from' and 'end_to' have the same alignment.
2027      __ andcc(end_from, 7, G0);
2028      __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
2029      __ delayed()->dec(count, 4); // The cmp at the start guarantees count >= 4
2030
2031    // copy with shift 4 elements (16 bytes) at a time
2032    //
2033    // Load 2 aligned 8-byte chunks and use one from the previous iteration
2034    // to form 2 aligned 8-byte chunks to store.
2035    //
2036      __ ldx(end_from, -4, O3);
2037      __ align(OptoLoopAlignment);
2038    __ BIND(L_copy_16_bytes);
2039      __ ldx(end_from, -12, O4);
2040      __ deccc(count, 4);
2041      __ ldx(end_from, -20, O5);
2042      __ dec(end_to, 16);
2043      __ dec(end_from, 16);
2044      __ srlx(O3, 32, O3);
2045      __ sllx(O4, 32, G3);
2046      __ bset(G3, O3);
2047      __ stx(O3, end_to, 8);
2048      __ srlx(O4, 32, O4);
2049      __ sllx(O5, 32, G3);
2050      __ bset(O4, G3);
2051      __ stx(G3, end_to, 0);
2052      __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
2053      __ delayed()->mov(O5, O3);
2054
2055      __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
2056      __ delayed()->inc(count, 4);
2057
2058    // copy 4 elements (16 bytes) at a time
2059      __ align(OptoLoopAlignment);
2060    __ BIND(L_aligned_copy);
2061      __ dec(end_from, 16);
2062      __ ldx(end_from, 8, O3);
2063      __ ldx(end_from, 0, O4);
2064      __ dec(end_to, 16);
2065      __ deccc(count, 4);
2066      __ stx(O3, end_to, 8);
2067      __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
2068      __ delayed()->stx(O4, end_to, 0);
2069      __ inc(count, 4);
2070
2071    // copy 1 element (4 bytes) at a time
2072    __ BIND(L_copy_4_bytes);
2073      __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
2074    __ BIND(L_copy_4_bytes_loop);
2075      __ dec(end_from, 4);
2076      __ dec(end_to, 4);
2077      __ ld(end_from, 0, O4);
2078      __ deccc(count);
2079      __ brx(Assembler::greater, false, Assembler::pt, L_copy_4_bytes_loop);
2080      __ delayed()->st(O4, end_to, 0);
2081    __ BIND(L_exit);
2082  }
2083
2084  //
2085  //  Generate stub for conjoint int copy.  If "aligned" is true, the
2086  //  "from" and "to" addresses are assumed to be heapword aligned.
2087  //
2088  // Arguments for generated stub:
2089  //      from:  O0
2090  //      to:    O1
2091  //      count: O2 treated as signed
2092  //
2093  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
2094                                     address *entry, const char *name) {
2095    __ align(CodeEntryAlignment);
2096    StubCodeMark mark(this, "StubRoutines", name);
2097    address start = __ pc();
2098
2099    assert_clean_int(O2, O3);     // Make sure 'count' is clean int.
2100
2101    if (entry != NULL) {
2102      *entry = __ pc();
2103      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2104      BLOCK_COMMENT("Entry:");
2105    }
2106
2107    array_overlap_test(nooverlap_target, 2);
2108
2109    generate_conjoint_int_copy_core(aligned);
2110
2111    // O3, O4 are used as temp registers
2112    inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4);
2113    __ retl();
2114    __ delayed()->mov(G0, O0); // return 0
2115    return start;
2116  }
2117
2118  //
2119  // Helper methods for generate_disjoint_long_copy_core()
2120  //
2121  void copy_64_bytes_loop(Register from, Register to, Register count, int count_dec,
2122                          Label& L_loop, bool use_prefetch, bool use_bis) {
2123    __ align(OptoLoopAlignment);
2124    __ BIND(L_loop);
2125    for (int off = 0; off < 64; off += 16) {
2126      if (use_prefetch && (off & 31) == 0) {
2127        if (ArraycopySrcPrefetchDistance > 0) {
2128          __ prefetch(from, ArraycopySrcPrefetchDistance+off, Assembler::severalReads);
2129        }
2130        if (ArraycopyDstPrefetchDistance > 0) {
2131          __ prefetch(to, ArraycopyDstPrefetchDistance+off, Assembler::severalWritesAndPossiblyReads);
2132        }
2133      }
2134      __ ldx(from,  off+0, O4);
2135      __ ldx(from,  off+8, O5);
2136      if (use_bis) {
2137        __ stxa(O4, to,  off+0);
2138        __ stxa(O5, to,  off+8);
2139      } else {
2140        __ stx(O4, to,  off+0);
2141        __ stx(O5, to,  off+8);
2142      }
2143    }
2144    __ deccc(count, 8);
2145    __ inc(from, 64);
2146    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
2147    __ delayed()->inc(to, 64);
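        // Each iteration above moves 64 bytes (8 long elements, hence the
        // deccc by 8). When 'use_bis' is set, the stxa stores presumably go
        // through a block-initializing ASI set up by the caller (see
        // disjoint_copy_core), so destination cache lines can be written
        // without being read from memory first; plain stx is the fallback.
        // The prefetch distances are taken from the ArraycopySrcPrefetchDistance
        // and ArraycopyDstPrefetchDistance flags tested above.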
2148  }
2149
2150  //
2151  //  Generate core code for disjoint long copy (and oop copy on 64-bit).
2152  //  "aligned" is ignored, because we must make the stronger
2153  //  assumption that both addresses are always 64-bit aligned.
2154  //
2155  // Arguments:
2156  //      from:  O0
2157  //      to:    O1
2158  //      count: O2 treated as signed
2159  //
2160  // count -= 2;
2161  // if ( count >= 0 ) { // >= 2 elements
2162  //   if ( count >= 6) { // >= 8 elements
2163  //     count -= 6; // original count - 8
2164  //     do {
2165  //       copy_8_elements;
2166  //       count -= 8;
2167  //     } while ( count >= 0 );
2168  //     count += 6;
2169  //   }
2170  //   if ( count >= 0 ) { // >= 2 elements
2171  //     do {
2172  //       copy_2_elements;
2173  //     } while ( (count=count-2) >= 0 );
2174  //   }
2175  // }
2176  // count += 2;
2177  // if ( count != 0 ) { // 1 element left
2178  //   copy_1_element;
2179  // }
2180  //
2181  void generate_disjoint_long_copy_core(bool aligned) {
2182    Label L_copy_8_bytes, L_copy_16_bytes, L_exit;
2183    const Register from    = O0;  // source array address
2184    const Register to      = O1;  // destination array address
2185    const Register count   = O2;  // elements count
2186    const Register offset0 = O4;  // element offset
2187    const Register offset8 = O5;  // next element offset
2188
2189    __ deccc(count, 2);
2190    __ mov(G0, offset0);   // offset from start of arrays (0)
2191    __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
2192    __ delayed()->add(offset0, 8, offset8);
2193
2194    // Copy in 64-byte chunks
2195
2196    const Register from64 = O3;  // source address
2197    const Register to64   = G3;  // destination address
2198    __ subcc(count, 6, O3);
2199    __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes );
2200    __ delayed()->mov(to,   to64);
2201    // Now we can use O4(offset0), O5(offset8) as temps
2202    __ mov(O3, count);
2203    // count >= 0 (original count - 8)
2204    __ mov(from, from64);
2205
2206    disjoint_copy_core(from64, to64, count, 3, 64, &StubGenerator::copy_64_bytes_loop);
2207
2208      // Restore O4(offset0), O5(offset8)
2209      __ sub(from64, from, offset0);
2210      __ inccc(count, 6); // restore count
2211      __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
2212      __ delayed()->add(offset0, 8, offset8);
2213
2214      // Copy in 16-byte chunks
2215      __ align(OptoLoopAlignment);
2216    __ BIND(L_copy_16_bytes);
2217      __ ldx(from, offset0, O3);
2218      __ ldx(from, offset8, G3);
2219      __ deccc(count, 2);
2220      __ stx(O3, to, offset0);
2221      __ inc(offset0, 16);
2222      __ stx(G3, to, offset8);
2223      __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
2224      __ delayed()->inc(offset8, 16);
2225
2226      // Copy last 8 bytes
2227    __ BIND(L_copy_8_bytes);
2228      __ inccc(count, 2);
2229      __ brx(Assembler::zero, true, Assembler::pn, L_exit );
2230      __ delayed()->mov(offset0, offset8); // Set O5 used by other stubs
2231      __ ldx(from, offset0, O3);
2232      __ stx(O3, to, offset0);
2233    __ BIND(L_exit);
2234  }
2235
2236  //
2237  //  Generate stub for disjoint long copy.
2238  //  "aligned" is ignored, because we must make the stronger
2239  //  assumption that both addresses are always 64-bit aligned.
2240  //
2241  // Arguments for generated stub:
2242  //      from:  O0
2243  //      to:    O1
2244  //      count: O2 treated as signed
2245  //
2246  address generate_disjoint_long_copy(bool aligned, address *entry, const char *name) {
2247    __ align(CodeEntryAlignment);
2248    StubCodeMark mark(this, "StubRoutines", name);
2249    address start = __ pc();
2250
2251    assert_clean_int(O2, O3);     // Make sure 'count' is clean int.
2252
2253    if (entry != NULL) {
2254      *entry = __ pc();
2255      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2256      BLOCK_COMMENT("Entry:");
2257    }
2258
2259    generate_disjoint_long_copy_core(aligned);
2260
2261    // O3, O4 are used as temp registers
2262    inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4);
2263    __ retl();
2264    __ delayed()->mov(G0, O0); // return 0
2265    return start;
2266  }
2267
2268  //
2269  //  Generate core code for conjoint long copy (and oop copy on 64-bit).
2270  //  "aligned" is ignored, because we must make the stronger
2271  //  assumption that both addresses are always 64-bit aligned.
2272  //
2273  // Arguments:
2274  //      from:  O0
2275  //      to:    O1
2276  //      count: O2 treated as signed
2277  //
2278  void generate_conjoint_long_copy_core(bool aligned) {
2279    // Do reverse copy.
2280    Label L_copy_8_bytes, L_copy_16_bytes, L_exit;
2281    const Register from    = O0;  // source array address
2282    const Register to      = O1;  // destination array address
2283    const Register count   = O2;  // elements count
2284    const Register offset8 = O4;  // element offset
2285    const Register offset0 = O5;  // previous element offset
2286
2287      __ subcc(count, 1, count);
2288      __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_8_bytes );
2289      __ delayed()->sllx(count, LogBytesPerLong, offset8);
2290      __ sub(offset8, 8, offset0);
2291      __ align(OptoLoopAlignment);
2292    __ BIND(L_copy_16_bytes);
2293      __ ldx(from, offset8, O2);
2294      __ ldx(from, offset0, O3);
2295      __ stx(O2, to, offset8);
2296      __ deccc(offset8, 16);      // use offset8 as counter
2297      __ stx(O3, to, offset0);
2298      __ brx(Assembler::greater, false, Assembler::pt, L_copy_16_bytes);
2299      __ delayed()->dec(offset0, 16);
2300
2301    __ BIND(L_copy_8_bytes);
2302      __ brx(Assembler::negative, false, Assembler::pn, L_exit );
2303      __ delayed()->nop();
2304      __ ldx(from, 0, O3);
2305      __ stx(O3, to, 0);
2306    __ BIND(L_exit);
2307  }
2308
2309  //  Generate stub for conjoint long copy.
2310  //  "aligned" is ignored, because we must make the stronger
2311  //  assumption that both addresses are always 64-bit aligned.
2312  //
2313  // Arguments for generated stub:
2314  //      from:  O0
2315  //      to:    O1
2316  //      count: O2 treated as signed
2317  //
2318  address generate_conjoint_long_copy(bool aligned, address nooverlap_target,
2319                                      address *entry, const char *name) {
2320    __ align(CodeEntryAlignment);
2321    StubCodeMark mark(this, "StubRoutines", name);
2322    address start = __ pc();
2323
2324    assert(aligned, "Should always be aligned");
2325
2326    assert_clean_int(O2, O3);     // Make sure 'count' is clean int.
2327
2328    if (entry != NULL) {
2329      *entry = __ pc();
2330      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2331      BLOCK_COMMENT("Entry:");
2332    }
2333
2334    array_overlap_test(nooverlap_target, 3);
2335
2336    generate_conjoint_long_copy_core(aligned);
2337
2338    // O3, O4 are used as temp registers
2339    inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4);
2340    __ retl();
2341    __ delayed()->mov(G0, O0); // return 0
2342    return start;
2343  }
2344
2345  //  Generate stub for disjoint oop copy.  If "aligned" is true, the
2346  //  "from" and "to" addresses are assumed to be heapword aligned.
2347  //
2348  // Arguments for generated stub:
2349  //      from:  O0
2350  //      to:    O1
2351  //      count: O2 treated as signed
2352  //
2353  address generate_disjoint_oop_copy(bool aligned, address *entry, const char *name,
2354                                     bool dest_uninitialized = false) {
2355
2356    const Register from  = O0;  // source array address
2357    const Register to    = O1;  // destination array address
2358    const Register count = O2;  // elements count
2359
2360    __ align(CodeEntryAlignment);
2361    StubCodeMark mark(this, "StubRoutines", name);
2362    address start = __ pc();
2363
2364    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
2365
2366    if (entry != NULL) {
2367      *entry = __ pc();
2368      // caller can pass a 64-bit byte count here
2369      BLOCK_COMMENT("Entry:");
2370    }
2371
2372    // save arguments for barrier generation
2373    __ mov(to, G1);
2374    __ mov(count, G5);
2375    gen_write_ref_array_pre_barrier(G1, G5, dest_uninitialized);
2376    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
2377    if (UseCompressedOops) {
2378      generate_disjoint_int_copy_core(aligned);
2379    } else {
2380      generate_disjoint_long_copy_core(aligned);
2381    }
2382    // O0 is used as temp register
2383    gen_write_ref_array_post_barrier(G1, G5, O0);
2384
2385    // O3, O4 are used as temp registers
2386    inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4);
2387    __ retl();
2388    __ delayed()->mov(G0, O0); // return 0
2389    return start;
2390  }
2391
2392  //  Generate stub for conjoint oop copy.  If "aligned" is true, the
2393  //  "from" and "to" addresses are assumed to be heapword aligned.
2394  //
2395  // Arguments for generated stub:
2396  //      from:  O0
2397  //      to:    O1
2398  //      count: O2 treated as signed
2399  //
2400  address generate_conjoint_oop_copy(bool aligned, address nooverlap_target,
2401                                     address *entry, const char *name,
2402                                     bool dest_uninitialized = false) {
2403
2404    const Register from  = O0;  // source array address
2405    const Register to    = O1;  // destination array address
2406    const Register count = O2;  // elements count
2407
2408    __ align(CodeEntryAlignment);
2409    StubCodeMark mark(this, "StubRoutines", name);
2410    address start = __ pc();
2411
2412    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
2413
2414    if (entry != NULL) {
2415      *entry = __ pc();
2416      // caller can pass a 64-bit byte count here
2417      BLOCK_COMMENT("Entry:");
2418    }
2419
2420    array_overlap_test(nooverlap_target, LogBytesPerHeapOop);
2421
2422    // save arguments for barrier generation
2423    __ mov(to, G1);
2424    __ mov(count, G5);
2425    gen_write_ref_array_pre_barrier(G1, G5, dest_uninitialized);
2426
2427    if (UseCompressedOops) {
2428      generate_conjoint_int_copy_core(aligned);
2429    } else {
2430      generate_conjoint_long_copy_core(aligned);
2431    }
2432
2433    // O0 is used as temp register
2434    gen_write_ref_array_post_barrier(G1, G5, O0);
2435
2436    // O3, O4 are used as temp registers
2437    inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4);
2438    __ retl();
2439    __ delayed()->mov(G0, O0); // return 0
2440    return start;
2441  }
2442
2443
2444  // Helper for generating a dynamic type check.
2445  // Smashes only the given temp registers.
2446  void generate_type_check(Register sub_klass,
2447                           Register super_check_offset,
2448                           Register super_klass,
2449                           Register temp,
2450                           Label& L_success) {
2451    assert_different_registers(sub_klass, super_check_offset, super_klass, temp);
2452
2453    BLOCK_COMMENT("type_check:");
2454
2455    Label L_miss, L_pop_to_miss;
2456
2457    assert_clean_int(super_check_offset, temp);
2458
2459    __ check_klass_subtype_fast_path(sub_klass, super_klass, temp, noreg,
2460                                     &L_success, &L_miss, NULL,
2461                                     super_check_offset);
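        // The fast path above consults the super type cache and primary supers
        // via 'super_check_offset': a definite hit branches to L_success, a
        // definite miss to L_miss, and only an inconclusive result falls
        // through to the slow path below, which walks the secondary supers
        // inside a saved frame so L0..L4 are free as scratch registers.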
2462
2463    BLOCK_COMMENT("type_check_slow_path:");
2464    __ save_frame(0);
2465    __ check_klass_subtype_slow_path(sub_klass->after_save(),
2466                                     super_klass->after_save(),
2467                                     L0, L1, L2, L4,
2468                                     NULL, &L_pop_to_miss);
2469    __ ba(L_success);
2470    __ delayed()->restore();
2471
2472    __ bind(L_pop_to_miss);
2473    __ restore();
2474
2475    // Fall through on failure!
2476    __ BIND(L_miss);
2477  }
2478
2479
2480  //  Generate stub for checked oop copy.
2481  //
2482  // Arguments for generated stub:
2483  //      from:  O0
2484  //      to:    O1
2485  //      count: O2 treated as signed
2486  //      ckoff: O3 (super_check_offset)
2487  //      ckval: O4 (super_klass)
2488  //      ret:   O0 zero for success; (-1^K) where K is partial transfer count
2489  //
2490  address generate_checkcast_copy(const char *name, address *entry, bool dest_uninitialized = false) {
2491
2492    const Register O0_from   = O0;      // source array address
2493    const Register O1_to     = O1;      // destination array address
2494    const Register O2_count  = O2;      // elements count
2495    const Register O3_ckoff  = O3;      // super_check_offset
2496    const Register O4_ckval  = O4;      // super_klass
2497
2498    const Register O5_offset = O5;      // loop var, with stride wordSize
2499    const Register G1_remain = G1;      // loop var, with stride -1
2500    const Register G3_oop    = G3;      // actual oop copied
2501    const Register G4_klass  = G4;      // oop._klass
2502    const Register G5_super  = G5;      // oop._klass._primary_supers[ckval]
2503
2504    __ align(CodeEntryAlignment);
2505    StubCodeMark mark(this, "StubRoutines", name);
2506    address start = __ pc();
2507
2508#ifdef ASSERT
2509    // We sometimes save a frame (see the generate_type_check call below).
2510    // If this will cause trouble, let's fail now instead of later.
2511    __ save_frame(0);
2512    __ restore();
2513#endif
2514
2515    assert_clean_int(O2_count, G1);     // Make sure 'count' is clean int.
2516
2517#ifdef ASSERT
2518    // caller guarantees that the arrays really are different
2519    // otherwise, we would have to make conjoint checks
2520    { Label L;
2521      __ mov(O3, G1);           // spill: overlap test smashes O3
2522      __ mov(O4, G4);           // spill: overlap test smashes O4
2523      array_overlap_test(L, LogBytesPerHeapOop);
2524      __ stop("checkcast_copy within a single array");
2525      __ bind(L);
2526      __ mov(G1, O3);
2527      __ mov(G4, O4);
2528    }
2529#endif //ASSERT
2530
2531    if (entry != NULL) {
2532      *entry = __ pc();
2533      // caller can pass a 64-bit byte count here (from generic stub)
2534      BLOCK_COMMENT("Entry:");
2535    }
2536    gen_write_ref_array_pre_barrier(O1_to, O2_count, dest_uninitialized);
2537
2538    Label load_element, store_element, do_card_marks, fail, done;
2539    __ addcc(O2_count, 0, G1_remain);   // initialize loop index, and test it
2540    __ brx(Assembler::notZero, false, Assembler::pt, load_element);
2541    __ delayed()->mov(G0, O5_offset);   // offset from start of arrays
2542
2543    // Empty array:  Nothing to do.
2544    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4);
2545    __ retl();
2546    __ delayed()->set(0, O0);           // return 0 on (trivial) success
2547
2548    // ======== begin loop ========
2549    // (Loop is rotated; its entry is load_element.)
2550    // Loop variables:
2551    //   (O5 = 0; ; O5 += wordSize) --- offset from src, dest arrays
2552    //   (O2 = len; O2 != 0; O2--) --- number of oops *remaining*
2553    //   G3, G4, G5 --- current oop, oop.klass, oop.klass.super
2554    __ align(OptoLoopAlignment);
2555
2556    __ BIND(store_element);
2557    __ deccc(G1_remain);                // decrement the count
2558    __ store_heap_oop(G3_oop, O1_to, O5_offset); // store the oop
2559    __ inc(O5_offset, heapOopSize);     // step to next offset
2560    __ brx(Assembler::zero, true, Assembler::pt, do_card_marks);
2561    __ delayed()->set(0, O0);           // return 0 on success
2562
2563    // ======== loop entry is here ========
2564    __ BIND(load_element);
2565    __ load_heap_oop(O0_from, O5_offset, G3_oop);  // load the oop
2566    __ br_null_short(G3_oop, Assembler::pt, store_element);
2567
2568    __ load_klass(G3_oop, G4_klass); // query the object klass
2569
2570    generate_type_check(G4_klass, O3_ckoff, O4_ckval, G5_super,
2571                        // branch to this on success:
2572                        store_element);
2573    // ======== end loop ========
2574
2575    // It was a real error; we must depend on the caller to finish the job.
2576    // Register G1 has number of *remaining* oops, O2 number of *total* oops.
2577    // Emit GC store barriers for the oops we have copied (O2 minus G1),
2578    // and report their number to the caller.
2579    __ BIND(fail);
2580    __ subcc(O2_count, G1_remain, O2_count);
2581    __ brx(Assembler::zero, false, Assembler::pt, done);
2582    __ delayed()->not1(O2_count, O0);   // report (-1^K) to caller
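        // Worked example of the failure encoding (illustrative numbers): if 3
        // of 10 oops were stored before a type check failed, G1_remain is 7,
        // the subcc above leaves O2_count == 3, and not1 returns ~3 == -4,
        // i.e. (-1^3); the caller recovers the partial transfer count as ~O0.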
2583
2584    __ BIND(do_card_marks);
2585    gen_write_ref_array_post_barrier(O1_to, O2_count, O3);   // store check on O1[0..O2]
2586
2587    __ BIND(done);
2588    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4);
2589    __ retl();
2590    __ delayed()->nop();             // return value in O0
2591
2592    return start;
2593  }
2594
2595
2596  //  Generate 'unsafe' array copy stub
2597  //  Though just as safe as the other stubs, it takes an unscaled
2598  //  size_t argument instead of an element count.
2599  //
2600  // Arguments for generated stub:
2601  //      from:  O0
2602  //      to:    O1
2603  //      count: O2 byte count, treated as ssize_t, can be zero
2604  //
2605  // Examines the alignment of the operands and dispatches
2606  // to a long, int, short, or byte copy loop.
2607  //
2608  address generate_unsafe_copy(const char* name,
2609                               address byte_copy_entry,
2610                               address short_copy_entry,
2611                               address int_copy_entry,
2612                               address long_copy_entry) {
2613
2614    const Register O0_from   = O0;      // source array address
2615    const Register O1_to     = O1;      // destination array address
2616    const Register O2_count  = O2;      // elements count
2617
2618    const Register G1_bits   = G1;      // test copy of low bits
2619
2620    __ align(CodeEntryAlignment);
2621    StubCodeMark mark(this, "StubRoutines", name);
2622    address start = __ pc();
2623
2624    // bump this on entry, not on exit:
2625    inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, G1, G3);
2626
2627    __ or3(O0_from, O1_to, G1_bits);
2628    __ or3(O2_count,       G1_bits, G1_bits);
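        // G1_bits now holds (from | to | byte_count); testing its low bits
        // below finds the widest element size that all three values share.
        // For example, if every operand is a multiple of 8, the long copy
        // entry is taken and the byte count is scaled by srax(3); a set bit 0
        // in any operand falls all the way through to the byte copy.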
2629
2630    __ btst(BytesPerLong-1, G1_bits);
2631    __ br(Assembler::zero, true, Assembler::pt,
2632          long_copy_entry, relocInfo::runtime_call_type);
2633    // scale the count on the way out:
2634    __ delayed()->srax(O2_count, LogBytesPerLong, O2_count);
2635
2636    __ btst(BytesPerInt-1, G1_bits);
2637    __ br(Assembler::zero, true, Assembler::pt,
2638          int_copy_entry, relocInfo::runtime_call_type);
2639    // scale the count on the way out:
2640    __ delayed()->srax(O2_count, LogBytesPerInt, O2_count);
2641
2642    __ btst(BytesPerShort-1, G1_bits);
2643    __ br(Assembler::zero, true, Assembler::pt,
2644          short_copy_entry, relocInfo::runtime_call_type);
2645    // scale the count on the way out:
2646    __ delayed()->srax(O2_count, LogBytesPerShort, O2_count);
2647
2648    __ br(Assembler::always, false, Assembler::pt,
2649          byte_copy_entry, relocInfo::runtime_call_type);
2650    __ delayed()->nop();
2651
2652    return start;
2653  }
2654
2655
2656  // Perform range checks on the proposed arraycopy.
2657  // Kills the two temps, but nothing else.
2658  // Also, clean the sign bits of src_pos and dst_pos.
2659  void arraycopy_range_checks(Register src,     // source array oop (O0)
2660                              Register src_pos, // source position (O1)
2661                              Register dst,     // destination array oop (O2)
2662                              Register dst_pos, // destination position (O3)
2663                              Register length,  // length of copy (O4)
2664                              Register temp1, Register temp2,
2665                              Label& L_failed) {
2666    BLOCK_COMMENT("arraycopy_range_checks:");
2667
2668    //  if (src_pos + length > arrayOop(src)->length() ) FAIL;
2669
2670    const Register array_length = temp1;  // scratch
2671    const Register end_pos      = temp2;  // scratch
2672
2673    // Note:  This next instruction may be in the delay slot of a branch:
2674    __ add(length, src_pos, end_pos);  // src_pos + length
2675    __ lduw(src, arrayOopDesc::length_offset_in_bytes(), array_length);
2676    __ cmp(end_pos, array_length);
2677    __ br(Assembler::greater, false, Assembler::pn, L_failed);
2678
2679    //  if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
2680    __ delayed()->add(length, dst_pos, end_pos); // dst_pos + length
2681    __ lduw(dst, arrayOopDesc::length_offset_in_bytes(), array_length);
2682    __ cmp(end_pos, array_length);
2683    __ br(Assembler::greater, false, Assembler::pn, L_failed);
2684
2685    // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
2686    // Move with sign extension can be used since they are positive.
2687    __ delayed()->signx(src_pos, src_pos);
2688    __ signx(dst_pos, dst_pos);
2689
2690    BLOCK_COMMENT("arraycopy_range_checks done");
2691  }
2692
2693
2694  //
2695  //  Generate generic array copy stubs
2696  //
2697  //  Input:
2698  //    O0    -  src oop
2699  //    O1    -  src_pos
2700  //    O2    -  dst oop
2701  //    O3    -  dst_pos
2702  //    O4    -  element count
2703  //
2704  //  Output:
2705  //    O0 ==  0  -  success
2706  //    O0 == -1  -  need to call System.arraycopy
2707  //
2708  address generate_generic_copy(const char *name,
2709                                address entry_jbyte_arraycopy,
2710                                address entry_jshort_arraycopy,
2711                                address entry_jint_arraycopy,
2712                                address entry_oop_arraycopy,
2713                                address entry_jlong_arraycopy,
2714                                address entry_checkcast_arraycopy) {
2715    Label L_failed, L_objArray;
2716
2717    // Input registers
2718    const Register src      = O0;  // source array oop
2719    const Register src_pos  = O1;  // source position
2720    const Register dst      = O2;  // destination array oop
2721    const Register dst_pos  = O3;  // destination position
2722    const Register length   = O4;  // elements count
2723
2724    // registers used as temp
2725    const Register G3_src_klass = G3; // source array klass
2726    const Register G4_dst_klass = G4; // destination array klass
2727    const Register G5_lh        = G5; // layout handler
2728    const Register O5_temp      = O5;
2729
2730    __ align(CodeEntryAlignment);
2731    StubCodeMark mark(this, "StubRoutines", name);
2732    address start = __ pc();
2733
2734    // bump this on entry, not on exit:
2735    inc_counter_np(SharedRuntime::_generic_array_copy_ctr, G1, G3);
2736
2737    // In principle, the int arguments could be dirty.
2738    //assert_clean_int(src_pos, G1);
2739    //assert_clean_int(dst_pos, G1);
2740    //assert_clean_int(length, G1);
2741
2742    //-----------------------------------------------------------------------
2743    // Assembler stubs will be used for this call to arraycopy
2744    // if the following conditions are met:
2745    //
2746    // (1) src and dst must not be null.
2747    // (2) src_pos must not be negative.
2748    // (3) dst_pos must not be negative.
2749    // (4) length  must not be negative.
2750    // (5) src klass and dst klass should be the same and not NULL.
2751    // (6) src and dst should be arrays.
2752    // (7) src_pos + length must not exceed length of src.
2753    // (8) dst_pos + length must not exceed length of dst.
2754    BLOCK_COMMENT("arraycopy initial argument checks");
2755
2756    //  if (src == NULL) return -1;
2757    __ br_null(src, false, Assembler::pn, L_failed);
2758
2759    //  if (src_pos < 0) return -1;
2760    __ delayed()->tst(src_pos);
2761    __ br(Assembler::negative, false, Assembler::pn, L_failed);
2762    __ delayed()->nop();
2763
2764    //  if (dst == NULL) return -1;
2765    __ br_null(dst, false, Assembler::pn, L_failed);
2766
2767    //  if (dst_pos < 0) return -1;
2768    __ delayed()->tst(dst_pos);
2769    __ br(Assembler::negative, false, Assembler::pn, L_failed);
2770
2771    //  if (length < 0) return -1;
2772    __ delayed()->tst(length);
2773    __ br(Assembler::negative, false, Assembler::pn, L_failed);
2774
2775    BLOCK_COMMENT("arraycopy argument klass checks");
2776    //  get src->klass()
2777    if (UseCompressedClassPointers) {
2778      __ delayed()->nop(); // ??? not good
2779      __ load_klass(src, G3_src_klass);
2780    } else {
2781      __ delayed()->ld_ptr(src, oopDesc::klass_offset_in_bytes(), G3_src_klass);
2782    }
2783
2784#ifdef ASSERT
2785    //  assert(src->klass() != NULL);
2786    BLOCK_COMMENT("assert klasses not null");
2787    { Label L_a, L_b;
2788      __ br_notnull_short(G3_src_klass, Assembler::pt, L_b); // it is broken if klass is NULL
2789      __ bind(L_a);
2790      __ stop("broken null klass");
2791      __ bind(L_b);
2792      __ load_klass(dst, G4_dst_klass);
2793      __ br_null(G4_dst_klass, false, Assembler::pn, L_a); // this would be broken also
2794      __ delayed()->mov(G0, G4_dst_klass);      // scribble the temp
2795      BLOCK_COMMENT("assert done");
2796    }
2797#endif
2798
2799    // Load layout helper
2800    //
2801    //  |array_tag|     | header_size | element_type |     |log2_element_size|
2802    // 32        30    24            16              8     2                 0
2803    //
2804    //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2805    //
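        //   For example, a jint[] array klass is expected to have array_tag 0x3,
        //   element_type T_INT and log2_element_size 2 in its layout helper;
        //   the header_size field gives the byte offset of element 0.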
2806
2807    int lh_offset = in_bytes(Klass::layout_helper_offset());
2808
2809    // Load 32-bits signed value. Use br() instruction with it to check icc.
2810    __ lduw(G3_src_klass, lh_offset, G5_lh);
2811
2812    if (UseCompressedClassPointers) {
2813      __ load_klass(dst, G4_dst_klass);
2814    }
2815    // Handle objArrays completely differently...
2816    juint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2817    __ set(objArray_lh, O5_temp);
2818    __ cmp(G5_lh,       O5_temp);
2819    __ br(Assembler::equal, false, Assembler::pt, L_objArray);
2820    if (UseCompressedClassPointers) {
2821      __ delayed()->nop();
2822    } else {
2823      __ delayed()->ld_ptr(dst, oopDesc::klass_offset_in_bytes(), G4_dst_klass);
2824    }
2825
2826    //  if (src->klass() != dst->klass()) return -1;
2827    __ cmp_and_brx_short(G3_src_klass, G4_dst_klass, Assembler::notEqual, Assembler::pn, L_failed);
2828
2829    //  if (!src->is_Array()) return -1;
2830    __ cmp(G5_lh, Klass::_lh_neutral_value); // < 0
2831    __ br(Assembler::greaterEqual, false, Assembler::pn, L_failed);
2832
2833    // At this point, it is known to be a typeArray (array_tag 0x3).
2834#ifdef ASSERT
2835    __ delayed()->nop();
2836    { Label L;
2837      jint lh_prim_tag_in_place = (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2838      __ set(lh_prim_tag_in_place, O5_temp);
2839      __ cmp(G5_lh,                O5_temp);
2840      __ br(Assembler::greaterEqual, false, Assembler::pt, L);
2841      __ delayed()->nop();
2842      __ stop("must be a primitive array");
2843      __ bind(L);
2844    }
2845#else
2846    __ delayed();                               // match next insn to prev branch
2847#endif
2848
2849    arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2850                           O5_temp, G4_dst_klass, L_failed);
2851
2852    // TypeArrayKlass
2853    //
2854    // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2855    // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2856    //
2857
2858    const Register G4_offset = G4_dst_klass;    // array offset
2859    const Register G3_elsize = G3_src_klass;    // log2 element size
2860
2861    __ srl(G5_lh, Klass::_lh_header_size_shift, G4_offset);
2862    __ and3(G4_offset, Klass::_lh_header_size_mask, G4_offset); // array_offset
2863    __ add(src, G4_offset, src);       // src array offset
2864    __ add(dst, G4_offset, dst);       // dst array offset
2865    __ and3(G5_lh, Klass::_lh_log2_element_size_mask, G3_elsize); // log2 element size
2866
2867    // next registers should be set before the jump to corresponding stub
2868    const Register from     = O0;  // source array address
2869    const Register to       = O1;  // destination array address
2870    const Register count    = O2;  // elements count
2871
2872    // 'from', 'to', 'count' registers should be set in this order
2873    // since they are the same as 'src', 'src_pos', 'dst'.
2874
2875    BLOCK_COMMENT("scale indexes to element size");
2876    __ sll_ptr(src_pos, G3_elsize, src_pos);
2877    __ sll_ptr(dst_pos, G3_elsize, dst_pos);
2878    __ add(src, src_pos, from);       // src_addr
2879    __ add(dst, dst_pos, to);         // dst_addr
2880
2881    BLOCK_COMMENT("choose copy loop based on element size");
2882    __ cmp(G3_elsize, 0);
2883    __ br(Assembler::equal, true, Assembler::pt, entry_jbyte_arraycopy);
2884    __ delayed()->signx(length, count); // length
2885
2886    __ cmp(G3_elsize, LogBytesPerShort);
2887    __ br(Assembler::equal, true, Assembler::pt, entry_jshort_arraycopy);
2888    __ delayed()->signx(length, count); // length
2889
2890    __ cmp(G3_elsize, LogBytesPerInt);
2891    __ br(Assembler::equal, true, Assembler::pt, entry_jint_arraycopy);
2892    __ delayed()->signx(length, count); // length
2893#ifdef ASSERT
2894    { Label L;
2895      __ cmp_and_br_short(G3_elsize, LogBytesPerLong, Assembler::equal, Assembler::pt, L);
2896      __ stop("must be long copy, but elsize is wrong");
2897      __ bind(L);
2898    }
2899#endif
2900    __ br(Assembler::always, false, Assembler::pt, entry_jlong_arraycopy);
2901    __ delayed()->signx(length, count); // length
2902
2903    // ObjArrayKlass
2904  __ BIND(L_objArray);
2905    // live at this point:  G3_src_klass, G4_dst_klass, src[_pos], dst[_pos], length
2906
2907    Label L_plain_copy, L_checkcast_copy;
2908    //  test array classes for subtyping
2909    __ cmp(G3_src_klass, G4_dst_klass);         // usual case is exact equality
2910    __ brx(Assembler::notEqual, true, Assembler::pn, L_checkcast_copy);
2911    __ delayed()->lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted from below
2912
2913    // Identically typed arrays can be copied without element-wise checks.
2914    arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2915                           O5_temp, G5_lh, L_failed);
2916
2917    __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset
2918    __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset
2919    __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos);
2920    __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos);
2921    __ add(src, src_pos, from);       // src_addr
2922    __ add(dst, dst_pos, to);         // dst_addr
2923  __ BIND(L_plain_copy);
2924    __ br(Assembler::always, false, Assembler::pt, entry_oop_arraycopy);
2925    __ delayed()->signx(length, count); // length
2926
2927  __ BIND(L_checkcast_copy);
2928    // live at this point:  G3_src_klass, G4_dst_klass
2929    {
2930      // Before looking at dst.length, make sure dst is also an objArray.
2931      // lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted to delay slot
2932      __ cmp(G5_lh,                    O5_temp);
2933      __ br(Assembler::notEqual, false, Assembler::pn, L_failed);
2934
2935      // It is safe to examine both src.length and dst.length.
2936      __ delayed();                             // match next insn to prev branch
2937      arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2938                             O5_temp, G5_lh, L_failed);
2939
2940      // Marshal the base address arguments now, freeing registers.
2941      __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset
2942      __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset
2943      __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos);
2944      __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos);
2945      __ add(src, src_pos, from);               // src_addr
2946      __ add(dst, dst_pos, to);                 // dst_addr
2947      __ signx(length, count);                  // length (reloaded)
2948
2949      Register sco_temp = O3;                   // this register is free now
2950      assert_different_registers(from, to, count, sco_temp,
2951                                 G4_dst_klass, G3_src_klass);
2952
2953      // Generate the type check.
2954      int sco_offset = in_bytes(Klass::super_check_offset_offset());
2955      __ lduw(G4_dst_klass, sco_offset, sco_temp);
2956      generate_type_check(G3_src_klass, sco_temp, G4_dst_klass,
2957                          O5_temp, L_plain_copy);
2958
2959      // Fetch destination element klass from the ObjArrayKlass header.
2960      int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2961
2962      // the checkcast_copy loop needs two extra arguments:
2963      __ ld_ptr(G4_dst_klass, ek_offset, O4);   // dest elem klass
2964      // lduw(O4, sco_offset, O3);              // sco of elem klass
2965
2966      __ br(Assembler::always, false, Assembler::pt, entry_checkcast_arraycopy);
2967      __ delayed()->lduw(O4, sco_offset, O3);
2968    }
2969
2970  __ BIND(L_failed);
2971    __ retl();
2972    __ delayed()->sub(G0, 1, O0); // return -1
2973    return start;
2974  }
2975
2976  //
2977  //  Generate stub for heap zeroing.
2978  //  "to" address is aligned to jlong (8 bytes).
2979  //
2980  // Arguments for generated stub:
2981  //      to:    O0
2982  //      count: O1 treated as signed (count of HeapWords)
2983  //             count could be 0
2984  //
2985  address generate_zero_aligned_words(const char* name) {
2986    __ align(CodeEntryAlignment);
2987    StubCodeMark mark(this, "StubRoutines", name);
2988    address start = __ pc();
2989
2990    const Register to    = O0;   // destination address (to be zeroed)
2991    const Register count = O1;   // HeapWords count
2992    const Register temp  = O2;   // scratch
2993
2994    Label Ldone;
2995    __ sllx(count, LogHeapWordSize, count); // to bytes count
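    // (Reading aid: on a 64-bit VM a HeapWord is 8 bytes (LogHeapWordSize == 3),
    //  so e.g. a count of 16 HeapWords becomes 16 << 3 = 128 bytes for bis_zeroing.)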
2996    // Use BIS for zeroing
2997    __ bis_zeroing(to, count, temp, Ldone);
2998    __ bind(Ldone);
2999    __ retl();
3000    __ delayed()->nop();
3001    return start;
3002  }
3003
3004  void generate_arraycopy_stubs() {
3005    address entry;
3006    address entry_jbyte_arraycopy;
3007    address entry_jshort_arraycopy;
3008    address entry_jint_arraycopy;
3009    address entry_oop_arraycopy;
3010    address entry_jlong_arraycopy;
3011    address entry_checkcast_arraycopy;
3012
3013    //*** jbyte
3014    // Always need aligned and unaligned versions
3015    StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
3016                                                                                  "jbyte_disjoint_arraycopy");
3017    StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
3018                                                                                  &entry_jbyte_arraycopy,
3019                                                                                  "jbyte_arraycopy");
3020    StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
3021                                                                                  "arrayof_jbyte_disjoint_arraycopy");
3022    StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
3023                                                                                  "arrayof_jbyte_arraycopy");
3024
3025    //*** jshort
3026    // Always need aligned and unaligned versions
3027    StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
3028                                                                                    "jshort_disjoint_arraycopy");
3029    StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
3030                                                                                    &entry_jshort_arraycopy,
3031                                                                                    "jshort_arraycopy");
3032    StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
3033                                                                                    "arrayof_jshort_disjoint_arraycopy");
3034    StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
3035                                                                                    "arrayof_jshort_arraycopy");
3036
3037    //*** jint
3038    // Aligned versions
3039    StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
3040                                                                                "arrayof_jint_disjoint_arraycopy");
3041    StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
3042                                                                                "arrayof_jint_arraycopy");
3043    // In 64-bit we need both aligned and unaligned versions of jint arraycopy.
3044    // entry_jint_arraycopy always points to the unaligned version (notice that we overwrite it).
3045    StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
3046                                                                                "jint_disjoint_arraycopy");
3047    StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
3048                                                                                &entry_jint_arraycopy,
3049                                                                                "jint_arraycopy");
3050
3051    //*** jlong
3052    // jlong elements are always 8-byte aligned, so the aligned versions are reused below
3053    StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
3054                                                                                  "arrayof_jlong_disjoint_arraycopy");
3055    StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
3056                                                                                  "arrayof_jlong_arraycopy");
3057    StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
3058    StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
3059
3060
3061    //*** oops
3062    // Aligned versions
3063    StubRoutines::_arrayof_oop_disjoint_arraycopy        = generate_disjoint_oop_copy(true, &entry,
3064                                                                                      "arrayof_oop_disjoint_arraycopy");
3065    StubRoutines::_arrayof_oop_arraycopy                 = generate_conjoint_oop_copy(true, entry, &entry_oop_arraycopy,
3066                                                                                      "arrayof_oop_arraycopy");
3067    // Aligned versions without pre-barriers
3068    StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(true, &entry,
3069                                                                                      "arrayof_oop_disjoint_arraycopy_uninit",
3070                                                                                      /*dest_uninitialized*/true);
3071    StubRoutines::_arrayof_oop_arraycopy_uninit          = generate_conjoint_oop_copy(true, entry, NULL,
3072                                                                                      "arrayof_oop_arraycopy_uninit",
3073                                                                                      /*dest_uninitialized*/true);
3074    if (UseCompressedOops) {
3075      // With compressed oops we need unaligned versions; note that we overwrite entry_oop_arraycopy.
3076      StubRoutines::_oop_disjoint_arraycopy            = generate_disjoint_oop_copy(false, &entry,
3077                                                                                    "oop_disjoint_arraycopy");
3078      StubRoutines::_oop_arraycopy                     = generate_conjoint_oop_copy(false, entry, &entry_oop_arraycopy,
3079                                                                                    "oop_arraycopy");
3080      // Unaligned versions without pre-barriers
3081      StubRoutines::_oop_disjoint_arraycopy_uninit     = generate_disjoint_oop_copy(false, &entry,
3082                                                                                    "oop_disjoint_arraycopy_uninit",
3083                                                                                    /*dest_uninitialized*/true);
3084      StubRoutines::_oop_arraycopy_uninit              = generate_conjoint_oop_copy(false, entry, NULL,
3085                                                                                    "oop_arraycopy_uninit",
3086                                                                                    /*dest_uninitialized*/true);
3087    } else {
3088      // oop arraycopy is always aligned on 32-bit, and on 64-bit when compressed oops are not used
3089      StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
3090      StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
3091      StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
3092      StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
3093    }
3094
3095    StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
3096    StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
3097                                                                        /*dest_uninitialized*/true);
3098
3099    StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
3100                                                              entry_jbyte_arraycopy,
3101                                                              entry_jshort_arraycopy,
3102                                                              entry_jint_arraycopy,
3103                                                              entry_jlong_arraycopy);
3104    StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
3105                                                               entry_jbyte_arraycopy,
3106                                                               entry_jshort_arraycopy,
3107                                                               entry_jint_arraycopy,
3108                                                               entry_oop_arraycopy,
3109                                                               entry_jlong_arraycopy,
3110                                                               entry_checkcast_arraycopy);
3111
3112    StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
3113    StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
3114    StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
3115    StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
3116    StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
3117    StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
3118
3119    if (UseBlockZeroing) {
3120      StubRoutines::_zero_aligned_words = generate_zero_aligned_words("zero_aligned_words");
3121    }
3122  }
3123
3124  address generate_aescrypt_encryptBlock() {
3125    // required since we read the expanded key 'int' array starting at its first element without any alignment handling
3126    assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
3127           "the following code assumes that first element of an int array is aligned to 8 bytes");
3128    __ align(CodeEntryAlignment);
3129    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
3130    Label L_load_misaligned_input, L_load_expanded_key, L_doLast128bit, L_storeOutput, L_store_misaligned_output;
3131    address start = __ pc();
3132    Register from = O0; // source byte array
3133    Register to = O1;   // destination byte array
3134    Register key = O2;  // expanded key array
3135    const Register keylen = O4; //reg for storing expanded key array length
3136
3137    // read expanded key length
3138    __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
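    // (The expanded key length in ints encodes the original key size per the standard
    //  AES schedule of 4*(rounds+1) words: 44 -> AES-128 (10 rounds), 52 -> AES-192
    //  (12 rounds), 60 -> AES-256 (14 rounds); the keylen comparisons below rely on this.)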
3139
3140    // Method to address arbitrary alignment for load instructions:
3141    // Check last 3 bits of 'from' address to see if it is aligned to 8-byte boundary
3142    // If zero/aligned then continue with double FP load instructions
3143    // If not zero/mis-aligned then alignaddr will set GSR.align with number of bytes to skip during faligndata
3144    // alignaddr will also convert arbitrary aligned 'from' address to nearest 8-byte aligned address
3145    // load 3 * 8-byte components (to read 16 bytes input) in 3 different FP regs starting at this aligned address
3146    // faligndata will then extract (based on GSR.align value) the appropriate 8 bytes from the 2 source regs
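    // (Worked example, assuming the usual VIS semantics: if 'from' ends in ...0x3,
    //  alignaddr rounds it down to ...0x0 and sets GSR.align = 3; the three 8-byte
    //  loads below then cover bytes 0..23 of that aligned region, and
    //  faligndata(F54, F56, F54) extracts region bytes 3..10 (= input bytes 0..7)
    //  while faligndata(F56, F58, F56) extracts region bytes 11..18 (= input bytes 8..15).)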
3147
3148    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3149    __ andcc(from, 7, G0);
3150    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input);
3151    __ delayed()->alignaddr(from, G0, from);
3152
3153    // aligned case: load input into F54-F56
3154    __ ldf(FloatRegisterImpl::D, from, 0, F54);
3155    __ ldf(FloatRegisterImpl::D, from, 8, F56);
3156    __ ba_short(L_load_expanded_key);
3157
3158    __ BIND(L_load_misaligned_input);
3159    __ ldf(FloatRegisterImpl::D, from, 0, F54);
3160    __ ldf(FloatRegisterImpl::D, from, 8, F56);
3161    __ ldf(FloatRegisterImpl::D, from, 16, F58);
3162    __ faligndata(F54, F56, F54);
3163    __ faligndata(F56, F58, F56);
3164
3165    __ BIND(L_load_expanded_key);
3166    // Since we load the expanded key buffer starting at its first element, 8-byte alignment is guaranteed
3167    for ( int i = 0;  i <= 38; i += 2 ) {
3168      __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i));
3169    }
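    // (Reading aid: F0..F38 (even double regs) now hold the first 40 ints of the
    //  expanded key, i.e. round keys 0..9; the remaining round-key words are loaded
    //  below once the key size is known.)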
3170
3171    // perform cipher transformation
3172    __ fxor(FloatRegisterImpl::D, F0, F54, F54);
3173    __ fxor(FloatRegisterImpl::D, F2, F56, F56);
3174    // rounds 1 through 8
3175    for ( int i = 4;  i <= 28; i += 8 ) {
3176      __ aes_eround01(as_FloatRegister(i), F54, F56, F58);
3177      __ aes_eround23(as_FloatRegister(i+2), F54, F56, F60);
3178      __ aes_eround01(as_FloatRegister(i+4), F58, F60, F54);
3179      __ aes_eround23(as_FloatRegister(i+6), F58, F60, F56);
3180    }
3181    __ aes_eround01(F36, F54, F56, F58); //round 9
3182    __ aes_eround23(F38, F54, F56, F60);
3183
3184    // 128-bit original key size
3185    __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pt, L_doLast128bit);
3186
3187    for ( int i = 40;  i <= 50; i += 2 ) {
3188      __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i) );
3189    }
3190    __ aes_eround01(F40, F58, F60, F54); //round 10
3191    __ aes_eround23(F42, F58, F60, F56);
3192    __ aes_eround01(F44, F54, F56, F58); //round 11
3193    __ aes_eround23(F46, F54, F56, F60);
3194
3195    // 192-bit original key size
3196    __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pt, L_storeOutput);
3197
3198    __ ldf(FloatRegisterImpl::D, key, 208, F52);
3199    __ aes_eround01(F48, F58, F60, F54); //round 12
3200    __ aes_eround23(F50, F58, F60, F56);
3201    __ ldf(FloatRegisterImpl::D, key, 216, F46);
3202    __ ldf(FloatRegisterImpl::D, key, 224, F48);
3203    __ ldf(FloatRegisterImpl::D, key, 232, F50);
3204    __ aes_eround01(F52, F54, F56, F58); //round 13
3205    __ aes_eround23(F46, F54, F56, F60);
3206    __ ba_short(L_storeOutput);
3207
3208    __ BIND(L_doLast128bit);
3209    __ ldf(FloatRegisterImpl::D, key, 160, F48);
3210    __ ldf(FloatRegisterImpl::D, key, 168, F50);
3211
3212    __ BIND(L_storeOutput);
3213    // perform last round of encryption common for all key sizes
3214    __ aes_eround01_l(F48, F58, F60, F54); //last round
3215    __ aes_eround23_l(F50, F58, F60, F56);
3216
3217    // Method to address arbitrary alignment for store instructions:
3218    // Check last 3 bits of 'dest' address to see if it is aligned to 8-byte boundary
3219    // If zero/aligned then continue with double FP store instructions
3220    // If not zero/mis-aligned then edge8n will generate edge mask in result reg (O3 in below case)
3221    // Example: If dest address is 0x07 and nearest 8-byte aligned address is 0x00 then edge mask will be 00000001
3222    // Compute (8-n) where n is # of bytes skipped by partial store(stpartialf) inst from edge mask, n=7 in this case
3223    // We get the value of n from the andcc that checks 'dest' alignment. n is available in O5 in below case.
3224    // Set GSR.align to (8-n) using alignaddr
3225    // Circular byte shift store values by n places so that the original bytes are at correct position for stpartialf
3226    // Set the arbitrarily aligned 'dest' address to nearest 8-byte aligned address
3227    // Store (partial) the original first (8-n) bytes starting at the original 'dest' address
3228    // Negate the edge mask so that the subsequent stpartialf can store the original (8-n-1)th through 8th bytes at appropriate address
3229    // We need to execute this process for both the 8-byte result values
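    // (Worked example under the scheme above: if 'to' ends in ...0x5 then n = 5 and
    //  GSR.align is set to 8-n = 3, so faligndata(Fx, Fx, Fx) rotates each result
    //  double right by n bytes; the first stpartialf with the edge mask then writes
    //  result bytes 0..2 into lanes 5..7 of the aligned-down word, and after negating
    //  the mask the second stpartialf writes bytes 3..7 into lanes 0..4 of the next word.)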
3230
3231    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3232    __ andcc(to, 7, O5);
3233    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output);
3234    __ delayed()->edge8n(to, G0, O3);
3235
3236    // aligned case: store output into the destination array
3237    __ stf(FloatRegisterImpl::D, F54, to, 0);
3238    __ retl();
3239    __ delayed()->stf(FloatRegisterImpl::D, F56, to, 8);
3240
3241    __ BIND(L_store_misaligned_output);
3242    __ add(to, 8, O4);
3243    __ mov(8, O2);
3244    __ sub(O2, O5, O2);
3245    __ alignaddr(O2, G0, O2);
3246    __ faligndata(F54, F54, F54);
3247    __ faligndata(F56, F56, F56);
3248    __ and3(to, -8, to);
3249    __ and3(O4, -8, O4);
3250    __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY);
3251    __ stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY);
3252    __ add(to, 8, to);
3253    __ add(O4, 8, O4);
3254    __ orn(G0, O3, O3);
3255    __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY);
3256    __ retl();
3257    __ delayed()->stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY);
3258
3259    return start;
3260  }
3261
3262  address generate_aescrypt_decryptBlock() {
3263    assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
3264           "the following code assumes that first element of an int array is aligned to 8 bytes");
3265    // required since the decryption stubs also read the original key 'byte' array
3266    assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
3267           "the following code assumes that first element of a byte array is aligned to 8 bytes");
3268    __ align(CodeEntryAlignment);
3269    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
3270    address start = __ pc();
3271    Label L_load_misaligned_input, L_load_original_key, L_expand192bit, L_expand256bit, L_reload_misaligned_input;
3272    Label L_256bit_transform, L_common_transform, L_store_misaligned_output;
3273    Register from = O0; // source byte array
3274    Register to = O1;   // destination byte array
3275    Register key = O2;  // expanded key array
3276    Register original_key = O3;  // original key array only required during decryption
3277    const Register keylen = O4;  // reg for storing expanded key array length
3278
3279    // read expanded key array length
3280    __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
3281
3282    // save 'from' since we may need to recheck alignment in case of 256-bit decryption
3283    __ mov(from, G1);
3284
3285    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3286    __ andcc(from, 7, G0);
3287    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input);
3288    __ delayed()->alignaddr(from, G0, from);
3289
3290    // aligned case: load input into F52-F54
3291    __ ldf(FloatRegisterImpl::D, from, 0, F52);
3292    __ ldf(FloatRegisterImpl::D, from, 8, F54);
3293    __ ba_short(L_load_original_key);
3294
3295    __ BIND(L_load_misaligned_input);
3296    __ ldf(FloatRegisterImpl::D, from, 0, F52);
3297    __ ldf(FloatRegisterImpl::D, from, 8, F54);
3298    __ ldf(FloatRegisterImpl::D, from, 16, F56);
3299    __ faligndata(F52, F54, F52);
3300    __ faligndata(F54, F56, F54);
3301
3302    __ BIND(L_load_original_key);
3303    // load original key from SunJCE expanded decryption key
3304    // Since we load the original key buffer starting at its first element, 8-byte alignment is guaranteed
3305    for ( int i = 0;  i <= 3; i++ ) {
3306      __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
3307    }
3308
3309    // 256-bit original key size
3310    __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_expand256bit);
3311
3312    // 192-bit original key size
3313    __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_expand192bit);
3314
3315    // 128-bit original key size
3316    // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
3317    for ( int i = 0;  i <= 36; i += 4 ) {
3318      __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+2), i/4, as_FloatRegister(i+4));
3319      __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+4), as_FloatRegister(i+6));
3320    }
3321
3322    // perform 128-bit key specific inverse cipher transformation
3323    __ fxor(FloatRegisterImpl::D, F42, F54, F54);
3324    __ fxor(FloatRegisterImpl::D, F40, F52, F52);
3325    __ ba_short(L_common_transform);
3326
3327    __ BIND(L_expand192bit);
3328
3329    // start loading rest of the 192-bit key
3330    __ ldf(FloatRegisterImpl::S, original_key, 16, F4);
3331    __ ldf(FloatRegisterImpl::S, original_key, 20, F5);
3332
3333    // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
3334    for ( int i = 0;  i <= 36; i += 6 ) {
3335      __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+4), i/6, as_FloatRegister(i+6));
3336      __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+6), as_FloatRegister(i+8));
3337      __ aes_kexpand2(as_FloatRegister(i+4), as_FloatRegister(i+8), as_FloatRegister(i+10));
3338    }
3339    __ aes_kexpand1(F42, F46, 7, F48);
3340    __ aes_kexpand2(F44, F48, F50);
3341
3342    // perform 192-bit key specific inverse cipher transformation
3343    __ fxor(FloatRegisterImpl::D, F50, F54, F54);
3344    __ fxor(FloatRegisterImpl::D, F48, F52, F52);
3345    __ aes_dround23(F46, F52, F54, F58);
3346    __ aes_dround01(F44, F52, F54, F56);
3347    __ aes_dround23(F42, F56, F58, F54);
3348    __ aes_dround01(F40, F56, F58, F52);
3349    __ ba_short(L_common_transform);
3350
3351    __ BIND(L_expand256bit);
3352
3353    // load rest of the 256-bit key
3354    for ( int i = 4;  i <= 7; i++ ) {
3355      __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
3356    }
3357
3358    // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
3359    for ( int i = 0;  i <= 40; i += 8 ) {
3360      __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+6), i/8, as_FloatRegister(i+8));
3361      __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+8), as_FloatRegister(i+10));
3362      __ aes_kexpand0(as_FloatRegister(i+4), as_FloatRegister(i+10), as_FloatRegister(i+12));
3363      __ aes_kexpand2(as_FloatRegister(i+6), as_FloatRegister(i+12), as_FloatRegister(i+14));
3364    }
3365    __ aes_kexpand1(F48, F54, 6, F56);
3366    __ aes_kexpand2(F50, F56, F58);
3367
3368    for ( int i = 0;  i <= 6; i += 2 ) {
3369      __ fsrc2(FloatRegisterImpl::D, as_FloatRegister(58-i), as_FloatRegister(i));
3370    }
3371
3372    // reload original 'from' address
3373    __ mov(G1, from);
3374
3375    // re-check 8-byte alignment
3376    __ andcc(from, 7, G0);
3377    __ br(Assembler::notZero, true, Assembler::pn, L_reload_misaligned_input);
3378    __ delayed()->alignaddr(from, G0, from);
3379
3380    // aligned case: load input into F52-F54
3381    __ ldf(FloatRegisterImpl::D, from, 0, F52);
3382    __ ldf(FloatRegisterImpl::D, from, 8, F54);
3383    __ ba_short(L_256bit_transform);
3384
3385    __ BIND(L_reload_misaligned_input);
3386    __ ldf(FloatRegisterImpl::D, from, 0, F52);
3387    __ ldf(FloatRegisterImpl::D, from, 8, F54);
3388    __ ldf(FloatRegisterImpl::D, from, 16, F56);
3389    __ faligndata(F52, F54, F52);
3390    __ faligndata(F54, F56, F54);
3391
3392    // perform 256-bit key specific inverse cipher transformation
3393    __ BIND(L_256bit_transform);
3394    __ fxor(FloatRegisterImpl::D, F0, F54, F54);
3395    __ fxor(FloatRegisterImpl::D, F2, F52, F52);
3396    __ aes_dround23(F4, F52, F54, F58);
3397    __ aes_dround01(F6, F52, F54, F56);
3398    __ aes_dround23(F50, F56, F58, F54);
3399    __ aes_dround01(F48, F56, F58, F52);
3400    __ aes_dround23(F46, F52, F54, F58);
3401    __ aes_dround01(F44, F52, F54, F56);
3402    __ aes_dround23(F42, F56, F58, F54);
3403    __ aes_dround01(F40, F56, F58, F52);
3404
3405    for ( int i = 0;  i <= 7; i++ ) {
3406      __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
3407    }
3408
3409    // perform inverse cipher transformations common for all key sizes
3410    __ BIND(L_common_transform);
3411    for ( int i = 38;  i >= 6; i -= 8 ) {
3412      __ aes_dround23(as_FloatRegister(i), F52, F54, F58);
3413      __ aes_dround01(as_FloatRegister(i-2), F52, F54, F56);
3414      if ( i != 6) {
3415        __ aes_dround23(as_FloatRegister(i-4), F56, F58, F54);
3416        __ aes_dround01(as_FloatRegister(i-6), F56, F58, F52);
3417      } else {
3418        __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F54);
3419        __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F52);
3420      }
3421    }
3422
3423    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3424    __ andcc(to, 7, O5);
3425    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output);
3426    __ delayed()->edge8n(to, G0, O3);
3427
3428    // aligned case: store output into the destination array
3429    __ stf(FloatRegisterImpl::D, F52, to, 0);
3430    __ retl();
3431    __ delayed()->stf(FloatRegisterImpl::D, F54, to, 8);
3432
3433    __ BIND(L_store_misaligned_output);
3434    __ add(to, 8, O4);
3435    __ mov(8, O2);
3436    __ sub(O2, O5, O2);
3437    __ alignaddr(O2, G0, O2);
3438    __ faligndata(F52, F52, F52);
3439    __ faligndata(F54, F54, F54);
3440    __ and3(to, -8, to);
3441    __ and3(O4, -8, O4);
3442    __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY);
3443    __ stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY);
3444    __ add(to, 8, to);
3445    __ add(O4, 8, O4);
3446    __ orn(G0, O3, O3);
3447    __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY);
3448    __ retl();
3449    __ delayed()->stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY);
3450
3451    return start;
3452  }
3453
3454  address generate_cipherBlockChaining_encryptAESCrypt() {
3455    assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
3456           "the following code assumes that first element of an int array is aligned to 8 bytes");
3457    assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
3458           "the following code assumes that first element of a byte array is aligned to 8 bytes");
3459    __ align(CodeEntryAlignment);
3460    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
3461    Label L_cbcenc128, L_load_misaligned_input_128bit, L_128bit_transform, L_store_misaligned_output_128bit;
3462    Label L_check_loop_end_128bit, L_cbcenc192, L_load_misaligned_input_192bit, L_192bit_transform;
3463    Label L_store_misaligned_output_192bit, L_check_loop_end_192bit, L_cbcenc256, L_load_misaligned_input_256bit;
3464    Label L_256bit_transform, L_store_misaligned_output_256bit, L_check_loop_end_256bit;
3465    address start = __ pc();
3466    Register from = I0; // source byte array
3467    Register to = I1;   // destination byte array
3468    Register key = I2;  // expanded key array
3469    Register rvec = I3; // init vector
3470    const Register len_reg = I4; // cipher length
3471    const Register keylen = I5;  // reg for storing expanded key array length
3472
3473    __ save_frame(0);
3474    // save cipher len so it can be returned at the end
3475    __ mov(len_reg, L0);
3476
3477    // read expanded key length
3478    __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
3479
3480    // load initial vector, 8-byte alignment is guaranteed
3481    __ ldf(FloatRegisterImpl::D, rvec, 0, F60);
3482    __ ldf(FloatRegisterImpl::D, rvec, 8, F62);
3483    // load key, 8-byte alignment is guaranteed
3484    __ ldx(key,0,G1);
3485    __ ldx(key,8,G5);
3486
3487    // start loading expanded key, 8-byte alignment is guaranteed
3488    for ( int i = 0, j = 16;  i <= 38; i += 2, j += 8 ) {
3489      __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i));
3490    }
3491
3492    // 128-bit original key size
3493    __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pt, L_cbcenc128);
3494
3495    for ( int i = 40, j = 176;  i <= 46; i += 2, j += 8 ) {
3496      __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i));
3497    }
3498
3499    // 192-bit original key size
3500    __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pt, L_cbcenc192);
3501
3502    for ( int i = 48, j = 208;  i <= 54; i += 2, j += 8 ) {
3503      __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i));
3504    }
3505
3506    // 256-bit original key size
3507    __ ba_short(L_cbcenc256);
3508
3509    __ align(OptoLoopAlignment);
3510    __ BIND(L_cbcenc128);
3511    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3512    __ andcc(from, 7, G0);
3513    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_128bit);
3514    __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
3515
3516    // aligned case: load input into G3 and G4
3517    __ ldx(from,0,G3);
3518    __ ldx(from,8,G4);
3519    __ ba_short(L_128bit_transform);
3520
3521    __ BIND(L_load_misaligned_input_128bit);
3522    // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption
3523    __ alignaddr(from, G0, from);
3524    __ ldf(FloatRegisterImpl::D, from, 0, F48);
3525    __ ldf(FloatRegisterImpl::D, from, 8, F50);
3526    __ ldf(FloatRegisterImpl::D, from, 16, F52);
3527    __ faligndata(F48, F50, F48);
3528    __ faligndata(F50, F52, F50);
3529    __ movdtox(F48, G3);
3530    __ movdtox(F50, G4);
3531    __ mov(L1, from);
3532
3533    __ BIND(L_128bit_transform);
3534    __ xor3(G1,G3,G3);
3535    __ xor3(G5,G4,G4);
3536    __ movxtod(G3,F56);
3537    __ movxtod(G4,F58);
3538    __ fxor(FloatRegisterImpl::D, F60, F56, F60);
3539    __ fxor(FloatRegisterImpl::D, F62, F58, F62);
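    // (CBC chaining step: the plaintext block has already been XORed with the first
    //  two key words (G1:G5, the initial AddRoundKey) and is now XORed with the
    //  previous ciphertext / IV held in F60:F62, i.e. C_i = E_K(P_i ^ C_{i-1});
    //  the encrypted result is left in F60:F62 as the chaining value for the next block.)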
3540
3541    // TEN_EROUNDS
3542    for ( int i = 0;  i <= 32; i += 8 ) {
3543      __ aes_eround01(as_FloatRegister(i), F60, F62, F56);
3544      __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58);
3545      if (i != 32 ) {
3546        __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60);
3547        __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62);
3548      } else {
3549        __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60);
3550        __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62);
3551      }
3552    }
3553
3554    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3555    __ andcc(to, 7, L1);
3556    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_128bit);
3557    __ delayed()->edge8n(to, G0, L2);
3558
3559    // aligned case: store output into the destination array
3560    __ stf(FloatRegisterImpl::D, F60, to, 0);
3561    __ stf(FloatRegisterImpl::D, F62, to, 8);
3562    __ ba_short(L_check_loop_end_128bit);
3563
3564    __ BIND(L_store_misaligned_output_128bit);
3565    __ add(to, 8, L3);
3566    __ mov(8, L4);
3567    __ sub(L4, L1, L4);
3568    __ alignaddr(L4, G0, L4);
3569    // save cipher text before circular right shift
3570    // as it needs to be stored as iv for next block (see code before next retl)
3571    __ movdtox(F60, L6);
3572    __ movdtox(F62, L7);
3573    __ faligndata(F60, F60, F60);
3574    __ faligndata(F62, F62, F62);
3575    __ mov(to, L5);
3576    __ and3(to, -8, to);
3577    __ and3(L3, -8, L3);
3578    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3579    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3580    __ add(to, 8, to);
3581    __ add(L3, 8, L3);
3582    __ orn(G0, L2, L2);
3583    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3584    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3585    __ mov(L5, to);
3586    __ movxtod(L6, F60);
3587    __ movxtod(L7, F62);
3588
3589    __ BIND(L_check_loop_end_128bit);
3590    __ add(from, 16, from);
3591    __ add(to, 16, to);
3592    __ subcc(len_reg, 16, len_reg);
3593    __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc128);
3594    __ delayed()->nop();
3595    // re-init initial vector for next block, 8-byte alignment is guaranteed
3596    __ stf(FloatRegisterImpl::D, F60, rvec, 0);
3597    __ stf(FloatRegisterImpl::D, F62, rvec, 8);
3598    __ mov(L0, I0);
3599    __ ret();
3600    __ delayed()->restore();
3601
3602    __ align(OptoLoopAlignment);
3603    __ BIND(L_cbcenc192);
3604    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3605    __ andcc(from, 7, G0);
3606    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_192bit);
3607    __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
3608
3609    // aligned case: load input into G3 and G4
3610    __ ldx(from,0,G3);
3611    __ ldx(from,8,G4);
3612    __ ba_short(L_192bit_transform);
3613
3614    __ BIND(L_load_misaligned_input_192bit);
3615    // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption
3616    __ alignaddr(from, G0, from);
3617    __ ldf(FloatRegisterImpl::D, from, 0, F48);
3618    __ ldf(FloatRegisterImpl::D, from, 8, F50);
3619    __ ldf(FloatRegisterImpl::D, from, 16, F52);
3620    __ faligndata(F48, F50, F48);
3621    __ faligndata(F50, F52, F50);
3622    __ movdtox(F48, G3);
3623    __ movdtox(F50, G4);
3624    __ mov(L1, from);
3625
3626    __ BIND(L_192bit_transform);
3627    __ xor3(G1,G3,G3);
3628    __ xor3(G5,G4,G4);
3629    __ movxtod(G3,F56);
3630    __ movxtod(G4,F58);
3631    __ fxor(FloatRegisterImpl::D, F60, F56, F60);
3632    __ fxor(FloatRegisterImpl::D, F62, F58, F62);
3633
3634    // TWELVE_EROUNDS
3635    for ( int i = 0;  i <= 40; i += 8 ) {
3636      __ aes_eround01(as_FloatRegister(i), F60, F62, F56);
3637      __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58);
3638      if (i != 40 ) {
3639        __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60);
3640        __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62);
3641      } else {
3642        __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60);
3643        __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62);
3644      }
3645    }
3646
3647    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3648    __ andcc(to, 7, L1);
3649    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_192bit);
3650    __ delayed()->edge8n(to, G0, L2);
3651
3652    // aligned case: store output into the destination array
3653    __ stf(FloatRegisterImpl::D, F60, to, 0);
3654    __ stf(FloatRegisterImpl::D, F62, to, 8);
3655    __ ba_short(L_check_loop_end_192bit);
3656
3657    __ BIND(L_store_misaligned_output_192bit);
3658    __ add(to, 8, L3);
3659    __ mov(8, L4);
3660    __ sub(L4, L1, L4);
3661    __ alignaddr(L4, G0, L4);
3662    __ movdtox(F60, L6);
3663    __ movdtox(F62, L7);
3664    __ faligndata(F60, F60, F60);
3665    __ faligndata(F62, F62, F62);
3666    __ mov(to, L5);
3667    __ and3(to, -8, to);
3668    __ and3(L3, -8, L3);
3669    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3670    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3671    __ add(to, 8, to);
3672    __ add(L3, 8, L3);
3673    __ orn(G0, L2, L2);
3674    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3675    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3676    __ mov(L5, to);
3677    __ movxtod(L6, F60);
3678    __ movxtod(L7, F62);
3679
3680    __ BIND(L_check_loop_end_192bit);
3681    __ add(from, 16, from);
3682    __ subcc(len_reg, 16, len_reg);
3683    __ add(to, 16, to);
3684    __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc192);
3685    __ delayed()->nop();
3686    // re-init initial vector for next block, 8-byte alignment is guaranteed
3687    __ stf(FloatRegisterImpl::D, F60, rvec, 0);
3688    __ stf(FloatRegisterImpl::D, F62, rvec, 8);
3689    __ mov(L0, I0);
3690    __ ret();
3691    __ delayed()->restore();
3692
3693    __ align(OptoLoopAlignment);
3694    __ BIND(L_cbcenc256);
3695    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3696    __ andcc(from, 7, G0);
3697    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_256bit);
3698    __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
3699
3700    // aligned case: load input into G3 and G4
3701    __ ldx(from,0,G3);
3702    __ ldx(from,8,G4);
3703    __ ba_short(L_256bit_transform);
3704
3705    __ BIND(L_load_misaligned_input_256bit);
3706    // cannot clobber F48, F50 and F52. F56, F58 can be used though
3707    __ alignaddr(from, G0, from);
3708    __ movdtox(F60, L2); // save F60 before overwriting
3709    __ ldf(FloatRegisterImpl::D, from, 0, F56);
3710    __ ldf(FloatRegisterImpl::D, from, 8, F58);
3711    __ ldf(FloatRegisterImpl::D, from, 16, F60);
3712    __ faligndata(F56, F58, F56);
3713    __ faligndata(F58, F60, F58);
3714    __ movdtox(F56, G3);
3715    __ movdtox(F58, G4);
3716    __ mov(L1, from);
3717    __ movxtod(L2, F60);
3718
3719    __ BIND(L_256bit_transform);
3720    __ xor3(G1,G3,G3);
3721    __ xor3(G5,G4,G4);
3722    __ movxtod(G3,F56);
3723    __ movxtod(G4,F58);
3724    __ fxor(FloatRegisterImpl::D, F60, F56, F60);
3725    __ fxor(FloatRegisterImpl::D, F62, F58, F62);
3726
3727    // FOURTEEN_EROUNDS
3728    for ( int i = 0;  i <= 48; i += 8 ) {
3729      __ aes_eround01(as_FloatRegister(i), F60, F62, F56);
3730      __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58);
3731      if (i != 48 ) {
3732        __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60);
3733        __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62);
3734      } else {
3735        __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60);
3736        __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62);
3737      }
3738    }
3739
3740    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3741    __ andcc(to, 7, L1);
3742    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_256bit);
3743    __ delayed()->edge8n(to, G0, L2);
3744
3745    // aligned case: store output into the destination array
3746    __ stf(FloatRegisterImpl::D, F60, to, 0);
3747    __ stf(FloatRegisterImpl::D, F62, to, 8);
3748    __ ba_short(L_check_loop_end_256bit);
3749
3750    __ BIND(L_store_misaligned_output_256bit);
3751    __ add(to, 8, L3);
3752    __ mov(8, L4);
3753    __ sub(L4, L1, L4);
3754    __ alignaddr(L4, G0, L4);
3755    __ movdtox(F60, L6);
3756    __ movdtox(F62, L7);
3757    __ faligndata(F60, F60, F60);
3758    __ faligndata(F62, F62, F62);
3759    __ mov(to, L5);
3760    __ and3(to, -8, to);
3761    __ and3(L3, -8, L3);
3762    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3763    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3764    __ add(to, 8, to);
3765    __ add(L3, 8, L3);
3766    __ orn(G0, L2, L2);
3767    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3768    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3769    __ mov(L5, to);
3770    __ movxtod(L6, F60);
3771    __ movxtod(L7, F62);
3772
3773    __ BIND(L_check_loop_end_256bit);
3774    __ add(from, 16, from);
3775    __ subcc(len_reg, 16, len_reg);
3776    __ add(to, 16, to);
3777    __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc256);
3778    __ delayed()->nop();
3779    // re-init initial vector for next block, 8-byte alignment is guaranteed
3780    __ stf(FloatRegisterImpl::D, F60, rvec, 0);
3781    __ stf(FloatRegisterImpl::D, F62, rvec, 8);
3782    __ mov(L0, I0);
3783    __ ret();
3784    __ delayed()->restore();
3785
3786    return start;
3787  }
3788
3789  address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
3790    assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
3791           "the following code assumes that first element of an int array is aligned to 8 bytes");
3792    assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
3793           "the following code assumes that first element of a byte array is aligned to 8 bytes");
3794    __ align(CodeEntryAlignment);
3795    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
3796    Label L_cbcdec_end, L_expand192bit, L_expand256bit, L_dec_first_block_start;
3797    Label L_dec_first_block128, L_dec_first_block192, L_dec_next2_blocks128, L_dec_next2_blocks192, L_dec_next2_blocks256;
3798    Label L_load_misaligned_input_first_block, L_transform_first_block, L_load_misaligned_next2_blocks128, L_transform_next2_blocks128;
3799    Label L_load_misaligned_next2_blocks192, L_transform_next2_blocks192, L_load_misaligned_next2_blocks256, L_transform_next2_blocks256;
3800    Label L_store_misaligned_output_first_block, L_check_decrypt_end, L_store_misaligned_output_next2_blocks128;
3801    Label L_check_decrypt_loop_end128, L_store_misaligned_output_next2_blocks192, L_check_decrypt_loop_end192;
3802    Label L_store_misaligned_output_next2_blocks256, L_check_decrypt_loop_end256;
3803    address start = __ pc();
3804    Register from = I0; // source byte array
3805    Register to = I1;   // destination byte array
3806    Register key = I2;  // expanded key array
3807    Register rvec = I3; // init vector
3808    const Register len_reg = I4; // cipher length
3809    const Register original_key = I5;  // original key array only required during decryption
3810    const Register keylen = L6;  // reg for storing expanded key array length
3811
3812    __ save_frame(0); // args are read from I* registers since we save the frame at the beginning
3813    // save cipher len so it can be returned at the end
3814    __ mov(len_reg, L7);
3815
3816    // load original key from SunJCE expanded decryption key
3817    // Since we load the original key buffer starting at its first element, 8-byte alignment is guaranteed
3818    for ( int i = 0;  i <= 3; i++ ) {
3819      __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
3820    }
3821
3822    // load initial vector, 8-byte alignment is guaranteed
3823    __ ldx(rvec,0,L0);
3824    __ ldx(rvec,8,L1);
3825
3826    // read expanded key array length
3827    __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
3828
3829    // 256-bit original key size
3830    __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_expand256bit);
3831
3832    // 192-bit original key size
3833    __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_expand192bit);
3834
3835    // 128-bit original key size
3836    // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
3837    for ( int i = 0;  i <= 36; i += 4 ) {
3838      __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+2), i/4, as_FloatRegister(i+4));
3839      __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+4), as_FloatRegister(i+6));
3840    }
3841
3842    // load expanded key[last-1] and key[last] elements
3843    __ movdtox(F40,L2);
3844    __ movdtox(F42,L3);
3845
3846    __ and3(len_reg, 16, L4);
3847    __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks128);
3848    __ nop();
3849
3850    __ ba_short(L_dec_first_block_start);
3851
3852    __ BIND(L_expand192bit);
3853    // load rest of the 192-bit key
3854    __ ldf(FloatRegisterImpl::S, original_key, 16, F4);
3855    __ ldf(FloatRegisterImpl::S, original_key, 20, F5);
3856
3857    // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
3858    for ( int i = 0;  i <= 36; i += 6 ) {
3859      __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+4), i/6, as_FloatRegister(i+6));
3860      __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+6), as_FloatRegister(i+8));
3861      __ aes_kexpand2(as_FloatRegister(i+4), as_FloatRegister(i+8), as_FloatRegister(i+10));
3862    }
3863    __ aes_kexpand1(F42, F46, 7, F48);
3864    __ aes_kexpand2(F44, F48, F50);
3865
3866    // load expanded key[last-1] and key[last] elements
3867    __ movdtox(F48,L2);
3868    __ movdtox(F50,L3);
3869
3870    __ and3(len_reg, 16, L4);
3871    __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks192);
3872    __ nop();
3873
3874    __ ba_short(L_dec_first_block_start);
3875
3876    __ BIND(L_expand256bit);
3877    // load rest of the 256-bit key
3878    for ( int i = 4;  i <= 7; i++ ) {
3879      __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
3880    }
3881
3882    // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
3883    for ( int i = 0;  i <= 40; i += 8 ) {
3884      __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+6), i/8, as_FloatRegister(i+8));
3885      __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+8), as_FloatRegister(i+10));
3886      __ aes_kexpand0(as_FloatRegister(i+4), as_FloatRegister(i+10), as_FloatRegister(i+12));
3887      __ aes_kexpand2(as_FloatRegister(i+6), as_FloatRegister(i+12), as_FloatRegister(i+14));
3888    }
3889    __ aes_kexpand1(F48, F54, 6, F56);
3890    __ aes_kexpand2(F50, F56, F58);
3891
3892    // load expanded key[last-1] and key[last] elements
3893    __ movdtox(F56,L2);
3894    __ movdtox(F58,L3);
3895
3896    __ and3(len_reg, 16, L4);
3897    __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks256);
3898
3899    __ BIND(L_dec_first_block_start);
3900    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3901    __ andcc(from, 7, G0);
3902    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_first_block);
3903    __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
3904
3905    // aligned case: load input into L4 and L5
3906    __ ldx(from,0,L4);
3907    __ ldx(from,8,L5);
3908    __ ba_short(L_transform_first_block);
3909
3910    __ BIND(L_load_misaligned_input_first_block);
3911    __ alignaddr(from, G0, from);
3912    // F58, F60, F62 can be clobbered
3913    __ ldf(FloatRegisterImpl::D, from, 0, F58);
3914    __ ldf(FloatRegisterImpl::D, from, 8, F60);
3915    __ ldf(FloatRegisterImpl::D, from, 16, F62);
3916    __ faligndata(F58, F60, F58);
3917    __ faligndata(F60, F62, F60);
3918    __ movdtox(F58, L4);
3919    __ movdtox(F60, L5);
3920    __ mov(G1, from);
3921
3922    __ BIND(L_transform_first_block);
3923    __ xor3(L2,L4,G1);
3924    __ movxtod(G1,F60);
3925    __ xor3(L3,L5,G1);
3926    __ movxtod(G1,F62);
3927
3928    // 128-bit original key size
3929    __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pn, L_dec_first_block128);
3930
3931    // 192-bit original key size
3932    __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_dec_first_block192);
3933
3934    __ aes_dround23(F54, F60, F62, F58);
3935    __ aes_dround01(F52, F60, F62, F56);
3936    __ aes_dround23(F50, F56, F58, F62);
3937    __ aes_dround01(F48, F56, F58, F60);
3938
3939    __ BIND(L_dec_first_block192);
3940    __ aes_dround23(F46, F60, F62, F58);
3941    __ aes_dround01(F44, F60, F62, F56);
3942    __ aes_dround23(F42, F56, F58, F62);
3943    __ aes_dround01(F40, F56, F58, F60);
3944
3945    __ BIND(L_dec_first_block128);
3946    for ( int i = 38;  i >= 6; i -= 8 ) {
3947      __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
3948      __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
3949      if ( i != 6) {
3950        __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
3951        __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
3952      } else {
3953        __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62);
3954        __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60);
3955      }
3956    }
3957
3958    __ movxtod(L0,F56);
3959    __ movxtod(L1,F58);
3960    __ mov(L4,L0);
3961    __ mov(L5,L1);
3962    __ fxor(FloatRegisterImpl::D, F56, F60, F60);
3963    __ fxor(FloatRegisterImpl::D, F58, F62, F62);
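    // (CBC decryption step: the dround loop above left D_K(C_i) in F60:F62; XORing
    //  with the previous ciphertext / IV (L0:L1) yields P_i = D_K(C_i) ^ C_{i-1},
    //  while L0:L1 have just been refreshed from L4:L5 with this block's ciphertext
    //  to serve as the chaining value for the next block.)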
3964
3965    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3966    __ andcc(to, 7, G1);
3967    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_first_block);
3968    __ delayed()->edge8n(to, G0, G2);
3969
3970    // aligned case: store output into the destination array
3971    __ stf(FloatRegisterImpl::D, F60, to, 0);
3972    __ stf(FloatRegisterImpl::D, F62, to, 8);
3973    __ ba_short(L_check_decrypt_end);
3974
3975    __ BIND(L_store_misaligned_output_first_block);
3976    __ add(to, 8, G3);
3977    __ mov(8, G4);
3978    __ sub(G4, G1, G4);
3979    __ alignaddr(G4, G0, G4);
3980    __ faligndata(F60, F60, F60);
3981    __ faligndata(F62, F62, F62);
3982    __ mov(to, G1);
3983    __ and3(to, -8, to);
3984    __ and3(G3, -8, G3);
3985    __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY);
3986    __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY);
3987    __ add(to, 8, to);
3988    __ add(G3, 8, G3);
3989    __ orn(G0, G2, G2);
3990    __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY);
3991    __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY);
3992    __ mov(G1, to);
3993
3994    __ BIND(L_check_decrypt_end);
3995    __ add(from, 16, from);
3996    __ add(to, 16, to);
3997    __ subcc(len_reg, 16, len_reg);
3998    __ br(Assembler::equal, false, Assembler::pt, L_cbcdec_end);
3999    __ delayed()->nop();
4000
4001    // 256-bit original key size
4002    __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_dec_next2_blocks256);
4003
4004    // 192-bit original key size
4005    __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_dec_next2_blocks192);
4006
4007    __ align(OptoLoopAlignment);
4008    __ BIND(L_dec_next2_blocks128);
4009    __ nop();
4010
4011    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
4012    __ andcc(from, 7, G0);
4013    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks128);
4014    __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
4015
4016    // aligned case: load input into G4, G5, L4 and L5
4017    __ ldx(from,0,G4);
4018    __ ldx(from,8,G5);
4019    __ ldx(from,16,L4);
4020    __ ldx(from,24,L5);
4021    __ ba_short(L_transform_next2_blocks128);
4022
4023    __ BIND(L_load_misaligned_next2_blocks128);
4024    __ alignaddr(from, G0, from);
4025    // F40, F42, F58, F60, F62 can be clobbered
4026    __ ldf(FloatRegisterImpl::D, from, 0, F40);
4027    __ ldf(FloatRegisterImpl::D, from, 8, F42);
4028    __ ldf(FloatRegisterImpl::D, from, 16, F60);
4029    __ ldf(FloatRegisterImpl::D, from, 24, F62);
4030    __ ldf(FloatRegisterImpl::D, from, 32, F58);
4031    __ faligndata(F40, F42, F40);
4032    __ faligndata(F42, F60, F42);
4033    __ faligndata(F60, F62, F60);
4034    __ faligndata(F62, F58, F62);
4035    __ movdtox(F40, G4);
4036    __ movdtox(F42, G5);
4037    __ movdtox(F60, L4);
4038    __ movdtox(F62, L5);
4039    __ mov(G1, from);
4040
4041    __ BIND(L_transform_next2_blocks128);
4042    // F40:F42 used for the first 16 bytes
4043    __ xor3(L2,G4,G1);
4044    __ movxtod(G1,F40);
4045    __ xor3(L3,G5,G1);
4046    __ movxtod(G1,F42);
4047
4048    // F60:F62 used for the next 16 bytes
4049    __ xor3(L2,L4,G1);
4050    __ movxtod(G1,F60);
4051    __ xor3(L3,L5,G1);
4052    __ movxtod(G1,F62);
4053
4054    for ( int i = 38;  i >= 6; i -= 8 ) {
4055      __ aes_dround23(as_FloatRegister(i), F40, F42, F44);
4056      __ aes_dround01(as_FloatRegister(i-2), F40, F42, F46);
4057      __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
4058      __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
4059      if (i != 6 ) {
4060        __ aes_dround23(as_FloatRegister(i-4), F46, F44, F42);
4061        __ aes_dround01(as_FloatRegister(i-6), F46, F44, F40);
4062        __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
4063        __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
4064      } else {
4065        __ aes_dround23_l(as_FloatRegister(i-4), F46, F44, F42);
4066        __ aes_dround01_l(as_FloatRegister(i-6), F46, F44, F40);
4067        __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62);
4068        __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60);
4069      }
4070    }
4071
4072    __ movxtod(L0,F46);
4073    __ movxtod(L1,F44);
4074    __ fxor(FloatRegisterImpl::D, F46, F40, F40);
4075    __ fxor(FloatRegisterImpl::D, F44, F42, F42);
4076
4077    __ movxtod(G4,F56);
4078    __ movxtod(G5,F58);
4079    __ mov(L4,L0);
4080    __ mov(L5,L1);
4081    __ fxor(FloatRegisterImpl::D, F56, F60, F60);
4082    __ fxor(FloatRegisterImpl::D, F58, F62, F62);
4083
4084    // For a mis-aligned store of the 32 bytes of result we can do the following:
4085    // circular right-shift all 4 FP registers so that the 'head' and 'tail'
4086    // parts that must be stored starting at the mis-aligned address end up in one FP reg;
4087    // the other 3 FP regs can then be stored with regular 8-byte stores,
4088    // and the edge + partial-store mechanism stores the 'head' and 'tail' parts.
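    // (Sketch of the byte bookkeeping, assuming the edge mask selects the lanes at and
    //  after the mis-aligned start and its complement the remaining lanes: with
    //  n = to & 7 = 3, the four faligndata ops below rotate the 32 result bytes so that
    //  three registers hold the fully aligned middle words and one register holds both
    //  the 5-byte head and the 3-byte tail; the head is stored with stpartialf at the
    //  aligned-down 'to', the middle words with plain 8-byte stores, and the tail with
    //  stpartialf under the negated mask at 'to' + 32.)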
4089
4090    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
4091    __ andcc(to, 7, G1);
4092    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks128);
4093    __ delayed()->edge8n(to, G0, G2);
4094
4095    // aligned case: store output into the destination array
4096    __ stf(FloatRegisterImpl::D, F40, to, 0);
4097    __ stf(FloatRegisterImpl::D, F42, to, 8);
4098    __ stf(FloatRegisterImpl::D, F60, to, 16);
4099    __ stf(FloatRegisterImpl::D, F62, to, 24);
4100    __ ba_short(L_check_decrypt_loop_end128);
4101
4102    __ BIND(L_store_misaligned_output_next2_blocks128);
4103    __ mov(8, G4);
4104    __ sub(G4, G1, G4);
4105    __ alignaddr(G4, G0, G4);
4106    __ faligndata(F40, F42, F56); // F56 can be clobbered
4107    __ faligndata(F42, F60, F42);
4108    __ faligndata(F60, F62, F60);
4109    __ faligndata(F62, F40, F40);
4110    __ mov(to, G1);
4111    __ and3(to, -8, to);
4112    __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY);
4113    __ stf(FloatRegisterImpl::D, F56, to, 8);
4114    __ stf(FloatRegisterImpl::D, F42, to, 16);
4115    __ stf(FloatRegisterImpl::D, F60, to, 24);
4116    __ add(to, 32, to);
4117    __ orn(G0, G2, G2);
4118    __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY);
4119    __ mov(G1, to);
4120
4121    __ BIND(L_check_decrypt_loop_end128);
4122    __ add(from, 32, from);
4123    __ add(to, 32, to);
4124    __ subcc(len_reg, 32, len_reg);
4125    __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks128);
4126    __ delayed()->nop();
4127    __ ba_short(L_cbcdec_end);
4128
4129    __ align(OptoLoopAlignment);
4130    __ BIND(L_dec_next2_blocks192);
4131    __ nop();
4132
4133    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
4134    __ andcc(from, 7, G0);
4135    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks192);
4136    __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
4137
4138    // aligned case: load input into G4, G5, L4 and L5
4139    __ ldx(from,0,G4);
4140    __ ldx(from,8,G5);
4141    __ ldx(from,16,L4);
4142    __ ldx(from,24,L5);
4143    __ ba_short(L_transform_next2_blocks192);
4144
4145    __ BIND(L_load_misaligned_next2_blocks192);
4146    __ alignaddr(from, G0, from);
4147    // F48, F50, F52, F60, F62 can be clobbered
4148    __ ldf(FloatRegisterImpl::D, from, 0, F48);
4149    __ ldf(FloatRegisterImpl::D, from, 8, F50);
4150    __ ldf(FloatRegisterImpl::D, from, 16, F60);
4151    __ ldf(FloatRegisterImpl::D, from, 24, F62);
4152    __ ldf(FloatRegisterImpl::D, from, 32, F52);
4153    __ faligndata(F48, F50, F48);
4154    __ faligndata(F50, F60, F50);
4155    __ faligndata(F60, F62, F60);
4156    __ faligndata(F62, F52, F62);
4157    __ movdtox(F48, G4);
4158    __ movdtox(F50, G5);
4159    __ movdtox(F60, L4);
4160    __ movdtox(F62, L5);
4161    __ mov(G1, from);
4162
4163    __ BIND(L_transform_next2_blocks192);
    // F48:F50 used for the first 16 bytes
4165    __ xor3(L2,G4,G1);
4166    __ movxtod(G1,F48);
4167    __ xor3(L3,G5,G1);
4168    __ movxtod(G1,F50);
4169
    // F60:F62 used for the next 16 bytes
4171    __ xor3(L2,L4,G1);
4172    __ movxtod(G1,F60);
4173    __ xor3(L3,L5,G1);
4174    __ movxtod(G1,F62);
4175
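    // Each iteration below applies two AES decryption rounds to both 16-byte blocks in
    // parallel, stepping down through the expanded-key registers; the final iteration
    // uses the aes_dround*_l ("last round") forms.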
4176    for ( int i = 46;  i >= 6; i -= 8 ) {
4177      __ aes_dround23(as_FloatRegister(i), F48, F50, F52);
4178      __ aes_dround01(as_FloatRegister(i-2), F48, F50, F54);
4179      __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
4180      __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
4181      if (i != 6 ) {
4182        __ aes_dround23(as_FloatRegister(i-4), F54, F52, F50);
4183        __ aes_dround01(as_FloatRegister(i-6), F54, F52, F48);
4184        __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
4185        __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
4186      } else {
4187        __ aes_dround23_l(as_FloatRegister(i-4), F54, F52, F50);
4188        __ aes_dround01_l(as_FloatRegister(i-6), F54, F52, F48);
4189        __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62);
4190        __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60);
4191      }
4192    }
4193
4194    __ movxtod(L0,F54);
4195    __ movxtod(L1,F52);
4196    __ fxor(FloatRegisterImpl::D, F54, F48, F48);
4197    __ fxor(FloatRegisterImpl::D, F52, F50, F50);
4198
4199    __ movxtod(G4,F56);
4200    __ movxtod(G5,F58);
4201    __ mov(L4,L0);
4202    __ mov(L5,L1);
4203    __ fxor(FloatRegisterImpl::D, F56, F60, F60);
4204    __ fxor(FloatRegisterImpl::D, F58, F62, F62);
4205
4206    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
4207    __ andcc(to, 7, G1);
4208    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks192);
4209    __ delayed()->edge8n(to, G0, G2);
4210
4211    // aligned case: store output into the destination array
4212    __ stf(FloatRegisterImpl::D, F48, to, 0);
4213    __ stf(FloatRegisterImpl::D, F50, to, 8);
4214    __ stf(FloatRegisterImpl::D, F60, to, 16);
4215    __ stf(FloatRegisterImpl::D, F62, to, 24);
4216    __ ba_short(L_check_decrypt_loop_end192);
4217
4218    __ BIND(L_store_misaligned_output_next2_blocks192);
4219    __ mov(8, G4);
4220    __ sub(G4, G1, G4);
4221    __ alignaddr(G4, G0, G4);
4222    __ faligndata(F48, F50, F56); // F56 can be clobbered
4223    __ faligndata(F50, F60, F50);
4224    __ faligndata(F60, F62, F60);
4225    __ faligndata(F62, F48, F48);
4226    __ mov(to, G1);
4227    __ and3(to, -8, to);
4228    __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY);
4229    __ stf(FloatRegisterImpl::D, F56, to, 8);
4230    __ stf(FloatRegisterImpl::D, F50, to, 16);
4231    __ stf(FloatRegisterImpl::D, F60, to, 24);
4232    __ add(to, 32, to);
4233    __ orn(G0, G2, G2);
4234    __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY);
4235    __ mov(G1, to);
4236
4237    __ BIND(L_check_decrypt_loop_end192);
4238    __ add(from, 32, from);
4239    __ add(to, 32, to);
4240    __ subcc(len_reg, 32, len_reg);
4241    __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks192);
4242    __ delayed()->nop();
4243    __ ba_short(L_cbcdec_end);
4244
4245    __ align(OptoLoopAlignment);
4246    __ BIND(L_dec_next2_blocks256);
4247    __ nop();
4248
4249    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
4250    __ andcc(from, 7, G0);
4251    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks256);
4252    __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
4253
4254    // aligned case: load input into G4, G5, L4 and L5
4255    __ ldx(from,0,G4);
4256    __ ldx(from,8,G5);
4257    __ ldx(from,16,L4);
4258    __ ldx(from,24,L5);
4259    __ ba_short(L_transform_next2_blocks256);
4260
4261    __ BIND(L_load_misaligned_next2_blocks256);
4262    __ alignaddr(from, G0, from);
4263    // F0, F2, F4, F60, F62 can be clobbered
4264    __ ldf(FloatRegisterImpl::D, from, 0, F0);
4265    __ ldf(FloatRegisterImpl::D, from, 8, F2);
4266    __ ldf(FloatRegisterImpl::D, from, 16, F60);
4267    __ ldf(FloatRegisterImpl::D, from, 24, F62);
4268    __ ldf(FloatRegisterImpl::D, from, 32, F4);
4269    __ faligndata(F0, F2, F0);
4270    __ faligndata(F2, F60, F2);
4271    __ faligndata(F60, F62, F60);
4272    __ faligndata(F62, F4, F62);
4273    __ movdtox(F0, G4);
4274    __ movdtox(F2, G5);
4275    __ movdtox(F60, L4);
4276    __ movdtox(F62, L5);
4277    __ mov(G1, from);
4278
4279    __ BIND(L_transform_next2_blocks256);
    // F0:F2 used for the first 16 bytes
4281    __ xor3(L2,G4,G1);
4282    __ movxtod(G1,F0);
4283    __ xor3(L3,G5,G1);
4284    __ movxtod(G1,F2);
4285
    // F60:F62 used for the next 16 bytes
4287    __ xor3(L2,L4,G1);
4288    __ movxtod(G1,F60);
4289    __ xor3(L3,L5,G1);
4290    __ movxtod(G1,F62);
4291
4292    __ aes_dround23(F54, F0, F2, F4);
4293    __ aes_dround01(F52, F0, F2, F6);
4294    __ aes_dround23(F54, F60, F62, F58);
4295    __ aes_dround01(F52, F60, F62, F56);
4296    __ aes_dround23(F50, F6, F4, F2);
4297    __ aes_dround01(F48, F6, F4, F0);
4298    __ aes_dround23(F50, F56, F58, F62);
4299    __ aes_dround01(F48, F56, F58, F60);
4300    // save F48:F54 in temp registers
4301    __ movdtox(F54,G2);
4302    __ movdtox(F52,G3);
4303    __ movdtox(F50,G6);
4304    __ movdtox(F48,G1);
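    // (F48:F54 hold expanded-key words already consumed by the first rounds above;
    // they are spilled to G1/G2/G3/G6 so the registers can be reloaded with the first
    // 32 bytes of the original key for the final rounds, then restored below.)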
4305    for ( int i = 46;  i >= 14; i -= 8 ) {
4306      __ aes_dround23(as_FloatRegister(i), F0, F2, F4);
4307      __ aes_dround01(as_FloatRegister(i-2), F0, F2, F6);
4308      __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
4309      __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
4310      __ aes_dround23(as_FloatRegister(i-4), F6, F4, F2);
4311      __ aes_dround01(as_FloatRegister(i-6), F6, F4, F0);
4312      __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
4313      __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
4314    }
4315    // init F48:F54 with F0:F6 values (original key)
4316    __ ldf(FloatRegisterImpl::D, original_key, 0, F48);
4317    __ ldf(FloatRegisterImpl::D, original_key, 8, F50);
4318    __ ldf(FloatRegisterImpl::D, original_key, 16, F52);
4319    __ ldf(FloatRegisterImpl::D, original_key, 24, F54);
4320    __ aes_dround23(F54, F0, F2, F4);
4321    __ aes_dround01(F52, F0, F2, F6);
4322    __ aes_dround23(F54, F60, F62, F58);
4323    __ aes_dround01(F52, F60, F62, F56);
4324    __ aes_dround23_l(F50, F6, F4, F2);
4325    __ aes_dround01_l(F48, F6, F4, F0);
4326    __ aes_dround23_l(F50, F56, F58, F62);
4327    __ aes_dround01_l(F48, F56, F58, F60);
4328    // re-init F48:F54 with their original values
4329    __ movxtod(G2,F54);
4330    __ movxtod(G3,F52);
4331    __ movxtod(G6,F50);
4332    __ movxtod(G1,F48);
4333
4334    __ movxtod(L0,F6);
4335    __ movxtod(L1,F4);
4336    __ fxor(FloatRegisterImpl::D, F6, F0, F0);
4337    __ fxor(FloatRegisterImpl::D, F4, F2, F2);
4338
4339    __ movxtod(G4,F56);
4340    __ movxtod(G5,F58);
4341    __ mov(L4,L0);
4342    __ mov(L5,L1);
4343    __ fxor(FloatRegisterImpl::D, F56, F60, F60);
4344    __ fxor(FloatRegisterImpl::D, F58, F62, F62);
4345
4346    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
4347    __ andcc(to, 7, G1);
4348    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks256);
4349    __ delayed()->edge8n(to, G0, G2);
4350
4351    // aligned case: store output into the destination array
4352    __ stf(FloatRegisterImpl::D, F0, to, 0);
4353    __ stf(FloatRegisterImpl::D, F2, to, 8);
4354    __ stf(FloatRegisterImpl::D, F60, to, 16);
4355    __ stf(FloatRegisterImpl::D, F62, to, 24);
4356    __ ba_short(L_check_decrypt_loop_end256);
4357
4358    __ BIND(L_store_misaligned_output_next2_blocks256);
4359    __ mov(8, G4);
4360    __ sub(G4, G1, G4);
4361    __ alignaddr(G4, G0, G4);
4362    __ faligndata(F0, F2, F56); // F56 can be clobbered
4363    __ faligndata(F2, F60, F2);
4364    __ faligndata(F60, F62, F60);
4365    __ faligndata(F62, F0, F0);
4366    __ mov(to, G1);
4367    __ and3(to, -8, to);
4368    __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY);
4369    __ stf(FloatRegisterImpl::D, F56, to, 8);
4370    __ stf(FloatRegisterImpl::D, F2, to, 16);
4371    __ stf(FloatRegisterImpl::D, F60, to, 24);
4372    __ add(to, 32, to);
4373    __ orn(G0, G2, G2);
4374    __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY);
4375    __ mov(G1, to);
4376
4377    __ BIND(L_check_decrypt_loop_end256);
4378    __ add(from, 32, from);
4379    __ add(to, 32, to);
4380    __ subcc(len_reg, 32, len_reg);
4381    __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks256);
4382    __ delayed()->nop();
4383
4384    __ BIND(L_cbcdec_end);
    // re-initialize the initial vector for the next block; 8-byte alignment is guaranteed
4386    __ stx(L0, rvec, 0);
4387    __ stx(L1, rvec, 8);
4388    __ mov(L7, I0);
4389    __ ret();
4390    __ delayed()->restore();
4391
4392    return start;
4393  }
4394
4395  address generate_sha1_implCompress(bool multi_block, const char *name) {
4396    __ align(CodeEntryAlignment);
4397    StubCodeMark mark(this, "StubRoutines", name);
4398    address start = __ pc();
4399
4400    Label L_sha1_loop, L_sha1_unaligned_input, L_sha1_unaligned_input_loop;
4401    int i;
4402
4403    Register buf   = O0; // byte[] source+offset
4404    Register state = O1; // int[]  SHA.state
4405    Register ofs   = O2; // int    offset
4406    Register limit = O3; // int    limit
4407
4408    // load state into F0-F4
4409    for (i = 0; i < 5; i++) {
4410      __ ldf(FloatRegisterImpl::S, state, i*4, as_FloatRegister(i));
4411    }
4412
4413    __ andcc(buf, 7, G0);
4414    __ br(Assembler::notZero, false, Assembler::pn, L_sha1_unaligned_input);
4415    __ delayed()->nop();
4416
4417    __ BIND(L_sha1_loop);
4418    // load buf into F8-F22
4419    for (i = 0; i < 8; i++) {
4420      __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8));
4421    }
4422    __ sha1();
4423    if (multi_block) {
4424      __ add(ofs, 64, ofs);
4425      __ add(buf, 64, buf);
4426      __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha1_loop);
4427      __ mov(ofs, O0); // to be returned
4428    }
4429
4430    // store F0-F4 into state and return
4431    for (i = 0; i < 4; i++) {
4432      __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4);
4433    }
4434    __ retl();
4435    __ delayed()->stf(FloatRegisterImpl::S, F4, state, 0x10);
4436
4437    __ BIND(L_sha1_unaligned_input);
4438    __ alignaddr(buf, G0, buf);
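    // alignaddr rounds 'buf' down to an 8-byte boundary and records the byte offset in
    // the GSR; each faligndata below then extracts the 8 input bytes spanning two
    // consecutive double-word loads, so the SHA rounds always see aligned data. The
    // SHA-256 and SHA-512 stubs below use the same pattern.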
4439
4440    __ BIND(L_sha1_unaligned_input_loop);
    // load buf into F8-F24 (one extra double-word for faligndata below)
4442    for (i = 0; i < 9; i++) {
4443      __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8));
4444    }
4445    for (i = 0; i < 8; i++) {
4446      __ faligndata(as_FloatRegister(i*2 + 8), as_FloatRegister(i*2 + 10), as_FloatRegister(i*2 + 8));
4447    }
4448    __ sha1();
4449    if (multi_block) {
4450      __ add(ofs, 64, ofs);
4451      __ add(buf, 64, buf);
4452      __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha1_unaligned_input_loop);
4453      __ mov(ofs, O0); // to be returned
4454    }
4455
4456    // store F0-F4 into state and return
4457    for (i = 0; i < 4; i++) {
4458      __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4);
4459    }
4460    __ retl();
4461    __ delayed()->stf(FloatRegisterImpl::S, F4, state, 0x10);
4462
4463    return start;
4464  }
4465
4466  address generate_sha256_implCompress(bool multi_block, const char *name) {
4467    __ align(CodeEntryAlignment);
4468    StubCodeMark mark(this, "StubRoutines", name);
4469    address start = __ pc();
4470
4471    Label L_sha256_loop, L_sha256_unaligned_input, L_sha256_unaligned_input_loop;
4472    int i;
4473
4474    Register buf   = O0; // byte[] source+offset
4475    Register state = O1; // int[]  SHA2.state
4476    Register ofs   = O2; // int    offset
4477    Register limit = O3; // int    limit
4478
4479    // load state into F0-F7
4480    for (i = 0; i < 8; i++) {
4481      __ ldf(FloatRegisterImpl::S, state, i*4, as_FloatRegister(i));
4482    }
4483
4484    __ andcc(buf, 7, G0);
4485    __ br(Assembler::notZero, false, Assembler::pn, L_sha256_unaligned_input);
4486    __ delayed()->nop();
4487
4488    __ BIND(L_sha256_loop);
4489    // load buf into F8-F22
4490    for (i = 0; i < 8; i++) {
4491      __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8));
4492    }
4493    __ sha256();
4494    if (multi_block) {
4495      __ add(ofs, 64, ofs);
4496      __ add(buf, 64, buf);
4497      __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha256_loop);
4498      __ mov(ofs, O0); // to be returned
4499    }
4500
4501    // store F0-F7 into state and return
4502    for (i = 0; i < 7; i++) {
4503      __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4);
4504    }
4505    __ retl();
4506    __ delayed()->stf(FloatRegisterImpl::S, F7, state, 0x1c);
4507
4508    __ BIND(L_sha256_unaligned_input);
4509    __ alignaddr(buf, G0, buf);
4510
4511    __ BIND(L_sha256_unaligned_input_loop);
    // load buf into F8-F24 (one extra double-word for faligndata below)
4513    for (i = 0; i < 9; i++) {
4514      __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8));
4515    }
4516    for (i = 0; i < 8; i++) {
4517      __ faligndata(as_FloatRegister(i*2 + 8), as_FloatRegister(i*2 + 10), as_FloatRegister(i*2 + 8));
4518    }
4519    __ sha256();
4520    if (multi_block) {
4521      __ add(ofs, 64, ofs);
4522      __ add(buf, 64, buf);
4523      __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha256_unaligned_input_loop);
4524      __ mov(ofs, O0); // to be returned
4525    }
4526
4527    // store F0-F7 into state and return
4528    for (i = 0; i < 7; i++) {
4529      __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4);
4530    }
4531    __ retl();
4532    __ delayed()->stf(FloatRegisterImpl::S, F7, state, 0x1c);
4533
4534    return start;
4535  }
4536
4537  address generate_sha512_implCompress(bool multi_block, const char *name) {
4538    __ align(CodeEntryAlignment);
4539    StubCodeMark mark(this, "StubRoutines", name);
4540    address start = __ pc();
4541
4542    Label L_sha512_loop, L_sha512_unaligned_input, L_sha512_unaligned_input_loop;
4543    int i;
4544
4545    Register buf   = O0; // byte[] source+offset
4546    Register state = O1; // long[] SHA5.state
4547    Register ofs   = O2; // int    offset
4548    Register limit = O3; // int    limit
4549
4550    // load state into F0-F14
4551    for (i = 0; i < 8; i++) {
4552      __ ldf(FloatRegisterImpl::D, state, i*8, as_FloatRegister(i*2));
4553    }
4554
4555    __ andcc(buf, 7, G0);
4556    __ br(Assembler::notZero, false, Assembler::pn, L_sha512_unaligned_input);
4557    __ delayed()->nop();
4558
4559    __ BIND(L_sha512_loop);
4560    // load buf into F16-F46
4561    for (i = 0; i < 16; i++) {
4562      __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 16));
4563    }
4564    __ sha512();
4565    if (multi_block) {
4566      __ add(ofs, 128, ofs);
4567      __ add(buf, 128, buf);
4568      __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha512_loop);
4569      __ mov(ofs, O0); // to be returned
4570    }
4571
4572    // store F0-F14 into state and return
4573    for (i = 0; i < 7; i++) {
4574      __ stf(FloatRegisterImpl::D, as_FloatRegister(i*2), state, i*8);
4575    }
4576    __ retl();
4577    __ delayed()->stf(FloatRegisterImpl::D, F14, state, 0x38);
4578
4579    __ BIND(L_sha512_unaligned_input);
4580    __ alignaddr(buf, G0, buf);
4581
4582    __ BIND(L_sha512_unaligned_input_loop);
    // load buf into F16-F48 (one extra double-word for faligndata below)
4584    for (i = 0; i < 17; i++) {
4585      __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 16));
4586    }
4587    for (i = 0; i < 16; i++) {
4588      __ faligndata(as_FloatRegister(i*2 + 16), as_FloatRegister(i*2 + 18), as_FloatRegister(i*2 + 16));
4589    }
4590    __ sha512();
4591    if (multi_block) {
4592      __ add(ofs, 128, ofs);
4593      __ add(buf, 128, buf);
4594      __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha512_unaligned_input_loop);
4595      __ mov(ofs, O0); // to be returned
4596    }
4597
4598    // store F0-F14 into state and return
4599    for (i = 0; i < 7; i++) {
4600      __ stf(FloatRegisterImpl::D, as_FloatRegister(i*2), state, i*8);
4601    }
4602    __ retl();
4603    __ delayed()->stf(FloatRegisterImpl::D, F14, state, 0x38);
4604
4605    return start;
4606  }
4607
4608  /* Single and multi-block ghash operations */
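  // For each 16-byte block this computes state = (state ^ data) * subkeyH in GF(2^128)
  // (the GHASH update used by AES/GCM): the product is formed with carry-less multiply
  // instructions and then reduced modulo the GCM polynomial.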
4609  address generate_ghash_processBlocks() {
4610      __ align(CodeEntryAlignment);
4611      Label L_ghash_loop, L_aligned, L_main;
4612      StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4613      address start = __ pc();
4614
4615      Register state = I0;
4616      Register subkeyH = I1;
4617      Register data = I2;
4618      Register len = I3;
4619
4620      __ save_frame(0);
4621
4622      __ ldx(state, 0, O0);
4623      __ ldx(state, 8, O1);
4624
4625      // Loop label for multiblock operations
4626      __ BIND(L_ghash_loop);
4627
4628      // Check if 'data' is unaligned
4629      __ andcc(data, 7, G1);
4630      __ br(Assembler::zero, false, Assembler::pt, L_aligned);
4631      __ delayed()->nop();
4632
4633      Register left_shift = L1;
4634      Register right_shift = L2;
4635      Register data_ptr = L3;
4636
4637      // Get left and right shift values in bits
4638      __ sll(G1, LogBitsPerByte, left_shift);
4639      __ mov(64, right_shift);
4640      __ sub(right_shift, left_shift, right_shift);
4641
4642      // Align to read 'data'
4643      __ sub(data, G1, data_ptr);
4644
4645      // Load first 8 bytes of 'data'
4646      __ ldx(data_ptr, 0, O4);
4647      __ sllx(O4, left_shift, O4);
4648      __ ldx(data_ptr, 8, O5);
4649      __ srlx(O5, right_shift, G4);
4650      __ bset(G4, O4);
4651
4652      // Load second 8 bytes of 'data'
4653      __ sllx(O5, left_shift, O5);
4654      __ ldx(data_ptr, 16, G4);
4655      __ srlx(G4, right_shift, G4);
4656      __ ba(L_main);
4657      __ delayed()->bset(G4, O5);
4658
4659      // If 'data' is aligned, load normally
4660      __ BIND(L_aligned);
4661      __ ldx(data, 0, O4);
4662      __ ldx(data, 8, O5);
4663
4664      __ BIND(L_main);
4665      __ ldx(subkeyH, 0, O2);
4666      __ ldx(subkeyH, 8, O3);
4667
4668      __ xor3(O0, O4, O0);
4669      __ xor3(O1, O5, O1);
4670
4671      __ xmulxhi(O0, O3, G3);
4672      __ xmulx(O0, O2, O5);
4673      __ xmulxhi(O1, O2, G4);
4674      __ xmulxhi(O1, O3, G5);
4675      __ xmulx(O0, O3, G1);
4676      __ xmulx(O1, O3, G2);
4677      __ xmulx(O1, O2, O3);
4678      __ xmulxhi(O0, O2, O4);
4679
4680      __ mov(0xE1, O0);
4681      __ sllx(O0, 56, O0);
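      // The xmulx/xmulxhi (VIS3) results above are the low/high halves of 64x64-bit
      // carry-less products that together form the 128x128-bit GF(2) product of
      // (state ^ data) and subkeyH; 0xE1 << 56 encodes the GCM reduction polynomial
      // x^128 + x^7 + x^2 + x + 1 in GCM's reflected bit order, and the shifts and
      // xors below reduce the product back to 128 bits.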
4682
4683      __ xor3(O5, G3, O5);
4684      __ xor3(O5, G4, O5);
4685      __ xor3(G5, G1, G1);
4686      __ xor3(G1, O3, G1);
4687      __ srlx(G2, 63, O1);
4688      __ srlx(G1, 63, G3);
4689      __ sllx(G2, 63, O3);
4690      __ sllx(G2, 58, O2);
4691      __ xor3(O3, O2, O2);
4692
4693      __ sllx(G1, 1, G1);
4694      __ or3(G1, O1, G1);
4695
4696      __ xor3(G1, O2, G1);
4697
4698      __ sllx(G2, 1, G2);
4699
4700      __ xmulxhi(G1, O0, O1);
4701      __ xmulx(G1, O0, O2);
4702      __ xmulxhi(G2, O0, O3);
4703      __ xmulx(G2, O0, G1);
4704
4705      __ xor3(O4, O1, O4);
4706      __ xor3(O5, O2, O5);
4707      __ xor3(O5, O3, O5);
4708
4709      __ sllx(O4, 1, O2);
4710      __ srlx(O5, 63, O3);
4711
4712      __ or3(O2, O3, O0);
4713
4714      __ sllx(O5, 1, O1);
4715      __ srlx(G1, 63, O2);
4716      __ or3(O1, O2, O1);
4717      __ xor3(O1, G3, O1);
4718
4719      __ deccc(len);
4720      __ br(Assembler::notZero, true, Assembler::pt, L_ghash_loop);
4721      __ delayed()->add(data, 16, data);
4722
4723      __ stx(O0, I0, 0);
4724      __ stx(O1, I0, 8);
4725
4726      __ ret();
4727      __ delayed()->restore();
4728
4729      return start;
4730  }
4731
4732  /**
4733   *  Arguments:
4734   *
4735   * Inputs:
4736   *   O0   - int   crc
4737   *   O1   - byte* buf
4738   *   O2   - int   len
4739   *   O3   - int*  table
4740   *
4741   * Output:
4742   *   O0   - int crc result
4743   */
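  // Computes CRC-32C, i.e. the Castagnoli polynomial 0x1EDC6F41, using the dedicated
  // SPARC CRC32C instruction via MacroAssembler::kernel_crc32c.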
4744  address generate_updateBytesCRC32C() {
4745    assert(UseCRC32CIntrinsics, "need CRC32C instruction");
4746
4747    __ align(CodeEntryAlignment);
4748    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
4749    address start = __ pc();
4750
4751    const Register crc   = O0;  // crc
4752    const Register buf   = O1;  // source java byte array address
4753    const Register len   = O2;  // number of bytes
4754    const Register table = O3;  // byteTable
4755
4756    __ kernel_crc32c(crc, buf, len, table);
4757
4758    __ retl();
4759    __ delayed()->nop();
4760
4761    return start;
4762  }
4763
4764#define ADLER32_NUM_TEMPS 16
4765
4766  /**
4767   *  Arguments:
4768   *
4769   * Inputs:
4770   *   O0   - int   adler
4771   *   O1   - byte* buff
4772   *   O2   - int   len
4773   *
4774   * Output:
4775   *   O0   - int adler result
4776   */
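  // For reference, the scalar Adler-32 that this stub parallelizes (a sketch, not stub code):
  //
  //   uint32_t s1 = adler & 0xFFFF, s2 = (adler >> 16) & 0xFFFF;
  //   for (int i = 0; i < len; i++) {
  //     s1 = (s1 + buff[i]) % 65521;   // 65521 = 0xFFF1, largest prime below 2^16
  //     s2 = (s2 + s1)      % 65521;
  //   }
  //   return (s2 << 16) | s1;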
4777  address generate_updateBytesAdler32() {
4778    __ align(CodeEntryAlignment);
4779    StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
4780    address start = __ pc();
4781
4782    Label L_cleanup_loop, L_cleanup_loop_check;
4783    Label L_main_loop_check, L_main_loop, L_inner_loop, L_inner_loop_check;
4784    Label L_nmax_check_done;
4785
4786    // Aliases
4787    Register s1     = O0;
4788    Register s2     = O3;
4789    Register buff   = O1;
4790    Register len    = O2;
4791    Register temp[ADLER32_NUM_TEMPS] = {L0, L1, L2, L3, L4, L5, L6, L7, I0, I1, I2, I3, I4, I5, G3, I7};
4792
4793    // Max number of bytes we can process before having to take the mod
4794    // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
4795    unsigned long NMAX = 0x15B0;
4796
4797    // Zero-out the upper bits of len
4798    __ clruwu(len);
4799
4800    // Create the mask 0xFFFF
4801    __ set64(0x00FFFF, O4, O5); // O5 is the temp register
4802
4803    // s1 is initialized to the lower 16 bits of adler
4804    // s2 is initialized to the upper 16 bits of adler
4805    __ srlx(O0, 16, O5); // adler >> 16
4806    __ and3(O0, O4, s1); // s1  = (adler & 0xFFFF)
4807    __ and3(O5, O4, s2); // s2  = ((adler >> 16) & 0xFFFF)
4808
    // The pipelined loop needs at least 16 elements per iteration.
    // It does check this, but it is more efficient to branch to the cleanup loop up front.
    // Set up the constant for the cutoff check.
4812    __ mov(15, O4);
4813
    // Check if we are above the cutoff; if not, go to the cleanup loop immediately
4815    __ cmp_and_br_short(len, O4, Assembler::lessEqualUnsigned, Assembler::pt, L_cleanup_loop_check);
4816
4817    // Free up some registers for our use
4818    for (int i = 0; i < ADLER32_NUM_TEMPS; i++) {
4819      __ movxtod(temp[i], as_FloatRegister(2*i));
4820    }
4821
    // Loop maintenance is done at the end of the loop, so skip ahead to it
4823    __ ba_short(L_main_loop_check);
4824
4825    __ BIND(L_main_loop);
4826
4827    // Prologue for inner loop
4828    __ ldub(buff, 0, L0);
4829    __ dec(O5);
4830
4831    for (int i = 1; i < 8; i++) {
4832      __ ldub(buff, i, temp[i]);
4833    }
4834
4835    __ inc(buff, 8);
4836
    // The inner loop processes 16 elements at a time; it may never execute if the
    // outer loop has only 16 elements left to process.
4839    __ ba_short(L_inner_loop_check);
4840
4841    __ BIND(L_inner_loop);
4842
4843    for (int i = 0; i < 8; i++) {
4844      __ ldub(buff, (2*i), temp[(8+(2*i)) % ADLER32_NUM_TEMPS]);
4845      __ add(s1, temp[i], s1);
4846      __ ldub(buff, (2*i)+1, temp[(8+(2*i)+1) % ADLER32_NUM_TEMPS]);
4847      __ add(s2, s1, s2);
4848    }
4849
4850    // Original temp 0-7 used and new loads to temp 0-7 issued
4851    // temp 8-15 ready to be consumed
4852    __ add(s1, I0, s1);
4853    __ dec(O5);
4854    __ add(s2, s1, s2);
4855    __ add(s1, I1, s1);
4856    __ inc(buff, 16);
4857    __ add(s2, s1, s2);
4858
4859    for (int i = 0; i < 6; i++) {
4860      __ add(s1, temp[10+i], s1);
4861      __ add(s2, s1, s2);
4862    }
4863
4864    __ BIND(L_inner_loop_check);
4865    __ nop();
4866    __ cmp_and_br_short(O5, 0, Assembler::notEqual, Assembler::pt, L_inner_loop);
4867
4868    // Epilogue
4869    for (int i = 0; i < 4; i++) {
4870      __ ldub(buff, (2*i), temp[8+(2*i)]);
4871      __ add(s1, temp[i], s1);
4872      __ ldub(buff, (2*i)+1, temp[8+(2*i)+1]);
4873      __ add(s2, s1, s2);
4874    }
4875
4876    __ add(s1, temp[4], s1);
4877    __ inc(buff, 8);
4878
4879    for (int i = 0; i < 11; i++) {
4880      __ add(s2, s1, s2);
4881      __ add(s1, temp[5+i], s1);
4882    }
4883
4884    __ add(s2, s1, s2);
4885
4886    // Take the mod for s1 and s2
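    // 0xFFF1 = 65521, the Adler-32 modulus; with no remainder instruction, the mod is
    // computed as x - (x / 65521) * 65521 using udivx/mulx/sub.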
4887    __ set64(0xFFF1, L0, L1);
4888    __ udivx(s1, L0, L1);
4889    __ udivx(s2, L0, L2);
4890    __ mulx(L0, L1, L1);
4891    __ mulx(L0, L2, L2);
4892    __ sub(s1, L1, s1);
4893    __ sub(s2, L2, s2);
4894
4895    // Make sure there is something left to process
4896    __ BIND(L_main_loop_check);
4897    __ set64(NMAX, L0, L1);
4898    // k = len < NMAX ? len : NMAX
4899    __ cmp_and_br_short(len, L0, Assembler::greaterEqualUnsigned, Assembler::pt, L_nmax_check_done);
    __ andn(len, 0x0F, L0); // process only a multiple of 16 bytes in the main loop
4901    __ BIND(L_nmax_check_done);
4902    __ mov(L0, O5);
4903    __ sub(len, L0, len); // len -= k
4904
    __ srlx(O5, 4, O5); // number of 16-byte iterations
4906    __ cmp_and_br_short(O5, 0, Assembler::notEqual, Assembler::pt, L_main_loop);
4907
    // Restore the registers we saved, take the mod one last time, combine and return
4910    for (int i = 0; i < ADLER32_NUM_TEMPS; i++) {
4911      __ movdtox(as_FloatRegister(2*i), temp[i]);
4912    }
4913
4914    // There might be nothing left to process
4915    __ ba_short(L_cleanup_loop_check);
4916
4917    __ BIND(L_cleanup_loop);
    __ ldub(buff, 0, O4); // load a single byte from the buffer
4919    __ inc(buff); // buff++
4920    __ add(s1, O4, s1); // s1 += *buff++;
4921    __ dec(len); // len--
4922    __ add(s1, s2, s2); // s2 += s1;
4923    __ BIND(L_cleanup_loop_check);
4924    __ nop();
4925    __ cmp_and_br_short(len, 0, Assembler::notEqual, Assembler::pt, L_cleanup_loop);
4926
4927    // Take the mod one last time
4928    __ set64(0xFFF1, O1, O2);
4929    __ udivx(s1, O1, O2);
4930    __ udivx(s2, O1, O5);
4931    __ mulx(O1, O2, O2);
4932    __ mulx(O1, O5, O5);
4933    __ sub(s1, O2, s1);
4934    __ sub(s2, O5, s2);
4935
4936    // Combine lower bits and higher bits
4937    __ sllx(s2, 16, s2); // s2 = s2 << 16
4938    __ or3(s1, s2, s1);  // adler = s2 | s1
4939    // Final return value is in O0
4940    __ retl();
4941    __ delayed()->nop();
4942
4943    return start;
4944  }
4945
  /**
4947   *  Arguments:
4948   *
4949   * Inputs:
4950   *   O0   - int   crc
4951   *   O1   - byte* buf
4952   *   O2   - int   len
4953   *   O3   - int*  table
4954   *
4955   * Output:
4956   *   O0   - int crc result
4957   */
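  // Computes the standard (zlib) CRC-32, reflected polynomial 0xEDB88320, via
  // MacroAssembler::kernel_crc32 using VIS3 instructions and the caller-supplied table.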
4958  address generate_updateBytesCRC32() {
4959    assert(UseCRC32Intrinsics, "need VIS3 instructions");
4960
4961    __ align(CodeEntryAlignment);
4962    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
4963    address start = __ pc();
4964
4965    const Register crc   = O0; // crc
4966    const Register buf   = O1; // source java byte array address
4967    const Register len   = O2; // length
4968    const Register table = O3; // crc_table address (reuse register)
4969
4970    __ kernel_crc32(crc, buf, len, table);
4971
4972    __ retl();
4973    __ delayed()->nop();
4974
4975    return start;
4976  }
4977
4978  void generate_initial() {
    // Generates the initial stubs and initializes their entry points
4980
4981    //------------------------------------------------------------------------------------------------------------------------
4982    // entry points that exist in all platforms
4983    // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller than
4984    //       the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp.
4985    StubRoutines::_forward_exception_entry                 = generate_forward_exception();
4986
4987    StubRoutines::_call_stub_entry                         = generate_call_stub(StubRoutines::_call_stub_return_address);
4988    StubRoutines::_catch_exception_entry                   = generate_catch_exception();
4989
4990    //------------------------------------------------------------------------------------------------------------------------
4991    // entry points that are platform specific
4992    StubRoutines::Sparc::_test_stop_entry                  = generate_test_stop();
4993
4994    StubRoutines::Sparc::_stop_subroutine_entry            = generate_stop_subroutine();
4995    StubRoutines::Sparc::_flush_callers_register_windows_entry = generate_flush_callers_register_windows();
4996
4997    // Build this early so it's available for the interpreter.
4998    StubRoutines::_throw_StackOverflowError_entry =
4999            generate_throw_exception("StackOverflowError throw_exception",
5000            CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
5001    StubRoutines::_throw_delayed_StackOverflowError_entry =
5002            generate_throw_exception("delayed StackOverflowError throw_exception",
5003            CAST_FROM_FN_PTR(address, SharedRuntime::throw_delayed_StackOverflowError));
5004
5005    if (UseCRC32Intrinsics) {
      // set the table address before generating the stub that uses it
5007      StubRoutines::_crc_table_adr = (address)StubRoutines::Sparc::_crc_table;
5008      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5009    }
5010
5011    if (UseCRC32CIntrinsics) {
      // set the table address before generating the stub that uses it
5013      StubRoutines::_crc32c_table_addr = (address)StubRoutines::Sparc::_crc32c_table;
5014      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5015    }
5016  }
5017
5018
5019  void generate_all() {
5020    // Generates all stubs and initializes the entry points
5021
5022    // Generate partial_subtype_check first here since its code depends on
5023    // UseZeroBaseCompressedOops which is defined after heap initialization.
5024    StubRoutines::Sparc::_partial_subtype_check                = generate_partial_subtype_check();
5025    // These entry points require SharedInfo::stack0 to be set up in non-core builds
5026    StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError));
5027    StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError));
5028    StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call));
5029
5030    // support for verify_oop (must happen after universe_init)
5031    StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop_subroutine();
5032
5033    // arraycopy stubs used by compilers
5034    generate_arraycopy_stubs();
5035
5036    // Don't initialize the platform math functions since sparc
5037    // doesn't have intrinsics for these operations.
5038
5039    // Safefetch stubs.
5040    generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
5041                                                       &StubRoutines::_safefetch32_fault_pc,
5042                                                       &StubRoutines::_safefetch32_continuation_pc);
5043    generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
5044                                                       &StubRoutines::_safefetchN_fault_pc,
5045                                                       &StubRoutines::_safefetchN_continuation_pc);
5046
5047    // generate AES intrinsics code
5048    if (UseAESIntrinsics) {
5049      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
5050      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
5051      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
5052      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
5053    }
5054    // generate GHASH intrinsics code
5055    if (UseGHASHIntrinsics) {
5056      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
5057    }
5058
5059    // generate SHA1/SHA256/SHA512 intrinsics code
5060    if (UseSHA1Intrinsics) {
5061      StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
5062      StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
5063    }
5064    if (UseSHA256Intrinsics) {
5065      StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
5066      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
5067    }
5068    if (UseSHA512Intrinsics) {
5069      StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
5070      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
5071    }
5072    // generate Adler32 intrinsics code
5073    if (UseAdler32Intrinsics) {
5074      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
5075    }
5076  }
5077
5078
5079 public:
5080  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
5081    // replace the standard masm with a special one:
5082    _masm = new MacroAssembler(code);
5083
5084    _stub_count = !all ? 0x100 : 0x200;
5085    if (all) {
5086      generate_all();
5087    } else {
5088      generate_initial();
5089    }
5090
5091    // make sure this stub is available for all local calls
5092    if (_atomic_add_stub.is_unbound()) {
5093      // generate a second time, if necessary
5094      (void) generate_atomic_add();
5095    }
5096  }
5097
5098
5099 private:
5100  int _stub_count;
5101  void stub_prolog(StubCodeDesc* cdesc) {
5102    # ifdef ASSERT
5103      // put extra information in the stub code, to make it more readable
5104      // Write the high part of the address
5105      // [RGV] Check if there is a dependency on the size of this prolog
5106      __ emit_data((intptr_t)cdesc >> 32,    relocInfo::none);
5107      __ emit_data((intptr_t)cdesc,    relocInfo::none);
5108      __ emit_data(++_stub_count, relocInfo::none);
5109    # endif
5110    align(true);
5111  }
5112
5113  void align(bool at_header = false) {
5114    // %%%%% move this constant somewhere else
5115    // UltraSPARC cache line size is 8 instructions:
5116    const unsigned int icache_line_size = 32;
5117    const unsigned int icache_half_line_size = 16;
5118
5119    if (at_header) {
5120      while ((intptr_t)(__ pc()) % icache_line_size != 0) {
5121        __ emit_data(0, relocInfo::none);
5122      }
5123    } else {
5124      while ((intptr_t)(__ pc()) % icache_half_line_size != 0) {
5125        __ nop();
5126      }
5127    }
5128  }
5129
5130}; // end class declaration
5131
5132void StubGenerator_generate(CodeBuffer* code, bool all) {
5133  StubGenerator g(code, all);
5134}
5135