stubGenerator_aarch64.cpp revision 13249:a2753984d2c1
1/*
2 * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26#include "precompiled.hpp"
27#include "asm/macroAssembler.hpp"
28#include "asm/macroAssembler.inline.hpp"
29#include "interpreter/interpreter.hpp"
30#include "nativeInst_aarch64.hpp"
31#include "oops/instanceOop.hpp"
32#include "oops/method.hpp"
33#include "oops/objArrayKlass.hpp"
34#include "oops/oop.inline.hpp"
35#include "prims/methodHandles.hpp"
36#include "runtime/frame.inline.hpp"
37#include "runtime/handles.inline.hpp"
38#include "runtime/sharedRuntime.hpp"
39#include "runtime/stubCodeGenerator.hpp"
40#include "runtime/stubRoutines.hpp"
41#include "runtime/thread.inline.hpp"
42#include "utilities/align.hpp"
43#ifdef COMPILER2
44#include "opto/runtime.hpp"
45#endif
46
47#ifdef BUILTIN_SIM
48#include "../../../../../../simulator/simulator.hpp"
49#endif
50
51// Declaration and definition of StubGenerator (no .hpp file).
52// For a more detailed description of the stub routine structure
53// see the comment in stubRoutines.hpp
54
55#undef __
56#define __ _masm->
57#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
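// TIMES_OOP scales an oop-array index to a byte offset: heap oops are 4
// bytes wide with compressed oops enabled and 8 bytes otherwise, so the
// index is sign-extended and shifted left by log2 of that size (2 or 3).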
58
59#ifdef PRODUCT
60#define BLOCK_COMMENT(str) /* nothing */
61#else
62#define BLOCK_COMMENT(str) __ block_comment(str)
63#endif
64
65#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
66
67// Stub Code definitions
68
69class StubGenerator: public StubCodeGenerator {
70 private:
71
72#ifdef PRODUCT
73#define inc_counter_np(counter) ((void)0)
74#else
75  void inc_counter_np_(int& counter) {
76    __ lea(rscratch2, ExternalAddress((address)&counter));
77    __ ldrw(rscratch1, Address(rscratch2));
78    __ addw(rscratch1, rscratch1, 1);
79    __ strw(rscratch1, Address(rscratch2));
80  }
81#define inc_counter_np(counter) \
82  BLOCK_COMMENT("inc_counter " #counter); \
83  inc_counter_np_(counter);
84#endif
85
86  // Call stubs are used to call Java from C
87  //
88  // Arguments:
89  //    c_rarg0:   call wrapper address                   address
90  //    c_rarg1:   result                                 address
91  //    c_rarg2:   result type                            BasicType
92  //    c_rarg3:   method                                 Method*
93  //    c_rarg4:   (interpreter) entry point              address
94  //    c_rarg5:   parameters                             intptr_t*
95  //    c_rarg6:   parameter size (in words)              int
96  //    c_rarg7:   thread                                 Thread*
97  //
98  // There is no return from the stub itself as any Java result
99  // is written to result
100  //
101  // we save r30 (lr) as the return PC at the base of the frame and
102  // link r29 (fp) below it as the frame pointer installing sp (r31)
103  // into fp.
104  //
105  // we save r0-r7, which accounts for all the c arguments.
106  //
107  // TODO: strictly do we need to save them all? they are treated as
108  // volatile by C so could we omit saving the ones we are going to
109  // place in global registers (thread? method?) or those we only use
110  // during setup of the Java call?
111  //
112  // we don't need to save r8 which C uses as an indirect result location
113  // return register.
114  //
115  // we don't need to save r9-r15 which both C and Java treat as
116  // volatile
117  //
118  // we don't need to save r16-18 because Java does not use them
119  //
120  // we save r19-r28 which Java uses as scratch registers and C
121  // expects to be callee-save
122  //
123  // we save the bottom 64 bits of each value stored in v8-v15; it is
124  // the responsibility of the caller to preserve larger values.
125  //
126  // so the stub frame looks like this when we enter Java code
127  //
128  //     [ return_from_Java     ] <--- sp
129  //     [ argument word n      ]
130  //      ...
131  // -27 [ argument word 1      ]
132  // -26 [ saved v15            ] <--- sp_after_call
133  // -25 [ saved v14            ]
134  // -24 [ saved v13            ]
135  // -23 [ saved v12            ]
136  // -22 [ saved v11            ]
137  // -21 [ saved v10            ]
138  // -20 [ saved v9             ]
139  // -19 [ saved v8             ]
140  // -18 [ saved r28            ]
141  // -17 [ saved r27            ]
142  // -16 [ saved r26            ]
143  // -15 [ saved r25            ]
144  // -14 [ saved r24            ]
145  // -13 [ saved r23            ]
146  // -12 [ saved r22            ]
147  // -11 [ saved r21            ]
148  // -10 [ saved r20            ]
149  //  -9 [ saved r19            ]
150  //  -8 [ call wrapper    (r0) ]
151  //  -7 [ result          (r1) ]
152  //  -6 [ result type     (r2) ]
153  //  -5 [ method          (r3) ]
154  //  -4 [ entry point     (r4) ]
155  //  -3 [ parameters      (r5) ]
156  //  -2 [ parameter size  (r6) ]
157  //  -1 [ thread          (r7) ]
158  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
159  //   1 [ saved lr       (r30) ]
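  //
  // For reference, the C++ caller reaches this stub through a function
  // pointer whose signature mirrors the c_rarg0..c_rarg7 list above (see
  // the CallStub typedef in stubRoutines.hpp); roughly:
  //
  //   typedef void (*CallStub)(address   call_wrapper,
  //                            intptr_t* result,
  //                            BasicType result_type,
  //                            Method*   method,
  //                            address   entry_point,
  //                            intptr_t* parameters,
  //                            int       size_of_parameters,
  //                            TRAPS);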
160
161  // Call stub stack layout word offsets from fp
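  // Only the lower offset of each register pair is named: the stp/ldp and
  // stpd/ldpd instructions below save and restore two registers at a time,
  // so e.g. r19 lives at r20_off + 1 and v14 at d15_off + 1.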
162  enum call_stub_layout {
163    sp_after_call_off = -26,
164
165    d15_off            = -26,
166    d13_off            = -24,
167    d11_off            = -22,
168    d9_off             = -20,
169
170    r28_off            = -18,
171    r26_off            = -16,
172    r24_off            = -14,
173    r22_off            = -12,
174    r20_off            = -10,
175    call_wrapper_off   =  -8,
176    result_off         =  -7,
177    result_type_off    =  -6,
178    method_off         =  -5,
179    entry_point_off    =  -4,
180    parameter_size_off =  -2,
181    thread_off         =  -1,
182    rfp_off            =   0,
183    retaddr_off        =   1,
184  };
185
186  address generate_call_stub(address& return_address) {
187    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
188           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
189           "adjust this code");
190
191    StubCodeMark mark(this, "StubRoutines", "call_stub");
192    address start = __ pc();
193
194    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
195
196    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
197    const Address result        (rfp, result_off         * wordSize);
198    const Address result_type   (rfp, result_type_off    * wordSize);
199    const Address method        (rfp, method_off         * wordSize);
200    const Address entry_point   (rfp, entry_point_off    * wordSize);
201    const Address parameter_size(rfp, parameter_size_off * wordSize);
202
203    const Address thread        (rfp, thread_off         * wordSize);
204
205    const Address d15_save      (rfp, d15_off * wordSize);
206    const Address d13_save      (rfp, d13_off * wordSize);
207    const Address d11_save      (rfp, d11_off * wordSize);
208    const Address d9_save       (rfp, d9_off * wordSize);
209
210    const Address r28_save      (rfp, r28_off * wordSize);
211    const Address r26_save      (rfp, r26_off * wordSize);
212    const Address r24_save      (rfp, r24_off * wordSize);
213    const Address r22_save      (rfp, r22_off * wordSize);
214    const Address r20_save      (rfp, r20_off * wordSize);
215
216    // stub code
217
218    // we need a C prolog to bootstrap the x86 caller into the sim
219    __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);
220
221    address aarch64_entry = __ pc();
222
223#ifdef BUILTIN_SIM
224    // Save sender's SP for stack traces.
225    __ mov(rscratch1, sp);
226    __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
227#endif
228    // set up frame and move sp to end of save area
229    __ enter();
230    __ sub(sp, rfp, -sp_after_call_off * wordSize);
231
232    // save register parameters and Java scratch/global registers
233    // n.b. we save thread even though it gets installed in
234    // rthread because we want to sanity check rthread later
235    __ str(c_rarg7,  thread);
236    __ strw(c_rarg6, parameter_size);
237    __ stp(c_rarg4, c_rarg5,  entry_point);
238    __ stp(c_rarg2, c_rarg3,  result_type);
239    __ stp(c_rarg0, c_rarg1,  call_wrapper);
240
241    __ stp(r20, r19,   r20_save);
242    __ stp(r22, r21,   r22_save);
243    __ stp(r24, r23,   r24_save);
244    __ stp(r26, r25,   r26_save);
245    __ stp(r28, r27,   r28_save);
246
247    __ stpd(v9,  v8,   d9_save);
248    __ stpd(v11, v10,  d11_save);
249    __ stpd(v13, v12,  d13_save);
250    __ stpd(v15, v14,  d15_save);
251
252    // install Java thread in global register now that we have saved
253    // whatever value it held
254    __ mov(rthread, c_rarg7);
255    // And method
256    __ mov(rmethod, c_rarg3);
257
258    // set up the heapbase register
259    __ reinit_heapbase();
260
261#ifdef ASSERT
262    // make sure we have no pending exceptions
263    {
264      Label L;
265      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
266      __ cmp(rscratch1, (unsigned)NULL_WORD);
267      __ br(Assembler::EQ, L);
268      __ stop("StubRoutines::call_stub: entered with pending exception");
269      __ BIND(L);
270    }
271#endif
272    // pass parameters if any
273    __ mov(esp, sp);
274    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
275    __ andr(sp, rscratch1, -2 * wordSize);
276
277    BLOCK_COMMENT("pass parameters if any");
278    Label parameters_done;
279    // parameter count is still in c_rarg6
280    // and parameter pointer identifying param 1 is in c_rarg5
281    __ cbzw(c_rarg6, parameters_done);
282
283    address loop = __ pc();
284    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
285    __ subsw(c_rarg6, c_rarg6, 1);
286    __ push(rscratch1);
287    __ br(Assembler::GT, loop);
288
289    __ BIND(parameters_done);
290
291    // call Java entry -- passing Method* and current sp
292    //      rmethod: Method*
293    //      r13: sender sp
294    BLOCK_COMMENT("call Java function");
295    __ mov(r13, sp);
296    __ blr(c_rarg4);
297
298    // tell the simulator we have returned to the stub
299
300    // we do this here because the notify will already have been done
301    // if we get to the next instruction via an exception
302    //
303    // n.b. adding this instruction here affects the calculation of
304    // whether or not a routine returns to the call stub (used when
305    // doing stack walks) since the normal test is to check the return
306    // pc against the address saved below. so we may need to allow for
307    // this extra instruction in the check.
308
309    if (NotifySimulator) {
310      __ notify(Assembler::method_reentry);
311    }
312    // save current address for use by exception handling code
313
314    return_address = __ pc();
315
316    // store result depending on type (everything that is not
317    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
318    // n.b. this assumes Java returns an integral result in r0
319    // and a floating result in j_farg0
320    __ ldr(j_rarg2, result);
321    Label is_long, is_float, is_double, exit;
322    __ ldr(j_rarg1, result_type);
323    __ cmp(j_rarg1, T_OBJECT);
324    __ br(Assembler::EQ, is_long);
325    __ cmp(j_rarg1, T_LONG);
326    __ br(Assembler::EQ, is_long);
327    __ cmp(j_rarg1, T_FLOAT);
328    __ br(Assembler::EQ, is_float);
329    __ cmp(j_rarg1, T_DOUBLE);
330    __ br(Assembler::EQ, is_double);
331
332    // handle T_INT case
333    __ strw(r0, Address(j_rarg2));
334
335    __ BIND(exit);
336
337    // pop parameters
338    __ sub(esp, rfp, -sp_after_call_off * wordSize);
339
340#ifdef ASSERT
341    // verify that threads correspond
342    {
343      Label L, S;
344      __ ldr(rscratch1, thread);
345      __ cmp(rthread, rscratch1);
346      __ br(Assembler::NE, S);
347      __ get_thread(rscratch1);
348      __ cmp(rthread, rscratch1);
349      __ br(Assembler::EQ, L);
350      __ BIND(S);
351      __ stop("StubRoutines::call_stub: threads must correspond");
352      __ BIND(L);
353    }
354#endif
355
356    // restore callee-save registers
357    __ ldpd(v15, v14,  d15_save);
358    __ ldpd(v13, v12,  d13_save);
359    __ ldpd(v11, v10,  d11_save);
360    __ ldpd(v9,  v8,   d9_save);
361
362    __ ldp(r28, r27,   r28_save);
363    __ ldp(r26, r25,   r26_save);
364    __ ldp(r24, r23,   r24_save);
365    __ ldp(r22, r21,   r22_save);
366    __ ldp(r20, r19,   r20_save);
367
368    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
369    __ ldrw(c_rarg2, result_type);
370    __ ldr(c_rarg3,  method);
371    __ ldp(c_rarg4, c_rarg5,  entry_point);
372    __ ldp(c_rarg6, c_rarg7,  parameter_size);
373
374#ifndef PRODUCT
375    // tell the simulator we are about to end Java execution
376    if (NotifySimulator) {
377      __ notify(Assembler::method_exit);
378    }
379#endif
380    // leave frame and return to caller
381    __ leave();
382    __ ret(lr);
383
384    // handle return types different from T_INT
385
386    __ BIND(is_long);
387    __ str(r0, Address(j_rarg2, 0));
388    __ br(Assembler::AL, exit);
389
390    __ BIND(is_float);
391    __ strs(j_farg0, Address(j_rarg2, 0));
392    __ br(Assembler::AL, exit);
393
394    __ BIND(is_double);
395    __ strd(j_farg0, Address(j_rarg2, 0));
396    __ br(Assembler::AL, exit);
397
398    return start;
399  }
400
401  // Return point for a Java call if there's an exception thrown in
402  // Java code.  The exception is caught and transformed into a
403  // pending exception stored in JavaThread that can be tested from
404  // within the VM.
405  //
406  // Note: Usually the parameters are removed by the callee. In case
407  // of an exception crossing an activation frame boundary, that is
408  // not the case if the callee is compiled code => need to setup the
409  // rsp.
410  //
411  // r0: exception oop
412
413  // NOTE: this is used as a target from the signal handler so it
414  // needs an x86 prolog which returns into the current simulator
415  // executing the generated catch_exception code. so the prolog
416  // needs to install rax in a sim register and adjust the sim's
417  // restart pc to enter the generated code at the start position
418  // then return from native to simulated execution.
419
420  address generate_catch_exception() {
421    StubCodeMark mark(this, "StubRoutines", "catch_exception");
422    address start = __ pc();
423
424    // same as in generate_call_stub():
425    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
426    const Address thread        (rfp, thread_off         * wordSize);
427
428#ifdef ASSERT
429    // verify that threads correspond
430    {
431      Label L, S;
432      __ ldr(rscratch1, thread);
433      __ cmp(rthread, rscratch1);
434      __ br(Assembler::NE, S);
435      __ get_thread(rscratch1);
436      __ cmp(rthread, rscratch1);
437      __ br(Assembler::EQ, L);
438      __ bind(S);
439      __ stop("StubRoutines::catch_exception: threads must correspond");
440      __ bind(L);
441    }
442#endif
443
444    // set pending exception
445    __ verify_oop(r0);
446
447    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
448    __ mov(rscratch1, (address)__FILE__);
449    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
450    __ movw(rscratch1, (int)__LINE__);
451    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
452
453    // complete return to VM
454    assert(StubRoutines::_call_stub_return_address != NULL,
455           "_call_stub_return_address must have been generated before");
456    __ b(StubRoutines::_call_stub_return_address);
457
458    return start;
459  }
460
461  // Continuation point for runtime calls returning with a pending
462  // exception.  The pending exception check happened in the runtime
463  // or native call stub.  The pending exception in Thread is
464  // converted into a Java-level exception.
465  //
466  // Contract with Java-level exception handlers:
467  // r0: exception
468  // r3: throwing pc
469  //
470  // NOTE: At entry of this stub, exception-pc must be in LR !!
471
472  // NOTE: this is always used as a jump target within generated code
473  // so it just needs to be generated code with no x86 prolog
474
475  address generate_forward_exception() {
476    StubCodeMark mark(this, "StubRoutines", "forward exception");
477    address start = __ pc();
478
479    // Upon entry, LR points to the return address returning into
480    // Java (interpreted or compiled) code; i.e., the return address
481    // becomes the throwing pc.
482    //
483    // Arguments pushed before the runtime call are still on the stack
484    // but the exception handler will reset the stack pointer ->
485    // ignore them.  A potential result in registers can be ignored as
486    // well.
487
488#ifdef ASSERT
489    // make sure this code is only executed if there is a pending exception
490    {
491      Label L;
492      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
493      __ cbnz(rscratch1, L);
494      __ stop("StubRoutines::forward exception: no pending exception (1)");
495      __ bind(L);
496    }
497#endif
498
499    // compute exception handler into r19
500
501    // call the VM to find the handler address associated with the
502    // caller address. pass thread in r0 and caller pc (ret address)
503    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
504    // the stack.
505    __ mov(c_rarg1, lr);
506    // lr will be trashed by the VM call so we move it to R19
507    // (callee-saved) because we also need to pass it to the handler
508    // returned by this call.
509    __ mov(r19, lr);
510    BLOCK_COMMENT("call exception_handler_for_return_address");
511    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
512                         SharedRuntime::exception_handler_for_return_address),
513                    rthread, c_rarg1);
514    // we should not really care that lr is no longer the callee
515    // address. we saved the value the handler needs in r19 so we can
516    // just copy it to r3. however, the C2 handler will push its own
517    // frame and then call into the VM, and the VM code asserts that
518    // the PC for the frame above the handler belongs to a compiled
519    // Java method. So, we restore lr here to satisfy that assert.
520    __ mov(lr, r19);
521    // setup r0 & r3 & clear pending exception
522    __ mov(r3, r19);
523    __ mov(r19, r0);
524    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
525    __ str(zr, Address(rthread, Thread::pending_exception_offset()));
526
527#ifdef ASSERT
528    // make sure exception is set
529    {
530      Label L;
531      __ cbnz(r0, L);
532      __ stop("StubRoutines::forward exception: no pending exception (2)");
533      __ bind(L);
534    }
535#endif
536
537    // continue at exception handler
538    // r0: exception
539    // r3: throwing pc
540    // r19: exception handler
541    __ verify_oop(r0);
542    __ br(r19);
543
544    return start;
545  }
546
547  // Non-destructive plausibility checks for oops
548  //
549  // Arguments:
550  //    r0: oop to verify
551  //    rscratch1: error message
552  //
553  // Stack after saving c_rarg3:
554  //    [tos + 0]: saved c_rarg3
555  //    [tos + 1]: saved c_rarg2
556  //    [tos + 2]: saved lr
557  //    [tos + 3]: saved rscratch2
558  //    [tos + 4]: saved r0
559  //    [tos + 5]: saved rscratch1
560  address generate_verify_oop() {
561
562    StubCodeMark mark(this, "StubRoutines", "verify_oop");
563    address start = __ pc();
564
565    Label exit, error;
566
567    // save c_rarg2 and c_rarg3
568    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
569
570    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
571    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
572    __ ldr(c_rarg3, Address(c_rarg2));
573    __ add(c_rarg3, c_rarg3, 1);
574    __ str(c_rarg3, Address(c_rarg2));
575
576    // object is in r0
577    // make sure object is 'reasonable'
578    __ cbz(r0, exit); // if obj is NULL it is OK
579
580    // Check if the oop is in the right area of memory
581    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
582    __ andr(c_rarg2, r0, c_rarg3);
583    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
584
585    // Compare c_rarg2 and c_rarg3.  We don't use a compare
586    // instruction here because the flags register is live.
587    __ eor(c_rarg2, c_rarg2, c_rarg3);
588    __ cbnz(c_rarg2, error);
589
590    // make sure klass is 'reasonable', i.e. non-zero.
591    __ load_klass(r0, r0);  // get klass
592    __ cbz(r0, error);      // if klass is NULL it is broken
593
594    // return if everything seems ok
595    __ bind(exit);
596
597    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
598    __ ret(lr);
599
600    // handle errors
601    __ bind(error);
602    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
603
604    __ push(RegSet::range(r0, r29), sp);
605    // debug(char* msg, int64_t pc, int64_t regs[])
606    __ mov(c_rarg0, rscratch1);      // pass address of error message
607    __ mov(c_rarg1, lr);             // pass return address
608    __ mov(c_rarg2, sp);             // pass address of regs on stack
609#ifndef PRODUCT
610    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
611#endif
612    BLOCK_COMMENT("call MacroAssembler::debug");
613    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
614    __ blrt(rscratch1, 3, 0, 1);
615
616    return start;
617  }
618
619  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
620
621  // Generate code for an array write pre barrier
622  //
623  //     addr    -  starting address
624  //     count   -  element count
625  //     tmp     - scratch register
626  //
627  //     Destroys no registers except rscratch1 and rscratch2
628  //
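  // For G1 (G1SATBCTLogging) this records the oops about to be overwritten
  // in the destination range so that SATB marking still sees them; the
  // card-table and ModRef barrier sets need no pre barrier, so those cases
  // simply break out of the switch below.
  //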
629  void  gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
630    BarrierSet* bs = Universe::heap()->barrier_set();
631    switch (bs->kind()) {
632    case BarrierSet::G1SATBCTLogging:
633      // With G1, don't generate the call if we statically know that the target is uninitialized
634      if (!dest_uninitialized) {
635        __ push_call_clobbered_registers();
636        if (count == c_rarg0) {
637          if (addr == c_rarg1) {
638            // exactly backwards!!
639            __ mov(rscratch1, c_rarg0);
640            __ mov(c_rarg0, c_rarg1);
641            __ mov(c_rarg1, rscratch1);
642          } else {
643            __ mov(c_rarg1, count);
644            __ mov(c_rarg0, addr);
645          }
646        } else {
647          __ mov(c_rarg0, addr);
648          __ mov(c_rarg1, count);
649        }
650        __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
651        __ pop_call_clobbered_registers();
652        break;
653      case BarrierSet::CardTableForRS:
654      case BarrierSet::CardTableExtension:
655      case BarrierSet::ModRef:
656        break;
657      default:
658        ShouldNotReachHere();
659
660      }
661    }
662  }
663
664  //
665  // Generate code for an array write post barrier
666  //
667  //  Input:
668  //     start    - register containing starting address of destination array
669  //     end      - register containing ending address of destination array
670  //     scratch  - scratch register
671  //
672  //  The input registers are overwritten.
673  //  The ending address is inclusive.
674  void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) {
675    assert_different_registers(start, end, scratch);
676    BarrierSet* bs = Universe::heap()->barrier_set();
677    switch (bs->kind()) {
678      case BarrierSet::G1SATBCTLogging:
679
680        {
681          __ push_call_clobbered_registers();
682          // must compute element count unless barrier set interface is changed (other platforms supply count)
683          assert_different_registers(start, end, scratch);
684          __ lea(scratch, Address(end, BytesPerHeapOop));
685          __ sub(scratch, scratch, start);               // subtract start to get #bytes
686          __ lsr(scratch, scratch, LogBytesPerHeapOop);  // convert to element count
687          __ mov(c_rarg0, start);
688          __ mov(c_rarg1, scratch);
689          __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
690          __ pop_call_clobbered_registers();
691        }
692        break;
693      case BarrierSet::CardTableForRS:
694      case BarrierSet::CardTableExtension:
695        {
696          CardTableModRefBS* ct = (CardTableModRefBS*)bs;
697          assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
698
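          // Card table scheme: every aligned 2^card_shift-byte (512 bytes by
          // default) chunk of the heap maps to one byte of the card table.
          // Shifting the first and last addresses right by card_shift gives
          // the indexes of the first and last cards touched; the loop below
          // dirties each card in that inclusive range by storing a zero byte
          // at byte_map_base + card_index, walking down from the last card.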
699          Label L_loop;
700
701          __ lsr(start, start, CardTableModRefBS::card_shift);
702          __ lsr(end, end, CardTableModRefBS::card_shift);
703          __ sub(end, end, start); // number of card table bytes to dirty
704
705          const Register count = end; // 'end' register contains bytes count now
706          __ load_byte_map_base(scratch);
707          __ add(start, start, scratch);
708          if (UseConcMarkSweepGC) {
709            __ membar(__ StoreStore);
710          }
711          __ BIND(L_loop);
712          __ strb(zr, Address(start, count));
713          __ subs(count, count, 1);
714          __ br(Assembler::GE, L_loop);
715        }
716        break;
717      default:
718        ShouldNotReachHere();
719
720    }
721  }
722
723  // The inner part of zero_words().  This is the bulk operation,
724  // zeroing words in blocks, possibly using DC ZVA to do it.  The
725  // caller is responsible for zeroing the last few words.
726  //
727  // Inputs:
728  // r10: the HeapWord-aligned base address of an array to zero.
729  // r11: the count in HeapWords, r11 > 0.
730  //
731  // Returns r10 and r11, adjusted for the caller to clear.
732  // r10: the base address of the tail of words left to clear.
733  // r11: the number of words in the tail.
734  //      r11 < MacroAssembler::zero_words_block_size.
735
736  address generate_zero_blocks() {
737    Label store_pair, loop_store_pair, done;
738    Label base_aligned;
739
740    Register base = r10, cnt = r11;
741
742    __ align(CodeEntryAlignment);
743    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
744    address start = __ pc();
745
746    if (UseBlockZeroing) {
747      int zva_length = VM_Version::zva_length();
748
749      // Ensure the ZVA length is a multiple of 16. This is required by
750      // the subsequent operations.
751      assert (zva_length % 16 == 0, "Unexpected ZVA Length");
752
753      __ tbz(base, 3, base_aligned);
754      __ str(zr, Address(__ post(base, 8)));
755      __ sub(cnt, cnt, 1);
756      __ bind(base_aligned);
757
758      // Ensure count >= zva_length * 2 so that it still deserves a zva after
759      // alignment.
760      Label small;
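      // low_limit is in bytes while cnt is in 8-byte words, hence the >> 3
      // when comparing against cnt below.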
761      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
762      __ cmp(cnt, low_limit >> 3);
763      __ br(Assembler::LT, small);
764      __ zero_dcache_blocks(base, cnt);
765      __ bind(small);
766    }
767
768    {
769      // Number of stp instructions we'll unroll
770      const int unroll =
771        MacroAssembler::zero_words_block_size / 2;
772      // Clear the remaining blocks.
773      Label loop;
774      __ subs(cnt, cnt, unroll * 2);
775      __ br(Assembler::LT, done);
776      __ bind(loop);
777      for (int i = 0; i < unroll; i++)
778        __ stp(zr, zr, __ post(base, 16));
779      __ subs(cnt, cnt, unroll * 2);
780      __ br(Assembler::GE, loop);
781      __ bind(done);
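      // The subs that exited the loop went negative; adding the block size
      // back leaves cnt holding the 0 .. zero_words_block_size-1 words the
      // caller still has to clear.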
782      __ add(cnt, cnt, unroll * 2);
783    }
784
785    __ ret(lr);
786
787    return start;
788  }
789
790
791  typedef enum {
792    copy_forwards = 1,
793    copy_backwards = -1
794  } copy_direction;
795
796  // Bulk copy of blocks of 8 words.
797  //
798  // count is a count of words.
799  //
800  // Precondition: count >= 8
801  //
802  // Postconditions:
803  //
804  // The least significant bit of count contains the remaining count
805  // of words to copy.  The rest of count is trash.
806  //
807  // s and d are adjusted to point to the remaining words to copy
808  //
809  void generate_copy_longs(Label &start, Register s, Register d, Register count,
810                           copy_direction direction) {
811    int unit = wordSize * direction;
812    int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
813
814    int offset;
815    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
816      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
817    const Register stride = r13;
818
819    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
820    assert_different_registers(s, d, count, rscratch1);
821
822    Label again, drain;
823    const char *stub_name;
824    if (direction == copy_forwards)
825      stub_name = "forward_copy_longs";
826    else
827      stub_name = "backward_copy_longs";
828    StubCodeMark mark(this, "StubRoutines", stub_name);
829    __ align(CodeEntryAlignment);
830    __ bind(start);
831
832    Label unaligned_copy_long;
833    if (AvoidUnalignedAccesses) {
834      __ tbnz(d, 3, unaligned_copy_long);
835    }
836
837    if (direction == copy_forwards) {
838      __ sub(s, s, bias);
839      __ sub(d, d, bias);
840    }
841
842#ifdef ASSERT
843    // Make sure we are never given < 8 words
844    {
845      Label L;
846      __ cmp(count, 8);
847      __ br(Assembler::GE, L);
848      __ stop("generate_copy_longs called with < 8 words");
849      __ bind(L);
850    }
851#endif
852
853    // Fill 8 registers
854    if (UseSIMDForMemoryOps) {
855      __ ldpq(v0, v1, Address(s, 4 * unit));
856      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
857    } else {
858      __ ldp(t0, t1, Address(s, 2 * unit));
859      __ ldp(t2, t3, Address(s, 4 * unit));
860      __ ldp(t4, t5, Address(s, 6 * unit));
861      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
862    }
863
864    __ subs(count, count, 16);
865    __ br(Assembler::LO, drain);
866
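    // For a backward copy we prefetch behind the descending source pointer,
    // so the prefetch offset is negated.  A negative offset beyond -256
    // cannot be encoded in prfm's signed 9-bit unscaled immediate, so in
    // that case the offset is kept in a register (stride) and a
    // register-offset prefetch is used instead.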
867    int prefetch = PrefetchCopyIntervalInBytes;
868    bool use_stride = false;
869    if (direction == copy_backwards) {
870       use_stride = prefetch > 256;
871       prefetch = -prefetch;
872       if (use_stride) __ mov(stride, prefetch);
873    }
874
875    __ bind(again);
876
877    if (PrefetchCopyIntervalInBytes > 0)
878      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
879
880    if (UseSIMDForMemoryOps) {
881      __ stpq(v0, v1, Address(d, 4 * unit));
882      __ ldpq(v0, v1, Address(s, 4 * unit));
883      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
884      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
885    } else {
886      __ stp(t0, t1, Address(d, 2 * unit));
887      __ ldp(t0, t1, Address(s, 2 * unit));
888      __ stp(t2, t3, Address(d, 4 * unit));
889      __ ldp(t2, t3, Address(s, 4 * unit));
890      __ stp(t4, t5, Address(d, 6 * unit));
891      __ ldp(t4, t5, Address(s, 6 * unit));
892      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
893      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
894    }
895
896    __ subs(count, count, 8);
897    __ br(Assembler::HS, again);
898
899    // Drain
900    __ bind(drain);
901    if (UseSIMDForMemoryOps) {
902      __ stpq(v0, v1, Address(d, 4 * unit));
903      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
904    } else {
905      __ stp(t0, t1, Address(d, 2 * unit));
906      __ stp(t2, t3, Address(d, 4 * unit));
907      __ stp(t4, t5, Address(d, 6 * unit));
908      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
909    }
910
911    {
912      Label L1, L2;
913      __ tbz(count, exact_log2(4), L1);
914      if (UseSIMDForMemoryOps) {
915        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
916        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
917      } else {
918        __ ldp(t0, t1, Address(s, 2 * unit));
919        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
920        __ stp(t0, t1, Address(d, 2 * unit));
921        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
922      }
923      __ bind(L1);
924
925      if (direction == copy_forwards) {
926        __ add(s, s, bias);
927        __ add(d, d, bias);
928      }
929
930      __ tbz(count, 1, L2);
931      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
932      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
933      __ bind(L2);
934    }
935
936    __ ret(lr);
937
938    if (AvoidUnalignedAccesses) {
939      Label drain, again;
940      // Register order for storing. Order is different for backward copy.
941
942      __ bind(unaligned_copy_long);
943
944      // source address is even aligned, target odd aligned
945      //
946      // when forward copying word pairs we read long pairs at offsets
947      // {0, 2, 4, 6} (in long words). when backwards copying we read
948      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
949      // address by -2 in the forwards case so we can compute the
950      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
951      // or -1.
952      //
953      // when forward copying we need to store 1 word, 3 pairs and
954      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
955      // zero offset we adjust the destination by -1 which means we
956      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
957      //
958      // When backwards copying we need to store 1 word, 3 pairs and
959      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
960      // offsets {1, 3, 5, 7, 8} * unit.
961
962      if (direction == copy_forwards) {
963        __ sub(s, s, 16);
964        __ sub(d, d, 8);
965      }
966
967      // Fill 8 registers
968      //
969      // for forwards copy s was offset by -16 from the original input
970      // value of s so the register contents are at these offsets
971      // relative to the 64 bit block addressed by that original input
972      // and so on for each successive 64 byte block when s is updated
973      //
974      // t0 at offset 0,  t1 at offset 8
975      // t2 at offset 16, t3 at offset 24
976      // t4 at offset 32, t5 at offset 40
977      // t6 at offset 48, t7 at offset 56
978
979      // for backwards copy s was not offset so the register contents
980      // are at these offsets into the preceding 64 byte block
981      // relative to that original input and so on for each successive
982      // preceding 64 byte block when s is updated. this explains the
983      // slightly counter-intuitive looking pattern of register usage
984      // in the stp instructions for backwards copy.
985      //
986      // t0 at offset -16, t1 at offset -8
987      // t2 at offset -32, t3 at offset -24
988      // t4 at offset -48, t5 at offset -40
989      // t6 at offset -64, t7 at offset -56
990
991      __ ldp(t0, t1, Address(s, 2 * unit));
992      __ ldp(t2, t3, Address(s, 4 * unit));
993      __ ldp(t4, t5, Address(s, 6 * unit));
994      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
995
996      __ subs(count, count, 16);
997      __ br(Assembler::LO, drain);
998
999      int prefetch = PrefetchCopyIntervalInBytes;
1000      bool use_stride = false;
1001      if (direction == copy_backwards) {
1002         use_stride = prefetch > 256;
1003         prefetch = -prefetch;
1004         if (use_stride) __ mov(stride, prefetch);
1005      }
1006
1007      __ bind(again);
1008
1009      if (PrefetchCopyIntervalInBytes > 0)
1010        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
1011
1012      if (direction == copy_forwards) {
1013       // allowing for the offset of -8 the store instructions place
1014       // registers into the target 64 bit block at the following
1015       // offsets
1016       //
1017       // t0 at offset 0
1018       // t1 at offset 8,  t2 at offset 16
1019       // t3 at offset 24, t4 at offset 32
1020       // t5 at offset 40, t6 at offset 48
1021       // t7 at offset 56
1022
1023        __ str(t0, Address(d, 1 * unit));
1024        __ stp(t1, t2, Address(d, 2 * unit));
1025        __ ldp(t0, t1, Address(s, 2 * unit));
1026        __ stp(t3, t4, Address(d, 4 * unit));
1027        __ ldp(t2, t3, Address(s, 4 * unit));
1028        __ stp(t5, t6, Address(d, 6 * unit));
1029        __ ldp(t4, t5, Address(s, 6 * unit));
1030        __ str(t7, Address(__ pre(d, 8 * unit)));
1031        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
1032      } else {
1033       // d was not offset when we started so the registers are
1034       // written into the 64 bit block preceding d with the following
1035       // offsets
1036       //
1037       // t1 at offset -8
1038       // t3 at offset -24, t0 at offset -16
1039       // t5 at offset -40, t2 at offset -32
1040       // t7 at offset -56, t4 at offset -48
1041       //                   t6 at offset -64
1042       //
1043       // note that this matches the offsets previously noted for the
1044       // loads
1045
1046        __ str(t1, Address(d, 1 * unit));
1047        __ stp(t3, t0, Address(d, 3 * unit));
1048        __ ldp(t0, t1, Address(s, 2 * unit));
1049        __ stp(t5, t2, Address(d, 5 * unit));
1050        __ ldp(t2, t3, Address(s, 4 * unit));
1051        __ stp(t7, t4, Address(d, 7 * unit));
1052        __ ldp(t4, t5, Address(s, 6 * unit));
1053        __ str(t6, Address(__ pre(d, 8 * unit)));
1054        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
1055      }
1056
1057      __ subs(count, count, 8);
1058      __ br(Assembler::HS, again);
1059
1060      // Drain
1061      //
1062      // this uses the same pattern of offsets and register arguments
1063      // as above
1064      __ bind(drain);
1065      if (direction == copy_forwards) {
1066        __ str(t0, Address(d, 1 * unit));
1067        __ stp(t1, t2, Address(d, 2 * unit));
1068        __ stp(t3, t4, Address(d, 4 * unit));
1069        __ stp(t5, t6, Address(d, 6 * unit));
1070        __ str(t7, Address(__ pre(d, 8 * unit)));
1071      } else {
1072        __ str(t1, Address(d, 1 * unit));
1073        __ stp(t3, t0, Address(d, 3 * unit));
1074        __ stp(t5, t2, Address(d, 5 * unit));
1075        __ stp(t7, t4, Address(d, 7 * unit));
1076        __ str(t6, Address(__ pre(d, 8 * unit)));
1077      }
1078      // now we need to copy any remaining part block which may
1079      // include a 4 word subblock and/or a 2 word subblock.
1080      // bits 2 and 1 in the count are the tell-tale for whether we
1081      // have each such subblock
1082      {
1083        Label L1, L2;
1084        __ tbz(count, exact_log2(4), L1);
1085       // this is the same as above but copying only 4 longs hence
1086       // with only one intervening stp between the str instructions
1087       // but note that the offsets and registers still follow the
1088       // same pattern
1089        __ ldp(t0, t1, Address(s, 2 * unit));
1090        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
1091        if (direction == copy_forwards) {
1092          __ str(t0, Address(d, 1 * unit));
1093          __ stp(t1, t2, Address(d, 2 * unit));
1094          __ str(t3, Address(__ pre(d, 4 * unit)));
1095        } else {
1096          __ str(t1, Address(d, 1 * unit));
1097          __ stp(t3, t0, Address(d, 3 * unit));
1098          __ str(t2, Address(__ pre(d, 4 * unit)));
1099        }
1100        __ bind(L1);
1101
1102        __ tbz(count, 1, L2);
1103       // this is the same as above but copying only 2 longs hence
1104       // there is no intervening stp between the str instructions
1105       // but note that the offset and register patterns are still
1106       // the same
1107        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1108        if (direction == copy_forwards) {
1109          __ str(t0, Address(d, 1 * unit));
1110          __ str(t1, Address(__ pre(d, 2 * unit)));
1111        } else {
1112          __ str(t1, Address(d, 1 * unit));
1113          __ str(t0, Address(__ pre(d, 2 * unit)));
1114        }
1115        __ bind(L2);
1116
1117       // for forwards copy we need to re-adjust the offsets we
1118       // applied so that s and d follow the last words written
1119
1120       if (direction == copy_forwards) {
1121         __ add(s, s, 16);
1122         __ add(d, d, 8);
1123       }
1124
1125      }
1126
1127      __ ret(lr);
1128    }
1129  }
1130
1131  // Small copy: less than 16 bytes.
1132  //
1133  // NB: Ignores all of the bits of count which represent more than 15
1134  // bytes, so a caller doesn't have to mask them.
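  //
  // The stub tests the bits of count corresponding to 8-, 4-, 2- and 1-byte
  // chunks (adjusted for the element granularity) and emits one load/store
  // per set bit; e.g. with byte granularity and count == 13 (0b1101) it
  // copies 8 + 4 + 1 bytes.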
1135
1136  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1137    bool is_backwards = step < 0;
1138    size_t granularity = uabs(step);
1139    int direction = is_backwards ? -1 : 1;
1140    int unit = wordSize * direction;
1141
1142    Label Lpair, Lword, Lint, Lshort, Lbyte;
1143
1144    assert(granularity
1145           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1146
1147    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1148
1149    // ??? I don't know if this bit-test-and-branch is the right thing
1150    // to do.  It does a lot of jumping, resulting in several
1151    // mispredicted branches.  It might make more sense to do this
1152    // with something like Duff's device with a single computed branch.
1153
1154    __ tbz(count, 3 - exact_log2(granularity), Lword);
1155    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1156    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1157    __ bind(Lword);
1158
1159    if (granularity <= sizeof (jint)) {
1160      __ tbz(count, 2 - exact_log2(granularity), Lint);
1161      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1162      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1163      __ bind(Lint);
1164    }
1165
1166    if (granularity <= sizeof (jshort)) {
1167      __ tbz(count, 1 - exact_log2(granularity), Lshort);
1168      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1169      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1170      __ bind(Lshort);
1171    }
1172
1173    if (granularity <= sizeof (jbyte)) {
1174      __ tbz(count, 0, Lbyte);
1175      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1176      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1177      __ bind(Lbyte);
1178    }
1179  }
1180
1181  Label copy_f, copy_b;
1182
1183  // All-singing all-dancing memory copy.
1184  //
1185  // Copy count units of memory from s to d.  The size of a unit is
1186  // step, which can be positive or negative depending on the direction
1187  // of copy.  If is_aligned is false, we align the source address.
1188  //
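  // Strategy: copies of at most 80 bytes (96 with SIMD) are done inline with
  // loads and stores that overlap at the ends of the range; anything larger
  // aligns the source to a 2-word boundary, bulk-copies 8-word blocks via
  // copy_f/copy_b, and finishes the tail with copy_memory_small.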
1189
1190  void copy_memory(bool is_aligned, Register s, Register d,
1191                   Register count, Register tmp, int step) {
1192    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1193    bool is_backwards = step < 0;
1194    int granularity = uabs(step);
1195    const Register t0 = r3, t1 = r4;
1196
1197    // Copies of <= 96 bytes are done inline. Direction doesn't matter
1198    // because we always load all of the data before writing any of it.
1199    Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
1200    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1201    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1202    const Register send = r17, dend = r18;
1203
1204    if (PrefetchCopyIntervalInBytes > 0)
1205      __ prfm(Address(s, 0), PLDL1KEEP);
1206    __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
1207    __ br(Assembler::HI, copy_big);
1208
1209    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1210    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1211
1212    __ cmp(count, 16/granularity);
1213    __ br(Assembler::LS, copy16);
1214
1215    __ cmp(count, 64/granularity);
1216    __ br(Assembler::HI, copy80);
1217
1218    __ cmp(count, 32/granularity);
1219    __ br(Assembler::LS, copy32);
1220
1221    // 33..64 bytes
1222    if (UseSIMDForMemoryOps) {
1223      __ ldpq(v0, v1, Address(s, 0));
1224      __ ldpq(v2, v3, Address(send, -32));
1225      __ stpq(v0, v1, Address(d, 0));
1226      __ stpq(v2, v3, Address(dend, -32));
1227    } else {
1228      __ ldp(t0, t1, Address(s, 0));
1229      __ ldp(t2, t3, Address(s, 16));
1230      __ ldp(t4, t5, Address(send, -32));
1231      __ ldp(t6, t7, Address(send, -16));
1232
1233      __ stp(t0, t1, Address(d, 0));
1234      __ stp(t2, t3, Address(d, 16));
1235      __ stp(t4, t5, Address(dend, -32));
1236      __ stp(t6, t7, Address(dend, -16));
1237    }
1238    __ b(finish);
1239
1240    // 17..32 bytes
1241    __ bind(copy32);
1242    __ ldp(t0, t1, Address(s, 0));
1243    __ ldp(t2, t3, Address(send, -16));
1244    __ stp(t0, t1, Address(d, 0));
1245    __ stp(t2, t3, Address(dend, -16));
1246    __ b(finish);
1247
1248    // 65..80/96 bytes
1249    // (96 bytes if SIMD because we do 32 bytes per instruction)
1250    __ bind(copy80);
1251    if (UseSIMDForMemoryOps) {
1252      __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1253      __ ldpq(v4, v5, Address(send, -32));
1254      __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1255      __ stpq(v4, v5, Address(dend, -32));
1256    } else {
1257      __ ldp(t0, t1, Address(s, 0));
1258      __ ldp(t2, t3, Address(s, 16));
1259      __ ldp(t4, t5, Address(s, 32));
1260      __ ldp(t6, t7, Address(s, 48));
1261      __ ldp(t8, t9, Address(send, -16));
1262
1263      __ stp(t0, t1, Address(d, 0));
1264      __ stp(t2, t3, Address(d, 16));
1265      __ stp(t4, t5, Address(d, 32));
1266      __ stp(t6, t7, Address(d, 48));
1267      __ stp(t8, t9, Address(dend, -16));
1268    }
1269    __ b(finish);
1270
1271    // 0..16 bytes
1272    __ bind(copy16);
1273    __ cmp(count, 8/granularity);
1274    __ br(Assembler::LO, copy8);
1275
1276    // 8..16 bytes
1277    __ ldr(t0, Address(s, 0));
1278    __ ldr(t1, Address(send, -8));
1279    __ str(t0, Address(d, 0));
1280    __ str(t1, Address(dend, -8));
1281    __ b(finish);
1282
1283    if (granularity < 8) {
1284      // 4..7 bytes
1285      __ bind(copy8);
1286      __ tbz(count, 2 - exact_log2(granularity), copy4);
1287      __ ldrw(t0, Address(s, 0));
1288      __ ldrw(t1, Address(send, -4));
1289      __ strw(t0, Address(d, 0));
1290      __ strw(t1, Address(dend, -4));
1291      __ b(finish);
1292      if (granularity < 4) {
1293        // 0..3 bytes
1294        __ bind(copy4);
1295        __ cbz(count, finish); // get rid of 0 case
1296        if (granularity == 2) {
1297          __ ldrh(t0, Address(s, 0));
1298          __ strh(t0, Address(d, 0));
1299        } else { // granularity == 1
1300          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1301          // the first and last byte.
1302          // Handle the 3 byte case by loading and storing base + count/2
1303          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1304          // This does mean in the 1 byte case we load/store the same
1305          // byte 3 times.
1306          __ lsr(count, count, 1);
1307          __ ldrb(t0, Address(s, 0));
1308          __ ldrb(t1, Address(send, -1));
1309          __ ldrb(t2, Address(s, count));
1310          __ strb(t0, Address(d, 0));
1311          __ strb(t1, Address(dend, -1));
1312          __ strb(t2, Address(d, count));
1313        }
1314        __ b(finish);
1315      }
1316    }
1317
1318    __ bind(copy_big);
1319    if (is_backwards) {
1320      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1321      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1322    }
1323
1324    // Now that we've got the small case out of the way we can align the
1325    // source address on a 2-word boundary.
1326
1327    Label aligned;
1328
1329    if (is_aligned) {
1330      // We may have to adjust by 1 word to get s 2-word-aligned.
1331      __ tbz(s, exact_log2(wordSize), aligned);
1332      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1333      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1334      __ sub(count, count, wordSize/granularity);
1335    } else {
1336      if (is_backwards) {
1337        __ andr(rscratch2, s, 2 * wordSize - 1);
1338      } else {
1339        __ neg(rscratch2, s);
1340        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1341      }
1342      // rscratch2 is the byte adjustment needed to align s.
1343      __ cbz(rscratch2, aligned);
1344      int shift = exact_log2(granularity);
1345      if (shift)  __ lsr(rscratch2, rscratch2, shift);
1346      __ sub(count, count, rscratch2);
1347
1348#if 0
1349      // ?? This code is only correct for a disjoint copy.  It may or
1350      // may not make sense to use it in that case.
1351
1352      // Copy the first pair; s and d may not be aligned.
1353      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1354      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1355
1356      // Align s and d, adjust count
1357      if (is_backwards) {
1358        __ sub(s, s, rscratch2);
1359        __ sub(d, d, rscratch2);
1360      } else {
1361        __ add(s, s, rscratch2);
1362        __ add(d, d, rscratch2);
1363      }
1364#else
1365      copy_memory_small(s, d, rscratch2, rscratch1, step);
1366#endif
1367    }
1368
1369    __ bind(aligned);
1370
1371    // s is now 2-word-aligned.
1372
1373    // We have a count of units and some trailing bytes.  Adjust the
1374    // count and do a bulk copy of words.
1375    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1376    if (direction == copy_forwards)
1377      __ bl(copy_f);
1378    else
1379      __ bl(copy_b);
1380
1381    // And the tail.
1382    copy_memory_small(s, d, count, tmp, step);
1383
1384    if (granularity >= 8) __ bind(copy8);
1385    if (granularity >= 4) __ bind(copy4);
1386    __ bind(finish);
1387  }
1388
1389
1390  void clobber_registers() {
1391#ifdef ASSERT
1392    __ mov(rscratch1, (uint64_t)0xdeadbeef);
1393    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1394    for (Register r = r3; r <= r18; r++)
1395      if (r != rscratch1) __ mov(r, rscratch1);
1396#endif
1397  }
1398
1399  // Scan over array at a for count oops, verifying each one.
1400  // Preserves a and count, clobbers rscratch1 and rscratch2.
1401  void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1402    Label loop, end;
1403    __ mov(rscratch1, a);
1404    __ mov(rscratch2, zr);
1405    __ bind(loop);
1406    __ cmp(rscratch2, count);
1407    __ br(Assembler::HS, end);
1408    if (size == (size_t)wordSize) {
1409      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1410      __ verify_oop(temp);
1411    } else {
1412      __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1413      __ decode_heap_oop(temp); // calls verify_oop
1414    }
1415    __ add(rscratch2, rscratch2, size);
1416    __ b(loop);
1417    __ bind(end);
1418  }
1419
1420  // Arguments:
1421  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1422  //             ignored
1423  //   is_oop  - true => oop array, so generate store check code
1424  //   name    - stub name string
1425  //
1426  // Inputs:
1427  //   c_rarg0   - source array address
1428  //   c_rarg1   - destination array address
1429  //   c_rarg2   - element count, treated as ssize_t, can be zero
1430  //
1431  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1432  // the hardware handle it.  The two dwords within qwords that span
1433  // cache line boundaries will still be loaded and stored atomically.
1434  //
1435  // Side Effects:
1436  //   disjoint_int_copy_entry is set to the no-overlap entry point
1437  //   used by generate_conjoint_int_oop_copy().
1438  //
1439  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1440                                  const char *name, bool dest_uninitialized = false) {
1441    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1442    __ align(CodeEntryAlignment);
1443    StubCodeMark mark(this, "StubRoutines", name);
1444    address start = __ pc();
1445    __ enter();
1446
1447    if (entry != NULL) {
1448      *entry = __ pc();
1449      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1450      BLOCK_COMMENT("Entry:");
1451    }
1452
1453    if (is_oop) {
1454      __ push(RegSet::of(d, count), sp);
1455      // no registers are destroyed by this call
1456      gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
1457    }
1458    copy_memory(aligned, s, d, count, rscratch1, size);
1459    if (is_oop) {
1460      __ pop(RegSet::of(d, count), sp);
1461      if (VerifyOops)
1462        verify_oop_array(size, d, count, r16);
1463      __ sub(count, count, 1); // make an inclusive end pointer
1464      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1465      gen_write_ref_array_post_barrier(d, count, rscratch1);
1466    }
1467    __ leave();
1468    __ mov(r0, zr); // return 0
1469    __ ret(lr);
1470#ifdef BUILTIN_SIM
1471    {
1472      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1473      sim->notifyCompile(const_cast<char*>(name), start);
1474    }
1475#endif
1476    return start;
1477  }
1478
1479  // Arguments:
1480  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1481  //             ignored
1482  //   is_oop  - true => oop array, so generate store check code
1483  //   name    - stub name string
1484  //
1485  // Inputs:
1486  //   c_rarg0   - source array address
1487  //   c_rarg1   - destination array address
1488  //   c_rarg2   - element count, treated as ssize_t, can be zero
1489  //
1490  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1491  // the hardware handle it.  The two dwords within qwords that span
1492  // cache line boundaries will still be loaded and stored atomically.
1493  //
1494  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1495                                 address *entry, const char *name,
1496                                 bool dest_uninitialized = false) {
1497    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1498
1499    StubCodeMark mark(this, "StubRoutines", name);
1500    address start = __ pc();
1501    __ enter();
1502
1503    if (entry != NULL) {
1504      *entry = __ pc();
1505      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1506      BLOCK_COMMENT("Entry:");
1507    }
1508
1509    // use fwd copy when (d-s) above_equal (count*size)
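    // The single unsigned compare covers both no-overlap cases: if d is at
    // least count*size bytes above s there is no overlap, and if d is below
    // s the subtraction wraps to a huge unsigned value, which also selects
    // the forward copy (safe because the destination precedes the source).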
1510    __ sub(rscratch1, d, s);
1511    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1512    __ br(Assembler::HS, nooverlap_target);
1513
1514    if (is_oop) {
1515      __ push(RegSet::of(d, count), sp);
1516      // no registers are destroyed by this call
1517      gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
1518    }
1519    copy_memory(aligned, s, d, count, rscratch1, -size);
1520    if (is_oop) {
1521      __ pop(RegSet::of(d, count), sp);
1522      if (VerifyOops)
1523        verify_oop_array(size, d, count, r16);
1524      __ sub(count, count, 1); // make an inclusive end pointer
1525      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1526      gen_write_ref_array_post_barrier(d, count, rscratch1);
1527    }
1528    __ leave();
1529    __ mov(r0, zr); // return 0
1530    __ ret(lr);
1531#ifdef BUILTIN_SIM
1532    {
1533      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1534      sim->notifyCompile(const_cast<char*>(name), start);
1535    }
1536#endif
1537    return start;
1538  }
1539
1540  // Arguments:
1541  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1542  //             ignored
1543  //   name    - stub name string
1544  //
1545  // Inputs:
1546  //   c_rarg0   - source array address
1547  //   c_rarg1   - destination array address
1548  //   c_rarg2   - element count, treated as ssize_t, can be zero
1549  //
1550  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1551  // we let the hardware handle it.  The one to eight bytes within words,
1552  // dwords or qwords that span cache line boundaries will still be loaded
1553  // and stored atomically.
1554  //
1562  // Side Effects:
1563  //   disjoint_byte_copy_entry is set to the no-overlap entry point
1564  //   used by generate_conjoint_byte_copy().
1565  //
1566  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1567    const bool not_oop = false;
1568    return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1569  }
1570
1571  // Arguments:
1572  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1573  //             ignored
1574  //   name    - stub name string
1575  //
1576  // Inputs:
1577  //   c_rarg0   - source array address
1578  //   c_rarg1   - destination array address
1579  //   c_rarg2   - element count, treated as ssize_t, can be zero
1580  //
1581  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1582  // we let the hardware handle it.  The one to eight bytes within words,
1583  // dwords or qwords that span cache line boundaries will still be loaded
1584  // and stored atomically.
1585  //
1586  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1587                                      address* entry, const char *name) {
1588    const bool not_oop = false;
1589    return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1590  }
1591
1592  // Arguments:
1593  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1594  //             ignored
1595  //   name    - stub name string
1596  //
1597  // Inputs:
1598  //   c_rarg0   - source array address
1599  //   c_rarg1   - destination array address
1600  //   c_rarg2   - element count, treated as ssize_t, can be zero
1601  //
1602  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1603  // let the hardware handle it.  The two or four words within dwords
1604  // or qwords that span cache line boundaries will still be loaded
1605  // and stored atomically.
1606  //
1607  // Side Effects:
1608  //   disjoint_short_copy_entry is set to the no-overlap entry point
1609  //   used by generate_conjoint_short_copy().
1610  //
1611  address generate_disjoint_short_copy(bool aligned,
1612                                       address* entry, const char *name) {
1613    const bool not_oop = false;
1614    return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1615  }
1616
1617  // Arguments:
1618  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1619  //             ignored
1620  //   name    - stub name string
1621  //
1622  // Inputs:
1623  //   c_rarg0   - source array address
1624  //   c_rarg1   - destination array address
1625  //   c_rarg2   - element count, treated as ssize_t, can be zero
1626  //
1627  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1628  // let the hardware handle it.  The two or four words within dwords
1629  // or qwords that span cache line boundaries will still be loaded
1630  // and stored atomically.
1631  //
1632  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1633                                       address *entry, const char *name) {
1634    const bool not_oop = false;
1635    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1636  }
1637
1638  // Arguments:
1639  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1640  //             ignored
1641  //   name    - stub name string
1642  //
1643  // Inputs:
1644  //   c_rarg0   - source array address
1645  //   c_rarg1   - destination array address
1646  //   c_rarg2   - element count, treated as ssize_t, can be zero
1647  //
1648  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1649  // the hardware handle it.  The two dwords within qwords that span
1650  // cache line boundaries will still be loaded and stored atomically.
1651  //
1652  // Side Effects:
1653  //   disjoint_int_copy_entry is set to the no-overlap entry point
1654  //   used by generate_conjoint_int_oop_copy().
1655  //
1656  address generate_disjoint_int_copy(bool aligned, address *entry,
1657                                         const char *name, bool dest_uninitialized = false) {
1658    const bool not_oop = false;
1659    return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1660  }
1661
1662  // Arguments:
1663  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1664  //             ignored
1665  //   name    - stub name string
1666  //
1667  // Inputs:
1668  //   c_rarg0   - source array address
1669  //   c_rarg1   - destination array address
1670  //   c_rarg2   - element count, treated as ssize_t, can be zero
1671  //
1672  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1673  // the hardware handle it.  The two dwords within qwords that span
1674  // cache line boundaries will still be loaded and stored atomically.
1675  //
1676  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1677                                     address *entry, const char *name,
1678                                     bool dest_uninitialized = false) {
1679    const bool not_oop = false;
1680    return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1681  }
1682
1683
1684  // Arguments:
1685  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1686  //             ignored
1687  //   name    - stub name string
1688  //
1689  // Inputs:
1690  //   c_rarg0   - source array address
1691  //   c_rarg1   - destination array address
1692  //   c_rarg2   - element count, treated as size_t, can be zero
1693  //
1694  // Side Effects:
1695  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1696  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1697  //
1698  address generate_disjoint_long_copy(bool aligned, address *entry,
1699                                          const char *name, bool dest_uninitialized = false) {
1700    const bool not_oop = false;
1701    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1702  }
1703
1704  // Arguments:
1705  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1706  //             ignored
1707  //   name    - stub name string
1708  //
1709  // Inputs:
1710  //   c_rarg0   - source array address
1711  //   c_rarg1   - destination array address
1712  //   c_rarg2   - element count, treated as size_t, can be zero
1713  //
1714  address generate_conjoint_long_copy(bool aligned,
1715                                      address nooverlap_target, address *entry,
1716                                      const char *name, bool dest_uninitialized = false) {
1717    const bool not_oop = false;
1718    return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1719  }
1720
1721  // Arguments:
1722  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1723  //             ignored
1724  //   name    - stub name string
1725  //
1726  // Inputs:
1727  //   c_rarg0   - source array address
1728  //   c_rarg1   - destination array address
1729  //   c_rarg2   - element count, treated as size_t, can be zero
1730  //
1731  // Side Effects:
1732  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1733  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1734  //
1735  address generate_disjoint_oop_copy(bool aligned, address *entry,
1736                                     const char *name, bool dest_uninitialized) {
1737    const bool is_oop = true;
1738    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1739    return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1740  }
1741
1742  // Arguments:
1743  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1744  //             ignored
1745  //   name    - stub name string
1746  //
1747  // Inputs:
1748  //   c_rarg0   - source array address
1749  //   c_rarg1   - destination array address
1750  //   c_rarg2   - element count, treated as size_t, can be zero
1751  //
1752  address generate_conjoint_oop_copy(bool aligned,
1753                                     address nooverlap_target, address *entry,
1754                                     const char *name, bool dest_uninitialized) {
1755    const bool is_oop = true;
1756    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1757    return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1758                                  name, dest_uninitialized);
1759  }
1760
1761
1762  // Helper for generating a dynamic type check.
1763  // Smashes rscratch1.
1764  void generate_type_check(Register sub_klass,
1765                           Register super_check_offset,
1766                           Register super_klass,
1767                           Label& L_success) {
1768    assert_different_registers(sub_klass, super_check_offset, super_klass);
1769
1770    BLOCK_COMMENT("type_check:");
1771
1772    Label L_miss;
1773
1774    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1775                                     super_check_offset);
1776    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
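    // In outline: the fast path decides from the word at super_check_offset
    // (the primary supers array or the secondary-super cache) and branches to
    // L_success or L_miss when it can; otherwise it falls through to the slow
    // path, which scans the secondary supers list.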
1777
1778    // Fall through on failure!
1779    __ BIND(L_miss);
1780  }
1781
1782  //
1783  //  Generate checkcasting array copy stub
1784  //
1785  //  Input:
1786  //    c_rarg0   - source array address
1787  //    c_rarg1   - destination array address
1788  //    c_rarg2   - element count, treated as ssize_t, can be zero
1789  //    c_rarg3   - size_t ckoff (super_check_offset)
1790  //    c_rarg4   - oop ckval (super_klass)
1791  //
1792  //  Output:
1793  //    r0 ==  0  -  success
1794  //    r0 == -1^K - failure, where K is partial transfer count
1795  //
1796  address generate_checkcast_copy(const char *name, address *entry,
1797                                  bool dest_uninitialized = false) {
1798
1799    Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1800
1801    // Input registers (after setup_arg_regs)
1802    const Register from        = c_rarg0;   // source array address
1803    const Register to          = c_rarg1;   // destination array address
1804    const Register count       = c_rarg2;   // elements count
1805    const Register ckoff       = c_rarg3;   // super_check_offset
1806    const Register ckval       = c_rarg4;   // super_klass
1807
1808    // Registers used as temps (r18, r19, r20 are save-on-entry)
1809    const Register count_save  = r21;       // orig elements count
1810    const Register start_to    = r20;       // destination array start address
1811    const Register copied_oop  = r18;       // actual oop copied
1812    const Register r19_klass   = r19;       // oop._klass
1813
1814    //---------------------------------------------------------------
1815    // Assembler stub will be used for this call to arraycopy
1816    // if the two arrays are subtypes of Object[] but the
1817    // destination array type is not equal to or a supertype
1818    // of the source type.  Each element must be separately
1819    // checked.
1820
1821    assert_different_registers(from, to, count, ckoff, ckval, start_to,
1822                               copied_oop, r19_klass, count_save);
1823
1824    __ align(CodeEntryAlignment);
1825    StubCodeMark mark(this, "StubRoutines", name);
1826    address start = __ pc();
1827
1828    __ enter(); // required for proper stackwalking of RuntimeStub frame
1829
1830#ifdef ASSERT
1831    // caller guarantees that the arrays really are different
1832    // otherwise, we would have to make conjoint checks
1833    { Label L;
1834      array_overlap_test(L, TIMES_OOP);
1835      __ stop("checkcast_copy within a single array");
1836      __ bind(L);
1837    }
1838#endif //ASSERT
1839
1840    // Caller of this entry point must set up the argument registers.
1841    if (entry != NULL) {
1842      *entry = __ pc();
1843      BLOCK_COMMENT("Entry:");
1844    }
1845
1846     // Empty array:  Nothing to do.
1847    __ cbz(count, L_done);
1848
1849    __ push(RegSet::of(r18, r19, r20, r21), sp);
1850
1851#ifdef ASSERT
1852    BLOCK_COMMENT("assert consistent ckoff/ckval");
1853    // The ckoff and ckval must be mutually consistent,
1854    // even though caller generates both.
1855    { Label L;
1856      int sco_offset = in_bytes(Klass::super_check_offset_offset());
1857      __ ldrw(start_to, Address(ckval, sco_offset));
1858      __ cmpw(ckoff, start_to);
1859      __ br(Assembler::EQ, L);
1860      __ stop("super_check_offset inconsistent");
1861      __ bind(L);
1862    }
1863#endif //ASSERT
1864
1865    gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
1866
1867    // save the original count
1868    __ mov(count_save, count);
1869
1870    // Copy from low to high addresses
1871    __ mov(start_to, to);              // Save destination array start address
1872    __ b(L_load_element);
1873
1874    // ======== begin loop ========
1875    // (Loop is rotated; its entry is L_load_element.)
1876    // Loop control:
1877    //   for (; count != 0; count--) {
1878    //     copied_oop = load_heap_oop(from++);
1879    //     ... generate_type_check ...;
1880    //     store_heap_oop(to++, copied_oop);
1881    //   }
1882    __ align(OptoLoopAlignment);
1883
1884    __ BIND(L_store_element);
1885    __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop);  // store the oop
1886    __ sub(count, count, 1);
1887    __ cbz(count, L_do_card_marks);
1888
1889    // ======== loop entry is here ========
1890    __ BIND(L_load_element);
1891    __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
1892    __ cbz(copied_oop, L_store_element);
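    // A null element needs no subtype check; store it directly.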
1893
1894    __ load_klass(r19_klass, copied_oop);// query the object klass
1895    generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1896    // ======== end loop ========
1897
1898    // It was a real error; we must depend on the caller to finish the job.
1899    // Register count = remaining oops, count_orig = total oops.
1900    // Emit GC store barriers for the oops we have copied and report
1901    // their number to the caller.
1902
1903    __ subs(count, count_save, count);     // K = partially copied oop count
1904    __ eon(count, count, zr);                   // report (-1^K) to caller
1905    __ br(Assembler::EQ, L_done_pop);
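    // eon with zr is a bitwise NOT, so r0 reports ~K == -1 - K (e.g. 2 of 5
    // elements copied gives -3) and the caller recovers K as ~r0.  The EQ test
    // uses the flags from the subs above: K == 0 means no oops were stored,
    // so the card marks can be skipped.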
1906
1907    __ BIND(L_do_card_marks);
1908    __ add(to, to, -heapOopSize);         // make an inclusive end pointer
1909    gen_write_ref_array_post_barrier(start_to, to, rscratch1);
1910
1911    __ bind(L_done_pop);
1912    __ pop(RegSet::of(r18, r19, r20, r21), sp);
1913    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1914
1915    __ bind(L_done);
1916    __ mov(r0, count);
1917    __ leave();
1918    __ ret(lr);
1919
1920    return start;
1921  }
1922
1923  // Perform range checks on the proposed arraycopy.
1924  // Kills temp, but nothing else.
1925  // Also, clean the sign bits of src_pos and dst_pos.
1926  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1927                              Register src_pos, // source position (c_rarg1)
1928                              Register dst,     // destination array oop (c_rarg2)
1929                              Register dst_pos, // destination position (c_rarg3)
1930                              Register length,
1931                              Register temp,
1932                              Label& L_failed) {
1933    BLOCK_COMMENT("arraycopy_range_checks:");
1934
1935    assert_different_registers(rscratch1, temp);
1936
1937    //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1938    __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1939    __ addw(temp, length, src_pos);
1940    __ cmpw(temp, rscratch1);
1941    __ br(Assembler::HI, L_failed);
1942
1943    //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1944    __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1945    __ addw(temp, length, dst_pos);
1946    __ cmpw(temp, rscratch1);
1947    __ br(Assembler::HI, L_failed);
1948
1949    // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1950    __ movw(src_pos, src_pos);
1951    __ movw(dst_pos, dst_pos);
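    // A 32-bit register write zero-extends, clearing bits 63:32.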
1952
1953    BLOCK_COMMENT("arraycopy_range_checks done");
1954  }
1955
1956  // These stubs get called from some dumb test routine.
1957  // I'll write them properly when they're called from
1958  // something that's actually doing something.
1959  static void fake_arraycopy_stub(address src, address dst, int count) {
1960    assert(count == 0, "huh?");
1961  }
1962
1963
1964  //
1965  //  Generate 'unsafe' array copy stub
1966  //  Though just as safe as the other stubs, it takes an unscaled
1967  //  size_t argument instead of an element count.
1968  //
1969  //  Input:
1970  //    c_rarg0   - source array address
1971  //    c_rarg1   - destination array address
1972  //    c_rarg2   - byte count, treated as ssize_t, can be zero
1973  //
1974  // Examines the alignment of the operands and dispatches
1975  // to a long, int, short, or byte copy loop.
1976  //
1977  address generate_unsafe_copy(const char *name,
1978                               address byte_copy_entry,
1979                               address short_copy_entry,
1980                               address int_copy_entry,
1981                               address long_copy_entry) {
1982    Label L_long_aligned, L_int_aligned, L_short_aligned;
1983    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1984
1985    __ align(CodeEntryAlignment);
1986    StubCodeMark mark(this, "StubRoutines", name);
1987    address start = __ pc();
1988    __ enter(); // required for proper stackwalking of RuntimeStub frame
1989
1990    // bump this on entry, not on exit:
1991    inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1992
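    // OR source, destination and byte count together: the low bits of the
    // result bound the common alignment, so test 8-byte, then 4-byte, then
    // 2-byte alignment, falling back to a byte copy otherwise.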
1993    __ orr(rscratch1, s, d);
1994    __ orr(rscratch1, rscratch1, count);
1995
1996    __ andr(rscratch1, rscratch1, BytesPerLong-1);
1997    __ cbz(rscratch1, L_long_aligned);
1998    __ andr(rscratch1, rscratch1, BytesPerInt-1);
1999    __ cbz(rscratch1, L_int_aligned);
2000    __ tbz(rscratch1, 0, L_short_aligned);
2001    __ b(RuntimeAddress(byte_copy_entry));
2002
2003    __ BIND(L_short_aligned);
2004    __ lsr(count, count, LogBytesPerShort);  // size => short_count
2005    __ b(RuntimeAddress(short_copy_entry));
2006    __ BIND(L_int_aligned);
2007    __ lsr(count, count, LogBytesPerInt);    // size => int_count
2008    __ b(RuntimeAddress(int_copy_entry));
2009    __ BIND(L_long_aligned);
2010    __ lsr(count, count, LogBytesPerLong);   // size => long_count
2011    __ b(RuntimeAddress(long_copy_entry));
2012
2013    return start;
2014  }
2015
2016  //
2017  //  Generate generic array copy stubs
2018  //
2019  //  Input:
2020  //    c_rarg0    -  src oop
2021  //    c_rarg1    -  src_pos (32-bits)
2022  //    c_rarg2    -  dst oop
2023  //    c_rarg3    -  dst_pos (32-bits)
2024  //    c_rarg4    -  element count (32-bits)
2025  //
2026  //  Output:
2027  //    r0 ==  0  -  success
2028  //    r0 == -1^K - failure, where K is partial transfer count
2029  //
2030  address generate_generic_copy(const char *name,
2031                                address byte_copy_entry, address short_copy_entry,
2032                                address int_copy_entry, address oop_copy_entry,
2033                                address long_copy_entry, address checkcast_copy_entry) {
2034
2035    Label L_failed, L_failed_0, L_objArray;
2036    Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2037
2038    // Input registers
2039    const Register src        = c_rarg0;  // source array oop
2040    const Register src_pos    = c_rarg1;  // source position
2041    const Register dst        = c_rarg2;  // destination array oop
2042    const Register dst_pos    = c_rarg3;  // destination position
2043    const Register length     = c_rarg4;
2044
2045    StubCodeMark mark(this, "StubRoutines", name);
2046
2047    __ align(CodeEntryAlignment);
2048    address start = __ pc();
2049
2050    __ enter(); // required for proper stackwalking of RuntimeStub frame
2051
2052    // bump this on entry, not on exit:
2053    inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2054
2055    //-----------------------------------------------------------------------
2056    // Assembler stub will be used for this call to arraycopy
2057    // if the following conditions are met:
2058    //
2059    // (1) src and dst must not be null.
2060    // (2) src_pos must not be negative.
2061    // (3) dst_pos must not be negative.
2062    // (4) length  must not be negative.
2063    // (5) src klass and dst klass should be the same and not NULL.
2064    // (6) src and dst should be arrays.
2065    // (7) src_pos + length must not exceed length of src.
2066    // (8) dst_pos + length must not exceed length of dst.
2067    //
2068
2069    //  if (src == NULL) return -1;
2070    __ cbz(src, L_failed);
2071
2072    //  if (src_pos < 0) return -1;
2073    __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2074
2075    //  if (dst == NULL) return -1;
2076    __ cbz(dst, L_failed);
2077
2078    //  if (dst_pos < 0) return -1;
2079    __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2080
2081    // registers used as temp
2082    const Register scratch_length    = r16; // elements count to copy
2083    const Register scratch_src_klass = r17; // array klass
2084    const Register lh                = r18; // layout helper
2085
2086    //  if (length < 0) return -1;
2087    __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2088    __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2089
2090    __ load_klass(scratch_src_klass, src);
2091#ifdef ASSERT
2092    //  assert(src->klass() != NULL);
2093    {
2094      BLOCK_COMMENT("assert klasses not null {");
2095      Label L1, L2;
2096      __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2097      __ bind(L1);
2098      __ stop("broken null klass");
2099      __ bind(L2);
2100      __ load_klass(rscratch1, dst);
2101      __ cbz(rscratch1, L1);     // this would be broken also
2102      BLOCK_COMMENT("} assert klasses not null done");
2103    }
2104#endif
2105
2106    // Load layout helper (32-bits)
2107    //
2108    //  |array_tag|     | header_size | element_type |     |log2_element_size|
2109    // 32        30    24            16              8     2                 0
2110    //
2111    //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2112    //
2113
2114    const int lh_offset = in_bytes(Klass::layout_helper_offset());
2115
2116    // Handle objArrays completely differently...
2117    const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2118    __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2119    __ movw(rscratch1, objArray_lh);
2120    __ eorw(rscratch2, lh, rscratch1);
2121    __ cbzw(rscratch2, L_objArray);
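    // All objArrays share a single layout helper value, so an exact 32-bit
    // match identifies an oop array.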
2122
2123    //  if (src->klass() != dst->klass()) return -1;
2124    __ load_klass(rscratch2, dst);
2125    __ eor(rscratch2, rscratch2, scratch_src_klass);
2126    __ cbnz(rscratch2, L_failed);
2127
2128    //  if (!src->is_Array()) return -1;
2129    __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
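    // Array layout helpers carry the tag in bits 31:30 and are negative, so a
    // clear sign bit means 'not an array'.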
2130
2131    // At this point, it is known to be a typeArray (array_tag 0x3).
2132#ifdef ASSERT
2133    {
2134      BLOCK_COMMENT("assert primitive array {");
2135      Label L;
2136      __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2137      __ cmpw(lh, rscratch2);
2138      __ br(Assembler::GE, L);
2139      __ stop("must be a primitive array");
2140      __ bind(L);
2141      BLOCK_COMMENT("} assert primitive array done");
2142    }
2143#endif
2144
2145    arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2146                           rscratch2, L_failed);
2147
2148    // TypeArrayKlass
2149    //
2150    // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2151    // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2152    //
2153
2154    const Register rscratch1_offset = rscratch1;    // array offset
2155    const Register r18_elsize = lh; // element size
2156
2157    __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2158           exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2159    __ add(src, src, rscratch1_offset);           // src array offset
2160    __ add(dst, dst, rscratch1_offset);           // dst array offset
2161    BLOCK_COMMENT("choose copy loop based on element size");
2162
2163    // next registers should be set before the jump to corresponding stub
2164    const Register from     = c_rarg0;  // source array address
2165    const Register to       = c_rarg1;  // destination array address
2166    const Register count    = c_rarg2;  // elements count
2167
2168    // 'from', 'to', 'count' registers should be set in such order
2169    // since they are the same as 'src', 'src_pos', 'dst'.
2170
2171    assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2172
2173    // The possible values of elsize are 0-3, i.e. exact_log2(element
2174    // size in bytes).  We do a simple bitwise binary search.
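    // elsize encoding: 0 = byte, 1 = short, 2 = int, 3 = long.  Bit 1 splits
    // {byte, short} from {int, long}; bit 0 resolves within each pair.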
2175  __ BIND(L_copy_bytes);
2176    __ tbnz(r18_elsize, 1, L_copy_ints);
2177    __ tbnz(r18_elsize, 0, L_copy_shorts);
2178    __ lea(from, Address(src, src_pos));// src_addr
2179    __ lea(to,   Address(dst, dst_pos));// dst_addr
2180    __ movw(count, scratch_length); // length
2181    __ b(RuntimeAddress(byte_copy_entry));
2182
2183  __ BIND(L_copy_shorts);
2184    __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2185    __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2186    __ movw(count, scratch_length); // length
2187    __ b(RuntimeAddress(short_copy_entry));
2188
2189  __ BIND(L_copy_ints);
2190    __ tbnz(r18_elsize, 0, L_copy_longs);
2191    __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2192    __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2193    __ movw(count, scratch_length); // length
2194    __ b(RuntimeAddress(int_copy_entry));
2195
2196  __ BIND(L_copy_longs);
2197#ifdef ASSERT
2198    {
2199      BLOCK_COMMENT("assert long copy {");
2200      Label L;
2201      __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2202      __ cmpw(r18_elsize, LogBytesPerLong);
2203      __ br(Assembler::EQ, L);
2204      __ stop("must be long copy, but elsize is wrong");
2205      __ bind(L);
2206      BLOCK_COMMENT("} assert long copy done");
2207    }
2208#endif
2209    __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2210    __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2211    __ movw(count, scratch_length); // length
2212    __ b(RuntimeAddress(long_copy_entry));
2213
2214    // ObjArrayKlass
2215  __ BIND(L_objArray);
2216    // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2217
2218    Label L_plain_copy, L_checkcast_copy;
2219    //  test array classes for subtyping
2220    __ load_klass(r18, dst);
2221    __ cmp(scratch_src_klass, r18); // usual case is exact equality
2222    __ br(Assembler::NE, L_checkcast_copy);
2223
2224    // Identically typed arrays can be copied without element-wise checks.
2225    arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2226                           rscratch2, L_failed);
2227
2228    __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2229    __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2230    __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2231    __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2232    __ movw(count, scratch_length); // length
2233  __ BIND(L_plain_copy);
2234    __ b(RuntimeAddress(oop_copy_entry));
2235
2236  __ BIND(L_checkcast_copy);
2237    // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2238    {
2239      // Before looking at dst.length, make sure dst is also an objArray.
2240      __ ldrw(rscratch1, Address(r18, lh_offset));
2241      __ movw(rscratch2, objArray_lh);
2242      __ eorw(rscratch1, rscratch1, rscratch2);
2243      __ cbnzw(rscratch1, L_failed);
2244
2245      // It is safe to examine both src.length and dst.length.
2246      arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2247                             r18, L_failed);
2248
2249      const Register rscratch2_dst_klass = rscratch2;
2250      __ load_klass(rscratch2_dst_klass, dst); // reload
2251
2252      // Marshal the base address arguments now, freeing registers.
2253      __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2254      __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2255      __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2256      __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2257      __ movw(count, length);           // length (reloaded)
2258      Register sco_temp = c_rarg3;      // this register is free now
2259      assert_different_registers(from, to, count, sco_temp,
2260                                 rscratch2_dst_klass, scratch_src_klass);
2261      // assert_clean_int(count, sco_temp);
2262
2263      // Generate the type check.
2264      const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2265      __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2266      // assert_clean_int(sco_temp, r18);
2267      generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);
2268
2269      // Fetch destination element klass from the ObjArrayKlass header.
2270      int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2271      __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
2272      __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2273
2274      // the checkcast_copy loop needs two extra arguments:
2275      assert(c_rarg3 == sco_temp, "#3 already in place");
2276      // Set up arguments for checkcast_copy_entry.
2277      __ mov(c_rarg4, rscratch2_dst_klass);  // dst.klass.element_klass
2278      __ b(RuntimeAddress(checkcast_copy_entry));
2279    }
2280
2281  __ BIND(L_failed);
2282    __ mov(r0, -1);
2283    __ leave();   // required for proper stackwalking of RuntimeStub frame
2284    __ ret(lr);
2285
2286    return start;
2287  }
2288
2289  //
2290  // Generate stub for array fill. If "aligned" is true, the
2291  // "to" address is assumed to be heapword aligned.
2292  //
2293  // Arguments for generated stub:
2294  //   to:    c_rarg0
2295  //   value: c_rarg1
2296  //   count: c_rarg2 treated as signed
2297  //
2298  address generate_fill(BasicType t, bool aligned, const char *name) {
2299    __ align(CodeEntryAlignment);
2300    StubCodeMark mark(this, "StubRoutines", name);
2301    address start = __ pc();
2302
2303    BLOCK_COMMENT("Entry:");
2304
2305    const Register to        = c_rarg0;  // destination array address
2306    const Register value     = c_rarg1;  // value
2307    const Register count     = c_rarg2;  // elements count
2308
2309    const Register bz_base = r10;        // base for block_zero routine
2310    const Register cnt_words = r11;      // temp register
2311
2312    __ enter();
2313
2314    Label L_fill_elements, L_exit1;
2315
2316    int shift = -1;
2317    switch (t) {
2318      case T_BYTE:
2319        shift = 0;
2320        __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2321        __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2322        __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2323        __ br(Assembler::LO, L_fill_elements);
2324        break;
2325      case T_SHORT:
2326        shift = 1;
2327        __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2328        __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2329        __ br(Assembler::LO, L_fill_elements);
2330        break;
2331      case T_INT:
2332        shift = 2;
2333        __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2334        __ br(Assembler::LO, L_fill_elements);
2335        break;
2336      default: ShouldNotReachHere();
2337    }
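    // 'shift' is log2 of the element size, so (8 >> shift) above is the number
    // of elements that make up 8 bytes, the minimum handled by the word-fill
    // path below.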
2338
2339    // Align destination address to an 8-byte boundary.
2340    Label L_skip_align1, L_skip_align2, L_skip_align4;
2341    if (!aligned) {
2342      switch (t) {
2343        case T_BYTE:
2344          // One byte misalignment happens only for byte arrays.
2345          __ tbz(to, 0, L_skip_align1);
2346          __ strb(value, Address(__ post(to, 1)));
2347          __ subw(count, count, 1);
2348          __ bind(L_skip_align1);
2349          // Fallthrough
2350        case T_SHORT:
2351          // Two bytes misalignment happens only for byte and short (char) arrays.
2352          __ tbz(to, 1, L_skip_align2);
2353          __ strh(value, Address(__ post(to, 2)));
2354          __ subw(count, count, 2 >> shift);
2355          __ bind(L_skip_align2);
2356          // Fallthrough
2357        case T_INT:
2358          // Align to 8 bytes, we know we are 4 byte aligned to start.
2359          __ tbz(to, 2, L_skip_align4);
2360          __ strw(value, Address(__ post(to, 4)));
2361          __ subw(count, count, 4 >> shift);
2362          __ bind(L_skip_align4);
2363          break;
2364        default: ShouldNotReachHere();
2365      }
2366    }
2367
2368    //
2369    //  Fill large chunks
2370    //
2371    __ lsrw(cnt_words, count, 3 - shift); // number of words
2372    __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2373    __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
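    // 'count' is now the tail element count (less than one 8-byte word) left
    // over after the word-sized fill.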
2374    if (UseBlockZeroing) {
2375      Label non_block_zeroing, rest;
2376      // If the fill value is zero we can use the fast zero_words().
2377      __ cbnz(value, non_block_zeroing);
2378      __ mov(bz_base, to);
2379      __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2380      __ zero_words(bz_base, cnt_words);
2381      __ b(rest);
2382      __ bind(non_block_zeroing);
2383      __ fill_words(to, cnt_words, value);
2384      __ bind(rest);
2385    } else {
2386      __ fill_words(to, cnt_words, value);
2387    }
2388
2389    // Remaining count is less than 8 bytes. Fill it by a single store.
2390    // Note that the total length is no less than 8 bytes.
2391    if (t == T_BYTE || t == T_SHORT) {
2392      Label L_exit1;
2393      __ cbzw(count, L_exit1);
2394      __ add(to, to, count, Assembler::LSL, shift); // points to the end
2395      __ str(value, Address(to, -8));    // overwrite some elements
2396      __ bind(L_exit1);
2397      __ leave();
2398      __ ret(lr);
2399    }
2400
2401    // Handle fills of less than 8 bytes.
2402    Label L_fill_2, L_fill_4, L_exit2;
2403    __ bind(L_fill_elements);
2404    switch (t) {
2405      case T_BYTE:
2406        __ tbz(count, 0, L_fill_2);
2407        __ strb(value, Address(__ post(to, 1)));
2408        __ bind(L_fill_2);
2409        __ tbz(count, 1, L_fill_4);
2410        __ strh(value, Address(__ post(to, 2)));
2411        __ bind(L_fill_4);
2412        __ tbz(count, 2, L_exit2);
2413        __ strw(value, Address(to));
2414        break;
2415      case T_SHORT:
2416        __ tbz(count, 0, L_fill_4);
2417        __ strh(value, Address(__ post(to, 2)));
2418        __ bind(L_fill_4);
2419        __ tbz(count, 1, L_exit2);
2420        __ strw(value, Address(to));
2421        break;
2422      case T_INT:
2423        __ cbzw(count, L_exit2);
2424        __ strw(value, Address(to));
2425        break;
2426      default: ShouldNotReachHere();
2427    }
2428    __ bind(L_exit2);
2429    __ leave();
2430    __ ret(lr);
2431    return start;
2432  }
2433
2434  void generate_arraycopy_stubs() {
2435    address entry;
2436    address entry_jbyte_arraycopy;
2437    address entry_jshort_arraycopy;
2438    address entry_jint_arraycopy;
2439    address entry_oop_arraycopy;
2440    address entry_jlong_arraycopy;
2441    address entry_checkcast_arraycopy;
2442
2443    generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2444    generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2445
2446    StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2447
2448    //*** jbyte
2449    // Always need aligned and unaligned versions
2450    StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2451                                                                                  "jbyte_disjoint_arraycopy");
2452    StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2453                                                                                  &entry_jbyte_arraycopy,
2454                                                                                  "jbyte_arraycopy");
2455    StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2456                                                                                  "arrayof_jbyte_disjoint_arraycopy");
2457    StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2458                                                                                  "arrayof_jbyte_arraycopy");
2459
2460    //*** jshort
2461    // Always need aligned and unaligned versions
2462    StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2463                                                                                    "jshort_disjoint_arraycopy");
2464    StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2465                                                                                    &entry_jshort_arraycopy,
2466                                                                                    "jshort_arraycopy");
2467    StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2468                                                                                    "arrayof_jshort_disjoint_arraycopy");
2469    StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2470                                                                                    "arrayof_jshort_arraycopy");
2471
2472    //*** jint
2473    // Aligned versions
2474    StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2475                                                                                "arrayof_jint_disjoint_arraycopy");
2476    StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2477                                                                                "arrayof_jint_arraycopy");
2478    // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2479    // entry_jint_arraycopy always points to the unaligned version
2480    StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2481                                                                                "jint_disjoint_arraycopy");
2482    StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2483                                                                                &entry_jint_arraycopy,
2484                                                                                "jint_arraycopy");
2485
2486    //*** jlong
2487    // It is always aligned
2488    StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2489                                                                                  "arrayof_jlong_disjoint_arraycopy");
2490    StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2491                                                                                  "arrayof_jlong_arraycopy");
2492    StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2493    StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2494
2495    //*** oops
2496    {
2497      // With compressed oops we need unaligned versions; notice that
2498      // we overwrite entry_oop_arraycopy.
2499      bool aligned = !UseCompressedOops;
2500
2501      StubRoutines::_arrayof_oop_disjoint_arraycopy
2502        = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2503                                     /*dest_uninitialized*/false);
2504      StubRoutines::_arrayof_oop_arraycopy
2505        = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2506                                     /*dest_uninitialized*/false);
2507      // Aligned versions without pre-barriers
2508      StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2509        = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2510                                     /*dest_uninitialized*/true);
2511      StubRoutines::_arrayof_oop_arraycopy_uninit
2512        = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2513                                     /*dest_uninitialized*/true);
2514    }
2515
2516    StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2517    StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2518    StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2519    StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2520
2521    StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2522    StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2523                                                                        /*dest_uninitialized*/true);
2524
2525    StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2526                                                              entry_jbyte_arraycopy,
2527                                                              entry_jshort_arraycopy,
2528                                                              entry_jint_arraycopy,
2529                                                              entry_jlong_arraycopy);
2530
2531    StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2532                                                               entry_jbyte_arraycopy,
2533                                                               entry_jshort_arraycopy,
2534                                                               entry_jint_arraycopy,
2535                                                               entry_oop_arraycopy,
2536                                                               entry_jlong_arraycopy,
2537                                                               entry_checkcast_arraycopy);
2538
2539    StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2540    StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2541    StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2542    StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2543    StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2544    StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2545  }
2546
2547  void generate_math_stubs() { Unimplemented(); }
2548
2549  // Arguments:
2550  //
2551  // Inputs:
2552  //   c_rarg0   - source byte array address
2553  //   c_rarg1   - destination byte array address
2554  //   c_rarg2   - K (key) in little endian int array
2555  //
2556  address generate_aescrypt_encryptBlock() {
2557    __ align(CodeEntryAlignment);
2558    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2559
2560    Label L_doLast;
2561
2562    const Register from        = c_rarg0;  // source array address
2563    const Register to          = c_rarg1;  // destination array address
2564    const Register key         = c_rarg2;  // key array address
2565    const Register keylen      = rscratch1;
2566
2567    address start = __ pc();
2568    __ enter();
2569
2570    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2571
2572    __ ld1(v0, __ T16B, from); // get 16 bytes of input
2573
2574    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2575    __ rev32(v1, __ T16B, v1);
2576    __ rev32(v2, __ T16B, v2);
2577    __ rev32(v3, __ T16B, v3);
2578    __ rev32(v4, __ T16B, v4);
2579    __ aese(v0, v1);
2580    __ aesmc(v0, v0);
2581    __ aese(v0, v2);
2582    __ aesmc(v0, v0);
2583    __ aese(v0, v3);
2584    __ aesmc(v0, v0);
2585    __ aese(v0, v4);
2586    __ aesmc(v0, v0);
2587
2588    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2589    __ rev32(v1, __ T16B, v1);
2590    __ rev32(v2, __ T16B, v2);
2591    __ rev32(v3, __ T16B, v3);
2592    __ rev32(v4, __ T16B, v4);
2593    __ aese(v0, v1);
2594    __ aesmc(v0, v0);
2595    __ aese(v0, v2);
2596    __ aesmc(v0, v0);
2597    __ aese(v0, v3);
2598    __ aesmc(v0, v0);
2599    __ aese(v0, v4);
2600    __ aesmc(v0, v0);
2601
2602    __ ld1(v1, v2, __ T16B, __ post(key, 32));
2603    __ rev32(v1, __ T16B, v1);
2604    __ rev32(v2, __ T16B, v2);
2605
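    // The expanded key length in ints selects the round count: 44, 52 or 60
    // ints for AES-128, -192 or -256 (10, 12 or 14 rounds).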
2606    __ cmpw(keylen, 44);
2607    __ br(Assembler::EQ, L_doLast);
2608
2609    __ aese(v0, v1);
2610    __ aesmc(v0, v0);
2611    __ aese(v0, v2);
2612    __ aesmc(v0, v0);
2613
2614    __ ld1(v1, v2, __ T16B, __ post(key, 32));
2615    __ rev32(v1, __ T16B, v1);
2616    __ rev32(v2, __ T16B, v2);
2617
2618    __ cmpw(keylen, 52);
2619    __ br(Assembler::EQ, L_doLast);
2620
2621    __ aese(v0, v1);
2622    __ aesmc(v0, v0);
2623    __ aese(v0, v2);
2624    __ aesmc(v0, v0);
2625
2626    __ ld1(v1, v2, __ T16B, __ post(key, 32));
2627    __ rev32(v1, __ T16B, v1);
2628    __ rev32(v2, __ T16B, v2);
2629
2630    __ BIND(L_doLast);
2631
2632    __ aese(v0, v1);
2633    __ aesmc(v0, v0);
2634    __ aese(v0, v2);
2635
2636    __ ld1(v1, __ T16B, key);
2637    __ rev32(v1, __ T16B, v1);
2638    __ eor(v0, __ T16B, v0, v1);
2639
2640    __ st1(v0, __ T16B, to);
2641
2642    __ mov(r0, 0);
2643
2644    __ leave();
2645    __ ret(lr);
2646
2647    return start;
2648  }
2649
2650  // Arguments:
2651  //
2652  // Inputs:
2653  //   c_rarg0   - source byte array address
2654  //   c_rarg1   - destination byte array address
2655  //   c_rarg2   - K (key) in little endian int array
2656  //
2657  address generate_aescrypt_decryptBlock() {
2658    assert(UseAES, "need AES instruction support");
2659    __ align(CodeEntryAlignment);
2660    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2661    Label L_doLast;
2662
2663    const Register from        = c_rarg0;  // source array address
2664    const Register to          = c_rarg1;  // destination array address
2665    const Register key         = c_rarg2;  // key array address
2666    const Register keylen      = rscratch1;
2667
2668    address start = __ pc();
2669    __ enter(); // required for proper stackwalking of RuntimeStub frame
2670
2671    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2672
2673    __ ld1(v0, __ T16B, from); // get 16 bytes of input
2674
2675    __ ld1(v5, __ T16B, __ post(key, 16));
2676    __ rev32(v5, __ T16B, v5);
2677
2678    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2679    __ rev32(v1, __ T16B, v1);
2680    __ rev32(v2, __ T16B, v2);
2681    __ rev32(v3, __ T16B, v3);
2682    __ rev32(v4, __ T16B, v4);
2683    __ aesd(v0, v1);
2684    __ aesimc(v0, v0);
2685    __ aesd(v0, v2);
2686    __ aesimc(v0, v0);
2687    __ aesd(v0, v3);
2688    __ aesimc(v0, v0);
2689    __ aesd(v0, v4);
2690    __ aesimc(v0, v0);
2691
2692    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2693    __ rev32(v1, __ T16B, v1);
2694    __ rev32(v2, __ T16B, v2);
2695    __ rev32(v3, __ T16B, v3);
2696    __ rev32(v4, __ T16B, v4);
2697    __ aesd(v0, v1);
2698    __ aesimc(v0, v0);
2699    __ aesd(v0, v2);
2700    __ aesimc(v0, v0);
2701    __ aesd(v0, v3);
2702    __ aesimc(v0, v0);
2703    __ aesd(v0, v4);
2704    __ aesimc(v0, v0);
2705
2706    __ ld1(v1, v2, __ T16B, __ post(key, 32));
2707    __ rev32(v1, __ T16B, v1);
2708    __ rev32(v2, __ T16B, v2);
2709
2710    __ cmpw(keylen, 44);
2711    __ br(Assembler::EQ, L_doLast);
2712
2713    __ aesd(v0, v1);
2714    __ aesimc(v0, v0);
2715    __ aesd(v0, v2);
2716    __ aesimc(v0, v0);
2717
2718    __ ld1(v1, v2, __ T16B, __ post(key, 32));
2719    __ rev32(v1, __ T16B, v1);
2720    __ rev32(v2, __ T16B, v2);
2721
2722    __ cmpw(keylen, 52);
2723    __ br(Assembler::EQ, L_doLast);
2724
2725    __ aesd(v0, v1);
2726    __ aesimc(v0, v0);
2727    __ aesd(v0, v2);
2728    __ aesimc(v0, v0);
2729
2730    __ ld1(v1, v2, __ T16B, __ post(key, 32));
2731    __ rev32(v1, __ T16B, v1);
2732    __ rev32(v2, __ T16B, v2);
2733
2734    __ BIND(L_doLast);
2735
2736    __ aesd(v0, v1);
2737    __ aesimc(v0, v0);
2738    __ aesd(v0, v2);
2739
2740    __ eor(v0, __ T16B, v0, v5);
2741
2742    __ st1(v0, __ T16B, to);
2743
2744    __ mov(r0, 0);
2745
2746    __ leave();
2747    __ ret(lr);
2748
2749    return start;
2750  }
2751
2752  // Arguments:
2753  //
2754  // Inputs:
2755  //   c_rarg0   - source byte array address
2756  //   c_rarg1   - destination byte array address
2757  //   c_rarg2   - K (key) in little endian int array
2758  //   c_rarg3   - r vector byte array address
2759  //   c_rarg4   - input length
2760  //
2761  // Output:
2762  //   r0        - input length
2763  //
2764  address generate_cipherBlockChaining_encryptAESCrypt() {
2765    assert(UseAES, "need AES instruction support");
2766    __ align(CodeEntryAlignment);
2767    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2768
2769    Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2770
2771    const Register from        = c_rarg0;  // source array address
2772    const Register to          = c_rarg1;  // destination array address
2773    const Register key         = c_rarg2;  // key array address
2774    const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector address,
2775                                           // and left holding the last encrypted block
2776    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2777    const Register keylen      = rscratch1;
2778
2779    address start = __ pc();
2780
2781      __ enter();
2782
2783      __ movw(rscratch2, len_reg);
2784
2785      __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2786
2787      __ ld1(v0, __ T16B, rvec);
2788
2789      __ cmpw(keylen, 52);
2790      __ br(Assembler::CC, L_loadkeys_44);
2791      __ br(Assembler::EQ, L_loadkeys_52);
2792
2793      __ ld1(v17, v18, __ T16B, __ post(key, 32));
2794      __ rev32(v17, __ T16B, v17);
2795      __ rev32(v18, __ T16B, v18);
2796    __ BIND(L_loadkeys_52);
2797      __ ld1(v19, v20, __ T16B, __ post(key, 32));
2798      __ rev32(v19, __ T16B, v19);
2799      __ rev32(v20, __ T16B, v20);
2800    __ BIND(L_loadkeys_44);
2801      __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2802      __ rev32(v21, __ T16B, v21);
2803      __ rev32(v22, __ T16B, v22);
2804      __ rev32(v23, __ T16B, v23);
2805      __ rev32(v24, __ T16B, v24);
2806      __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2807      __ rev32(v25, __ T16B, v25);
2808      __ rev32(v26, __ T16B, v26);
2809      __ rev32(v27, __ T16B, v27);
2810      __ rev32(v28, __ T16B, v28);
2811      __ ld1(v29, v30, v31, __ T16B, key);
2812      __ rev32(v29, __ T16B, v29);
2813      __ rev32(v30, __ T16B, v30);
2814      __ rev32(v31, __ T16B, v31);
2815
2816    __ BIND(L_aes_loop);
2817      __ ld1(v1, __ T16B, __ post(from, 16));
2818      __ eor(v0, __ T16B, v0, v1);
2819
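      // The condition flags still hold the keylen comparison made before the
      // loop: nothing below (vector ops, subw, cbnzw) writes NZCV, so the two
      // branches re-select the round count on every iteration for free.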
2820      __ br(Assembler::CC, L_rounds_44);
2821      __ br(Assembler::EQ, L_rounds_52);
2822
2823      __ aese(v0, v17); __ aesmc(v0, v0);
2824      __ aese(v0, v18); __ aesmc(v0, v0);
2825    __ BIND(L_rounds_52);
2826      __ aese(v0, v19); __ aesmc(v0, v0);
2827      __ aese(v0, v20); __ aesmc(v0, v0);
2828    __ BIND(L_rounds_44);
2829      __ aese(v0, v21); __ aesmc(v0, v0);
2830      __ aese(v0, v22); __ aesmc(v0, v0);
2831      __ aese(v0, v23); __ aesmc(v0, v0);
2832      __ aese(v0, v24); __ aesmc(v0, v0);
2833      __ aese(v0, v25); __ aesmc(v0, v0);
2834      __ aese(v0, v26); __ aesmc(v0, v0);
2835      __ aese(v0, v27); __ aesmc(v0, v0);
2836      __ aese(v0, v28); __ aesmc(v0, v0);
2837      __ aese(v0, v29); __ aesmc(v0, v0);
2838      __ aese(v0, v30);
2839      __ eor(v0, __ T16B, v0, v31);
2840
2841      __ st1(v0, __ T16B, __ post(to, 16));
2842
2843      __ subw(len_reg, len_reg, 16);
2844      __ cbnzw(len_reg, L_aes_loop);
2845
2846      __ st1(v0, __ T16B, rvec);
2847
2848      __ mov(r0, rscratch2);
2849
2850      __ leave();
2851      __ ret(lr);
2852
2853      return start;
2854  }
2855
2856  // Arguments:
2857  //
2858  // Inputs:
2859  //   c_rarg0   - source byte array address
2860  //   c_rarg1   - destination byte array address
2861  //   c_rarg2   - K (key) in little endian int array
2862  //   c_rarg3   - r vector byte array address
2863  //   c_rarg4   - input length
2864  //
2865  // Output:
2866  //   r0        - input length
2867  //
2868  address generate_cipherBlockChaining_decryptAESCrypt() {
2869    assert(UseAES, "need AES instruction support");
2870    __ align(CodeEntryAlignment);
2871    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2872
2873    Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2874
2875    const Register from        = c_rarg0;  // source array address
2876    const Register to          = c_rarg1;  // destination array address
2877    const Register key         = c_rarg2;  // key array address
2878    const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector address,
2879                                           // and left holding the last input (ciphertext) block for chaining
2880    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2881    const Register keylen      = rscratch1;
2882
2883    address start = __ pc();
2884
2885      __ enter();
2886
2887      __ movw(rscratch2, len_reg);
2888
2889      __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2890
2891      __ ld1(v2, __ T16B, rvec);
2892
2893      __ ld1(v31, __ T16B, __ post(key, 16));
2894      __ rev32(v31, __ T16B, v31);
2895
2896      __ cmpw(keylen, 52);
2897      __ br(Assembler::CC, L_loadkeys_44);
2898      __ br(Assembler::EQ, L_loadkeys_52);
2899
2900      __ ld1(v17, v18, __ T16B, __ post(key, 32));
2901      __ rev32(v17, __ T16B, v17);
2902      __ rev32(v18, __ T16B, v18);
2903    __ BIND(L_loadkeys_52);
2904      __ ld1(v19, v20, __ T16B, __ post(key, 32));
2905      __ rev32(v19, __ T16B, v19);
2906      __ rev32(v20, __ T16B, v20);
2907    __ BIND(L_loadkeys_44);
2908      __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2909      __ rev32(v21, __ T16B, v21);
2910      __ rev32(v22, __ T16B, v22);
2911      __ rev32(v23, __ T16B, v23);
2912      __ rev32(v24, __ T16B, v24);
2913      __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2914      __ rev32(v25, __ T16B, v25);
2915      __ rev32(v26, __ T16B, v26);
2916      __ rev32(v27, __ T16B, v27);
2917      __ rev32(v28, __ T16B, v28);
2918      __ ld1(v29, v30, __ T16B, key);
2919      __ rev32(v29, __ T16B, v29);
2920      __ rev32(v30, __ T16B, v30);
2921
2922    __ BIND(L_aes_loop);
2923      __ ld1(v0, __ T16B, __ post(from, 16));
2924      __ orr(v1, __ T16B, v0, v0);   // keep a copy of the ciphertext block for chaining
2925
2926      __ br(Assembler::CC, L_rounds_44);
2927      __ br(Assembler::EQ, L_rounds_52);
2928
2929      __ aesd(v0, v17); __ aesimc(v0, v0);
2930      __ aesd(v0, v18); __ aesimc(v0, v0);
2931    __ BIND(L_rounds_52);
2932      __ aesd(v0, v19); __ aesimc(v0, v0);
2933      __ aesd(v0, v20); __ aesimc(v0, v0);
2934    __ BIND(L_rounds_44);
2935      __ aesd(v0, v21); __ aesimc(v0, v0);
2936      __ aesd(v0, v22); __ aesimc(v0, v0);
2937      __ aesd(v0, v23); __ aesimc(v0, v0);
2938      __ aesd(v0, v24); __ aesimc(v0, v0);
2939      __ aesd(v0, v25); __ aesimc(v0, v0);
2940      __ aesd(v0, v26); __ aesimc(v0, v0);
2941      __ aesd(v0, v27); __ aesimc(v0, v0);
2942      __ aesd(v0, v28); __ aesimc(v0, v0);
2943      __ aesd(v0, v29); __ aesimc(v0, v0);
2944      __ aesd(v0, v30);
2945      __ eor(v0, __ T16B, v0, v31);
2946      __ eor(v0, __ T16B, v0, v2);   // xor with the previous ciphertext block / IV
2947
2948      __ st1(v0, __ T16B, __ post(to, 16));
2949      __ orr(v2, __ T16B, v1, v1);   // this block's ciphertext becomes the next chaining value
2950
2951      __ subw(len_reg, len_reg, 16);
2952      __ cbnzw(len_reg, L_aes_loop);
2953
2954      __ st1(v2, __ T16B, rvec);
2955
2956      __ mov(r0, rscratch2);
2957
2958      __ leave();
2959      __ ret(lr);
2960
2961    return start;
2962  }
2963
2964  // Arguments:
2965  //
2966  // Inputs:
2967  //   c_rarg0   - byte[]  source+offset
2968  //   c_rarg1   - int[]   SHA.state
2969  //   c_rarg2   - int     offset
2970  //   c_rarg3   - int     limit
2971  //
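  // The stub compresses one 64-byte block (or, with multi_block, a whole run
  // of blocks) into the five-word SHA-1 state using the SHA1C/SHA1P/SHA1M/
  // SHA1H/SHA1SU0/SHA1SU1 instructions; the four constants emitted at `keys`
  // below are the standard SHA-1 round constants.  A plain reference of one
  // block in C, approximately (not the instruction-level schedule used here):
  //
  //   void sha1_block(uint32_t h[5], const uint8_t p[64]) {
  //     uint32_t w[80];
  //     for (int i = 0; i < 16; i++)
  //       w[i] = ((uint32_t)p[4*i] << 24) | (p[4*i+1] << 16) | (p[4*i+2] << 8) | p[4*i+3];
  //     for (int i = 16; i < 80; i++) {
  //       uint32_t x = w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16];
  //       w[i] = (x << 1) | (x >> 31);
  //     }
  //     uint32_t a = h[0], b = h[1], c = h[2], d = h[3], e = h[4];
  //     for (int i = 0; i < 80; i++) {
  //       uint32_t f, k;
  //       if (i < 20)      { f = (b & c) | (~b & d);          k = 0x5a827999; }
  //       else if (i < 40) { f = b ^ c ^ d;                   k = 0x6ed9eba1; }
  //       else if (i < 60) { f = (b & c) | (b & d) | (c & d); k = 0x8f1bbcdc; }
  //       else             { f = b ^ c ^ d;                   k = 0xca62c1d6; }
  //       uint32_t t = ((a << 5) | (a >> 27)) + f + e + k + w[i];
  //       e = d; d = c; c = (b << 30) | (b >> 2); b = a; a = t;
  //     }
  //     h[0] += a; h[1] += b; h[2] += c; h[3] += d; h[4] += e;
  //   }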
2972  address generate_sha1_implCompress(bool multi_block, const char *name) {
2973    __ align(CodeEntryAlignment);
2974    StubCodeMark mark(this, "StubRoutines", name);
2975    address start = __ pc();
2976
2977    Register buf   = c_rarg0;
2978    Register state = c_rarg1;
2979    Register ofs   = c_rarg2;
2980    Register limit = c_rarg3;
2981
2982    Label keys;
2983    Label sha1_loop;
2984
2985    // load the keys into v0..v3
2986    __ adr(rscratch1, keys);
2987    __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2988    // load the 5-word state into v6 and v7
2989    __ ldrq(v6, Address(state, 0));
2990    __ ldrs(v7, Address(state, 16));
2991
2992
2993    __ BIND(sha1_loop);
2994    // load 64 bytes of data into v16..v19
2995    __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2996    __ rev32(v16, __ T16B, v16);
2997    __ rev32(v17, __ T16B, v17);
2998    __ rev32(v18, __ T16B, v18);
2999    __ rev32(v19, __ T16B, v19);
3000
3001    // do the sha1
3002    __ addv(v4, __ T4S, v16, v0);
3003    __ orr(v20, __ T16B, v6, v6);
3004
3005    FloatRegister d0 = v16;
3006    FloatRegister d1 = v17;
3007    FloatRegister d2 = v18;
3008    FloatRegister d3 = v19;
3009
3010    for (int round = 0; round < 20; round++) {
3011      FloatRegister tmp1 = (round & 1) ? v4 : v5;
3012      FloatRegister tmp2 = (round & 1) ? v21 : v22;
3013      FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3014      FloatRegister tmp4 = (round & 1) ? v5 : v4;
3015      FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3016
3017      if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3018      if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3019      __ sha1h(tmp2, __ T4S, v20);
3020      if (round < 5)
3021        __ sha1c(v20, __ T4S, tmp3, tmp4);
3022      else if (round < 10 || round >= 15)
3023        __ sha1p(v20, __ T4S, tmp3, tmp4);
3024      else
3025        __ sha1m(v20, __ T4S, tmp3, tmp4);
3026      if (round < 16) __ sha1su1(d0, __ T4S, d3);
3027
3028      tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3029    }
3030
3031    __ addv(v7, __ T2S, v7, v21);
3032    __ addv(v6, __ T4S, v6, v20);
3033
3034    if (multi_block) {
3035      __ add(ofs, ofs, 64);
3036      __ cmp(ofs, limit);
3037      __ br(Assembler::LE, sha1_loop);
3038      __ mov(c_rarg0, ofs); // return ofs
3039    }
3040
3041    __ strq(v6, Address(state, 0));
3042    __ strs(v7, Address(state, 16));
3043
3044    __ ret(lr);
3045
3046    __ bind(keys);
3047    __ emit_int32(0x5a827999);
3048    __ emit_int32(0x6ed9eba1);
3049    __ emit_int32(0x8f1bbcdc);
3050    __ emit_int32(0xca62c1d6);
3051
3052    return start;
3053  }
3054
3055
3056  // Arguments:
3057  //
3058  // Inputs:
3059  //   c_rarg0   - byte[]  source+offset
3060  //   c_rarg1   - int[]   SHA.state
3061  //   c_rarg2   - int     offset
3062  //   c_rarg3   - int     limit
3063  //
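  // The stub uses the SHA256H/SHA256H2/SHA256SU0/SHA256SU1 instructions with
  // the 64 constants in round_consts below.  A plain reference of one block
  // in C, approximately (not the instruction-level schedule used here;
  // ror(x, n) is a 32-bit rotate-right helper):
  //
  //   void sha256_block(uint32_t h[8], const uint8_t p[64]) {
  //     uint32_t w[64];
  //     for (int i = 0; i < 16; i++)
  //       w[i] = ((uint32_t)p[4*i] << 24) | (p[4*i+1] << 16) | (p[4*i+2] << 8) | p[4*i+3];
  //     for (int i = 16; i < 64; i++) {
  //       uint32_t s0 = ror(w[i-15], 7) ^ ror(w[i-15], 18) ^ (w[i-15] >> 3);
  //       uint32_t s1 = ror(w[i-2], 17) ^ ror(w[i-2], 19) ^ (w[i-2] >> 10);
  //       w[i] = w[i-16] + s0 + w[i-7] + s1;
  //     }
  //     uint32_t a = h[0], b = h[1], c = h[2], d = h[3],
  //              e = h[4], f = h[5], g = h[6], hh = h[7];
  //     for (int i = 0; i < 64; i++) {
  //       uint32_t S1 = ror(e, 6) ^ ror(e, 11) ^ ror(e, 25);
  //       uint32_t ch = (e & f) ^ (~e & g);
  //       uint32_t t1 = hh + S1 + ch + round_consts[i] + w[i];
  //       uint32_t S0 = ror(a, 2) ^ ror(a, 13) ^ ror(a, 22);
  //       uint32_t maj = (a & b) ^ (a & c) ^ (b & c);
  //       uint32_t t2 = S0 + maj;
  //       hh = g; g = f; f = e; e = d + t1;
  //       d = c; c = b; b = a; a = t1 + t2;
  //     }
  //     h[0] += a; h[1] += b; h[2] += c; h[3] += d;
  //     h[4] += e; h[5] += f; h[6] += g; h[7] += hh;
  //   }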
3064  address generate_sha256_implCompress(bool multi_block, const char *name) {
3065    static const uint32_t round_consts[64] = {
3066      0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3067      0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3068      0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3069      0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3070      0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3071      0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3072      0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3073      0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3074      0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3075      0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3076      0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3077      0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3078      0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3079      0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3080      0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3081      0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3082    };
3083    __ align(CodeEntryAlignment);
3084    StubCodeMark mark(this, "StubRoutines", name);
3085    address start = __ pc();
3086
3087    Register buf   = c_rarg0;
3088    Register state = c_rarg1;
3089    Register ofs   = c_rarg2;
3090    Register limit = c_rarg3;
3091
3092    Label sha256_loop;
3093
3094    __ stpd(v8, v9, __ pre(sp, -32));
3095    __ stpd(v10, v11, Address(sp, 16));
3096
3097// dga == v0
3098// dgb == v1
3099// dg0 == v2
3100// dg1 == v3
3101// dg2 == v4
3102// t0 == v6
3103// t1 == v7
3104
3105    // load the 64 round constants into v16..v31, four per register
3106    __ lea(rscratch1, ExternalAddress((address)round_consts));
3107    __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3108    __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3109    __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3110    __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3111
3112    // load 8 words (256 bits) of state
3113    __ ldpq(v0, v1, state);
3114
3115    __ BIND(sha256_loop);
3116    // load 64 bytes of data into v8..v11
3117    __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3118    __ rev32(v8, __ T16B, v8);
3119    __ rev32(v9, __ T16B, v9);
3120    __ rev32(v10, __ T16B, v10);
3121    __ rev32(v11, __ T16B, v11);
3122
3123    __ addv(v6, __ T4S, v8, v16);
3124    __ orr(v2, __ T16B, v0, v0);
3125    __ orr(v3, __ T16B, v1, v1);
3126
3127    FloatRegister d0 = v8;
3128    FloatRegister d1 = v9;
3129    FloatRegister d2 = v10;
3130    FloatRegister d3 = v11;
3131
3132
3133    for (int round = 0; round < 16; round++) {
3134      FloatRegister tmp1 = (round & 1) ? v6 : v7;
3135      FloatRegister tmp2 = (round & 1) ? v7 : v6;
3136      FloatRegister tmp3 = (round & 1) ? v2 : v4;
3137      FloatRegister tmp4 = (round & 1) ? v4 : v2;
3138
3139      if (round < 12) __ sha256su0(d0, __ T4S, d1);
3140      __ orr(v4, __ T16B, v2, v2);
3141      if (round < 15)
3142        __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3143      __ sha256h(v2, __ T4S, v3, tmp2);
3144      __ sha256h2(v3, __ T4S, v4, tmp2);
3145      if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3146
3147      tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3148    }
3149
3150    __ addv(v0, __ T4S, v0, v2);
3151    __ addv(v1, __ T4S, v1, v3);
3152
3153    if (multi_block) {
3154      __ add(ofs, ofs, 64);
3155      __ cmp(ofs, limit);
3156      __ br(Assembler::LE, sha256_loop);
3157      __ mov(c_rarg0, ofs); // return ofs
3158    }
3159
3160    __ ldpd(v10, v11, Address(sp, 16));
3161    __ ldpd(v8, v9, __ post(sp, 32));
3162
3163    __ stpq(v0, v1, state);
3164
3165    __ ret(lr);
3166
3167    return start;
3168  }
3169
3170#ifndef BUILTIN_SIM
3171  // Safefetch stubs.
3172  void generate_safefetch(const char* name, int size, address* entry,
3173                          address* fault_pc, address* continuation_pc) {
3174    // safefetch signatures:
3175    //   int      SafeFetch32(int*      adr, int      errValue);
3176    //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3177    //
3178    // arguments:
3179    //   c_rarg0 = adr
3180    //   c_rarg1 = errValue
3181    //
3182    // result:
3183    //   r0 = *adr or errValue
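    //
    // SafeFetch lets VM code (e.g. the error reporter) read from memory that
    // may not be mapped: if the load below faults, the signal handler resumes
    // at *continuation_pc with c_rarg1 still holding errValue.  Typical use,
    // approximately:
    //
    //   int v = SafeFetch32(addr, -1);   // yields -1 if addr is unreadable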
3184
3185    StubCodeMark mark(this, "StubRoutines", name);
3186
3187    // Entry point, pc or function descriptor.
3188    *entry = __ pc();
3189
3190    // Load *adr into c_rarg1, may fault.
3191    *fault_pc = __ pc();
3192    switch (size) {
3193      case 4:
3194        // int32_t
3195        __ ldrw(c_rarg1, Address(c_rarg0, 0));
3196        break;
3197      case 8:
3198        // int64_t
3199        __ ldr(c_rarg1, Address(c_rarg0, 0));
3200        break;
3201      default:
3202        ShouldNotReachHere();
3203    }
3204
3205    // return errValue or *adr
3206    *continuation_pc = __ pc();
3207    __ mov(r0, c_rarg1);
3208    __ ret(lr);
3209  }
3210#endif
3211
3212  /**
3213   *  Arguments:
3214   *
3215   * Inputs:
3216   *   c_rarg0   - int crc
3217   *   c_rarg1   - byte* buf
3218   *   c_rarg2   - int length
3219   *
3220   * Output:
3221   *       r0   - int crc result
3222   */
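  // A bit-at-a-time reference of the CRC-32 computed here (the zlib/ITU-T
  // V.42 polynomial 0x04C11DB7, processed bit-reflected as 0xEDB88320);
  // kernel_crc32 produces the same result but works a word at a time.
  // In C, approximately:
  //
  //   uint32_t crc32(uint32_t crc, const uint8_t *buf, int len) {
  //     crc = ~crc;
  //     while (len-- > 0) {
  //       crc ^= *buf++;
  //       for (int i = 0; i < 8; i++)
  //         crc = (crc >> 1) ^ (0xEDB88320 & -(crc & 1));
  //     }
  //     return ~crc;
  //   }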
3223  address generate_updateBytesCRC32() {
3224    assert(UseCRC32Intrinsics, "what are we doing here?");
3225
3226    __ align(CodeEntryAlignment);
3227    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3228
3229    address start = __ pc();
3230
3231    const Register crc   = c_rarg0;  // crc
3232    const Register buf   = c_rarg1;  // source java byte array address
3233    const Register len   = c_rarg2;  // length
3234    const Register table0 = c_rarg3; // crc_table address
3235    const Register table1 = c_rarg4;
3236    const Register table2 = c_rarg5;
3237    const Register table3 = c_rarg6;
3238    const Register tmp3 = c_rarg7;
3239
3240    BLOCK_COMMENT("Entry:");
3241    __ enter(); // required for proper stackwalking of RuntimeStub frame
3242
3243    __ kernel_crc32(crc, buf, len,
3244              table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3245
3246    __ leave(); // required for proper stackwalking of RuntimeStub frame
3247    __ ret(lr);
3248
3249    return start;
3250  }
3251
3252  /**
3253   *  Arguments:
3254   *
3255   * Inputs:
3256   *   c_rarg0   - int crc
3257   *   c_rarg1   - byte* buf
3258   *   c_rarg2   - int length
3259   *   c_rarg3   - int* table
3260   *
3261   * Output:
3262   *       r0   - int crc result
3263   */
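  // Same contract as updateBytesCRC32 above, but computing the CRC-32C
  // (Castagnoli) variant used by java.util.zip.CRC32C: polynomial 0x1EDC6F41,
  // processed bit-reflected as 0x82F63B78.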
3264  address generate_updateBytesCRC32C() {
3265    assert(UseCRC32CIntrinsics, "what are we doing here?");
3266
3267    __ align(CodeEntryAlignment);
3268    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3269
3270    address start = __ pc();
3271
3272    const Register crc   = c_rarg0;  // crc
3273    const Register buf   = c_rarg1;  // source java byte array address
3274    const Register len   = c_rarg2;  // length
3275    const Register table0 = c_rarg3; // crc_table address
3276    const Register table1 = c_rarg4;
3277    const Register table2 = c_rarg5;
3278    const Register table3 = c_rarg6;
3279    const Register tmp3 = c_rarg7;
3280
3281    BLOCK_COMMENT("Entry:");
3282    __ enter(); // required for proper stackwalking of RuntimeStub frame
3283
3284    __ kernel_crc32c(crc, buf, len,
3285              table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3286
3287    __ leave(); // required for proper stackwalking of RuntimeStub frame
3288    __ ret(lr);
3289
3290    return start;
3291  }
3292
3293  /**
3294   *  Arguments:
3295   *
3296   *  Inputs:
3297   *   c_rarg0   - int   adler
3298   *   c_rarg1   - byte* buff
3299   *   c_rarg2   - int   len
3300   *
3301   * Output:
3302   *   c_rarg0   - int adler result
3303   */
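  // Reference definition of the Adler-32 checksum computed here, in C,
  // approximately (the stub below is an unrolled, deferred-modulo version
  // of this):
  //
  //   uint32_t adler32(uint32_t adler, const uint8_t *buf, int len) {
  //     uint32_t s1 = adler & 0xffff;
  //     uint32_t s2 = (adler >> 16) & 0xffff;
  //     while (len-- > 0) {
  //       s1 = (s1 + *buf++) % 65521;
  //       s2 = (s2 + s1)     % 65521;
  //     }
  //     return (s2 << 16) | s1;
  //   }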
3304  address generate_updateBytesAdler32() {
3305    __ align(CodeEntryAlignment);
3306    StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3307    address start = __ pc();
3308
3309    Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3310
3311    // Aliases
3312    Register adler  = c_rarg0;
3313    Register s1     = c_rarg0;
3314    Register s2     = c_rarg3;
3315    Register buff   = c_rarg1;
3316    Register len    = c_rarg2;
3317    Register nmax  = r4;
3318    Register base = r5;
3319    Register count = r6;
3320    Register temp0 = rscratch1;
3321    Register temp1 = rscratch2;
3322    Register temp2 = r7;
3323
3324    // Max number of bytes we can process before having to take the mod
3325    // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3326    unsigned long BASE = 0xfff1;
3327    unsigned long NMAX = 0x15B0;
3328
3329    __ mov(base, BASE);
3330    __ mov(nmax, NMAX);
3331
3332    // s1 is initialized to the lower 16 bits of adler
3333    // s2 is initialized to the upper 16 bits of adler
3334    __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3335    __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3336
3337    // The pipelined loop needs at least 16 elements for one iteration.
3338    // It would check the length itself, but it is more efficient to branch straight to the cleanup loop for short inputs.
3339    __ cmp(len, 16);
3340    __ br(Assembler::HS, L_nmax);
3341    __ cbz(len, L_combine);
3342
3343    __ bind(L_simple_by1_loop);
3344    __ ldrb(temp0, Address(__ post(buff, 1)));
3345    __ add(s1, s1, temp0);
3346    __ add(s2, s2, s1);
3347    __ subs(len, len, 1);
3348    __ br(Assembler::HI, L_simple_by1_loop);
3349
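    // The reductions below avoid a division: a value already known to be
    // below 2 * BASE only needs a conditional subtract, and larger sums are
    // first folded using 2^16 == 15 (mod 65521), i.e.
    //   x == (x >> 16) * 15 + (x & 0xffff)   (mod 65521)
    // where (h << 4) - h computes h * 15 without a multiply.  One or two such
    // folds bring the value below 2 * BASE for the final conditional subtract.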
3350    // s1 = s1 % BASE
3351    __ subs(temp0, s1, base);
3352    __ csel(s1, temp0, s1, Assembler::HS);
3353
3354    // s2 = s2 % BASE
3355    __ lsr(temp0, s2, 16);
3356    __ lsl(temp1, temp0, 4);
3357    __ sub(temp1, temp1, temp0);
3358    __ add(s2, temp1, s2, ext::uxth);
3359
3360    __ subs(temp0, s2, base);
3361    __ csel(s2, temp0, s2, Assembler::HS);
3362
3363    __ b(L_combine);
3364
3365    __ bind(L_nmax);
3366    __ subs(len, len, nmax);
3367    __ sub(count, nmax, 16);
3368    __ br(Assembler::LO, L_by16);
3369
3370    __ bind(L_nmax_loop);
3371
3372    __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3373
3374    __ add(s1, s1, temp0, ext::uxtb);
3375    __ ubfx(temp2, temp0, 8, 8);
3376    __ add(s2, s2, s1);
3377    __ add(s1, s1, temp2);
3378    __ ubfx(temp2, temp0, 16, 8);
3379    __ add(s2, s2, s1);
3380    __ add(s1, s1, temp2);
3381    __ ubfx(temp2, temp0, 24, 8);
3382    __ add(s2, s2, s1);
3383    __ add(s1, s1, temp2);
3384    __ ubfx(temp2, temp0, 32, 8);
3385    __ add(s2, s2, s1);
3386    __ add(s1, s1, temp2);
3387    __ ubfx(temp2, temp0, 40, 8);
3388    __ add(s2, s2, s1);
3389    __ add(s1, s1, temp2);
3390    __ ubfx(temp2, temp0, 48, 8);
3391    __ add(s2, s2, s1);
3392    __ add(s1, s1, temp2);
3393    __ add(s2, s2, s1);
3394    __ add(s1, s1, temp0, Assembler::LSR, 56);
3395    __ add(s2, s2, s1);
3396
3397    __ add(s1, s1, temp1, ext::uxtb);
3398    __ ubfx(temp2, temp1, 8, 8);
3399    __ add(s2, s2, s1);
3400    __ add(s1, s1, temp2);
3401    __ ubfx(temp2, temp1, 16, 8);
3402    __ add(s2, s2, s1);
3403    __ add(s1, s1, temp2);
3404    __ ubfx(temp2, temp1, 24, 8);
3405    __ add(s2, s2, s1);
3406    __ add(s1, s1, temp2);
3407    __ ubfx(temp2, temp1, 32, 8);
3408    __ add(s2, s2, s1);
3409    __ add(s1, s1, temp2);
3410    __ ubfx(temp2, temp1, 40, 8);
3411    __ add(s2, s2, s1);
3412    __ add(s1, s1, temp2);
3413    __ ubfx(temp2, temp1, 48, 8);
3414    __ add(s2, s2, s1);
3415    __ add(s1, s1, temp2);
3416    __ add(s2, s2, s1);
3417    __ add(s1, s1, temp1, Assembler::LSR, 56);
3418    __ add(s2, s2, s1);
3419
3420    __ subs(count, count, 16);
3421    __ br(Assembler::HS, L_nmax_loop);
3422
3423    // s1 = s1 % BASE
3424    __ lsr(temp0, s1, 16);
3425    __ lsl(temp1, temp0, 4);
3426    __ sub(temp1, temp1, temp0);
3427    __ add(temp1, temp1, s1, ext::uxth);
3428
3429    __ lsr(temp0, temp1, 16);
3430    __ lsl(s1, temp0, 4);
3431    __ sub(s1, s1, temp0);
3432    __ add(s1, s1, temp1, ext::uxth);
3433
3434    __ subs(temp0, s1, base);
3435    __ csel(s1, temp0, s1, Assembler::HS);
3436
3437    // s2 = s2 % BASE
3438    __ lsr(temp0, s2, 16);
3439    __ lsl(temp1, temp0, 4);
3440    __ sub(temp1, temp1, temp0);
3441    __ add(temp1, temp1, s2, ext::uxth);
3442
3443    __ lsr(temp0, temp1, 16);
3444    __ lsl(s2, temp0, 4);
3445    __ sub(s2, s2, temp0);
3446    __ add(s2, s2, temp1, ext::uxth);
3447
3448    __ subs(temp0, s2, base);
3449    __ csel(s2, temp0, s2, Assembler::HS);
3450
3451    __ subs(len, len, nmax);
3452    __ sub(count, nmax, 16);
3453    __ br(Assembler::HS, L_nmax_loop);
3454
3455    __ bind(L_by16);
3456    __ adds(len, len, count);
3457    __ br(Assembler::LO, L_by1);
3458
3459    __ bind(L_by16_loop);
3460
3461    __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3462
3463    __ add(s1, s1, temp0, ext::uxtb);
3464    __ ubfx(temp2, temp0, 8, 8);
3465    __ add(s2, s2, s1);
3466    __ add(s1, s1, temp2);
3467    __ ubfx(temp2, temp0, 16, 8);
3468    __ add(s2, s2, s1);
3469    __ add(s1, s1, temp2);
3470    __ ubfx(temp2, temp0, 24, 8);
3471    __ add(s2, s2, s1);
3472    __ add(s1, s1, temp2);
3473    __ ubfx(temp2, temp0, 32, 8);
3474    __ add(s2, s2, s1);
3475    __ add(s1, s1, temp2);
3476    __ ubfx(temp2, temp0, 40, 8);
3477    __ add(s2, s2, s1);
3478    __ add(s1, s1, temp2);
3479    __ ubfx(temp2, temp0, 48, 8);
3480    __ add(s2, s2, s1);
3481    __ add(s1, s1, temp2);
3482    __ add(s2, s2, s1);
3483    __ add(s1, s1, temp0, Assembler::LSR, 56);
3484    __ add(s2, s2, s1);
3485
3486    __ add(s1, s1, temp1, ext::uxtb);
3487    __ ubfx(temp2, temp1, 8, 8);
3488    __ add(s2, s2, s1);
3489    __ add(s1, s1, temp2);
3490    __ ubfx(temp2, temp1, 16, 8);
3491    __ add(s2, s2, s1);
3492    __ add(s1, s1, temp2);
3493    __ ubfx(temp2, temp1, 24, 8);
3494    __ add(s2, s2, s1);
3495    __ add(s1, s1, temp2);
3496    __ ubfx(temp2, temp1, 32, 8);
3497    __ add(s2, s2, s1);
3498    __ add(s1, s1, temp2);
3499    __ ubfx(temp2, temp1, 40, 8);
3500    __ add(s2, s2, s1);
3501    __ add(s1, s1, temp2);
3502    __ ubfx(temp2, temp1, 48, 8);
3503    __ add(s2, s2, s1);
3504    __ add(s1, s1, temp2);
3505    __ add(s2, s2, s1);
3506    __ add(s1, s1, temp1, Assembler::LSR, 56);
3507    __ add(s2, s2, s1);
3508
3509    __ subs(len, len, 16);
3510    __ br(Assembler::HS, L_by16_loop);
3511
3512    __ bind(L_by1);
3513    __ adds(len, len, 15);
3514    __ br(Assembler::LO, L_do_mod);
3515
3516    __ bind(L_by1_loop);
3517    __ ldrb(temp0, Address(__ post(buff, 1)));
3518    __ add(s1, temp0, s1);
3519    __ add(s2, s2, s1);
3520    __ subs(len, len, 1);
3521    __ br(Assembler::HS, L_by1_loop);
3522
3523    __ bind(L_do_mod);
3524    // s1 = s1 % BASE
3525    __ lsr(temp0, s1, 16);
3526    __ lsl(temp1, temp0, 4);
3527    __ sub(temp1, temp1, temp0);
3528    __ add(temp1, temp1, s1, ext::uxth);
3529
3530    __ lsr(temp0, temp1, 16);
3531    __ lsl(s1, temp0, 4);
3532    __ sub(s1, s1, temp0);
3533    __ add(s1, s1, temp1, ext::uxth);
3534
3535    __ subs(temp0, s1, base);
3536    __ csel(s1, temp0, s1, Assembler::HS);
3537
3538    // s2 = s2 % BASE
3539    __ lsr(temp0, s2, 16);
3540    __ lsl(temp1, temp0, 4);
3541    __ sub(temp1, temp1, temp0);
3542    __ add(temp1, temp1, s2, ext::uxth);
3543
3544    __ lsr(temp0, temp1, 16);
3545    __ lsl(s2, temp0, 4);
3546    __ sub(s2, s2, temp0);
3547    __ add(s2, s2, temp1, ext::uxth);
3548
3549    __ subs(temp0, s2, base);
3550    __ csel(s2, temp0, s2, Assembler::HS);
3551
3552    // Combine lower bits and higher bits
3553    __ bind(L_combine);
3554    __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3555
3556    __ ret(lr);
3557
3558    return start;
3559  }
3560
3561  /**
3562   *  Arguments:
3563   *
3564   *  Input:
3565   *    c_rarg0   - x address
3566   *    c_rarg1   - x length
3567   *    c_rarg2   - y address
3568   *    c_rarg3   - y length
3569   *    c_rarg4   - z address
3570   *    c_rarg5   - z length
3571   */
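  // multiply_to_len forms the full product of the two int-array magnitudes
  // (most significant int first), writing xlen + ylen ints into z.  In C,
  // approximately (a schoolbook reference, not the word-at-a-time kernel
  // actually generated):
  //
  //   void multiply_to_len(const uint32_t *x, int xlen,
  //                        const uint32_t *y, int ylen, uint32_t *z) {
  //     memset(z, 0, (xlen + ylen) * sizeof(uint32_t));
  //     for (int i = xlen - 1; i >= 0; i--) {
  //       uint64_t carry = 0;
  //       for (int j = ylen - 1, k = i + j + 1; j >= 0; j--, k--) {
  //         uint64_t p = (uint64_t)x[i] * y[j] + z[k] + carry;
  //         z[k] = (uint32_t)p;
  //         carry = p >> 32;
  //       }
  //       z[i] = (uint32_t)carry;
  //     }
  //   }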
3572  address generate_multiplyToLen() {
3573    __ align(CodeEntryAlignment);
3574    StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3575
3576    address start = __ pc();
3577    const Register x     = r0;
3578    const Register xlen  = r1;
3579    const Register y     = r2;
3580    const Register ylen  = r3;
3581    const Register z     = r4;
3582    const Register zlen  = r5;
3583
3584    const Register tmp1  = r10;
3585    const Register tmp2  = r11;
3586    const Register tmp3  = r12;
3587    const Register tmp4  = r13;
3588    const Register tmp5  = r14;
3589    const Register tmp6  = r15;
3590    const Register tmp7  = r16;
3591
3592    BLOCK_COMMENT("Entry:");
3593    __ enter(); // required for proper stackwalking of RuntimeStub frame
3594    __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3595    __ leave(); // required for proper stackwalking of RuntimeStub frame
3596    __ ret(lr);
3597
3598    return start;
3599  }
3600
3601  void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3602                      FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3603                      FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3604    // Karatsuba multiplication performs a 128*128 -> 256-bit
3605    // multiplication in three 128-bit multiplications and a few
3606    // additions.
3607    //
3608    // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3609    // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3610    //
3611    // Inputs:
3612    //
3613    // A0 in a.d[0]     (subkey)
3614    // A1 in a.d[1]
3615    // (A1+A0) in a1_xor_a0.d[0]
3616    //
3617    // B0 in b.d[0]     (state)
3618    // B1 in b.d[1]
3619
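    // Because the multiplication is carry-less, the additions in the formula
    // above are XORs.  In C-like pseudocode (clmul is a placeholder for a
    // 64x64 -> 128 bit carry-less multiply):
    //
    //   c = clmul(A1, B1);                       // high partial product
    //   d = clmul(A0, B0);                       // low partial product
    //   e = clmul(A1 ^ A0, B1 ^ B0);
    //   mid = e ^ c ^ d;
    //   result = (c << 128) ^ (mid << 64) ^ d;   // 256 bits in result_hi:result_lo
    //
    // The ext/eor sequence below forms the two middle words (folding in C0 and
    // D1) and inserts them into the top half of result_lo and the bottom half
    // of result_hi.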
3620    __ ext(tmp1, __ T16B, b, b, 0x08);
3621    __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3622    __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3623    __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3624    __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3625
3626    __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3627    __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3628    __ eor(tmp2, __ T16B, tmp2, tmp4);
3629    __ eor(tmp2, __ T16B, tmp2, tmp3);
3630
3631    // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3632    __ ins(result_hi, __ D, tmp2, 0, 1);
3633    __ ins(result_lo, __ D, tmp2, 1, 0);
3634  }
3635
3636  void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3637                    FloatRegister p, FloatRegister z, FloatRegister t1) {
3638    const FloatRegister t0 = result;
3639
3640    // The GCM field polynomial f is z^128 + p(z), where p =
3641    // z^7+z^2+z+1.
3642    //
3643    //    z^128 === -p(z)  (mod (z^128 + p(z)))
3644    //
3645    // so, given that the product we're reducing is
3646    //    a == lo + hi * z^128
3647    // substituting,
3648    //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3649    //
3650    // we reduce by multiplying hi by p(z) and subtracting the result
3651    // from (i.e. XORing it with) lo.  Because p has no nonzero high
3652    // bits we can do this with two 64-bit multiplications, lo*p and
3653    // hi*p.
3654
3655    __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3656    __ ext(t1, __ T16B, t0, z, 8);
3657    __ eor(hi, __ T16B, hi, t1);
3658    __ ext(t1, __ T16B, z, t0, 8);
3659    __ eor(lo, __ T16B, lo, t1);
3660    __ pmull(t0, __ T1Q, hi, p, __ T1D);
3661    __ eor(result, __ T16B, lo, t0);
3662  }
3663
3664  /**
3665   *  Arguments:
3666   *
3667   *  Input:
3668   *  c_rarg0   - current state address
3669   *  c_rarg1   - H key address
3670   *  c_rarg2   - data address
3671   *  c_rarg3   - number of blocks
3672   *
3673   *  Output:
3674   *  Updated state at c_rarg0
3675   */
3676  address generate_ghash_processBlocks() {
3677    // Bafflingly, GCM uses little-endian for the byte order, but
3678    // big-endian for the bit order.  For example, the polynomial 1 is
3679    // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
3680    //
3681    // So, we must either reverse the bytes in each word and do
3682    // everything big-endian or reverse the bits in each byte and do
3683    // it little-endian.  On AArch64 it's more idiomatic to reverse
3684    // the bits in each byte (we have an instruction, RBIT, to do
3685    // that) and keep the data in little-endian bit order throughout the
3686    // calculation, bit-reversing the inputs and outputs.
3687
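    // In C, approximately: for each 16-byte block C[i],
    //   state = gf128_mul(state ^ C[i], H);
    // where gf128_mul is a carry-less multiplication in GF(2^128) reduced by
    // the field polynomial (ghash_multiply followed by ghash_reduce above).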
3688    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
3689    __ align(wordSize * 2);
3690    address p = __ pc();
3691    __ emit_int64(0x87);  // The low-order bits of the field
3692                          // polynomial (i.e. p = z^7+z^2+z+1)
3693                          // repeated in the low and high parts of a
3694                          // 128-bit vector
3695    __ emit_int64(0x87);
3696
3697    __ align(CodeEntryAlignment);
3698    address start = __ pc();
3699
3700    Register state   = c_rarg0;
3701    Register subkeyH = c_rarg1;
3702    Register data    = c_rarg2;
3703    Register blocks  = c_rarg3;
3704
3705    FloatRegister vzr = v30;
3706    __ eor(vzr, __ T16B, vzr, vzr); // zero register
3707
3708    __ ldrq(v0, Address(state));
3709    __ ldrq(v1, Address(subkeyH));
3710
3711    __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
3712    __ rbit(v0, __ T16B, v0);
3713    __ rev64(v1, __ T16B, v1);
3714    __ rbit(v1, __ T16B, v1);
3715
3716    __ ldrq(v26, p);
3717
3718    __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v16
3719    __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
3720
3721    {
3722      Label L_ghash_loop;
3723      __ bind(L_ghash_loop);
3724
3725      __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
3726                                                 // reversing each byte
3727      __ rbit(v2, __ T16B, v2);
3728      __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
3729
3730      // Multiply state in v2 by subkey in v1
3731      ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
3732                     /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
3733                     /*temps*/v6, v20, v18, v21);
3734      // Reduce v7:v5 by the field polynomial
3735      ghash_reduce(v0, v5, v7, v26, vzr, v20);
3736
3737      __ sub(blocks, blocks, 1);
3738      __ cbnz(blocks, L_ghash_loop);
3739    }
3740
3741    // The bit-reversed result is at this point in v0
3742    __ rev64(v1, __ T16B, v0);
3743    __ rbit(v1, __ T16B, v1);
3744
3745    __ st1(v1, __ T16B, state);
3746    __ ret(lr);
3747
3748    return start;
3749  }
3750
3751  // Continuation point for throwing of implicit exceptions that are
3752  // not handled in the current activation. Fabricates an exception
3753  // oop and initiates normal exception dispatching in this
3754  // frame. Since we need to preserve callee-saved values (currently
3755  // only for C2, but done for C1 as well) we need a callee-saved oop
3756  // map and therefore have to make these stubs into RuntimeStubs
3757  // rather than BufferBlobs.  If the compiler needs all registers to
3758  // be preserved between the fault point and the exception handler
3759  // then it must assume responsibility for that in
3760  // AbstractCompiler::continuation_for_implicit_null_exception or
3761  // continuation_for_implicit_division_by_zero_exception. All other
3762  // implicit exceptions (e.g., NullPointerException or
3763  // AbstractMethodError on entry) are either at call sites or
3764  // otherwise assume that stack unwinding will be initiated, so
3765  // caller saved registers were assumed volatile in the compiler.
3766
3767#undef __
3768#define __ masm->
3769
3770  address generate_throw_exception(const char* name,
3771                                   address runtime_entry,
3772                                   Register arg1 = noreg,
3773                                   Register arg2 = noreg) {
3774    // Information about frame layout at time of blocking runtime call.
3775    // Note that we only have to preserve callee-saved registers since
3776    // the compilers are responsible for supplying a continuation point
3777    // if they expect all registers to be preserved.
3778    // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
3779    enum layout {
3780      rfp_off = 0,
3781      rfp_off2,
3782      return_off,
3783      return_off2,
3784      framesize // inclusive of return address
3785    };
3786
3787    int insts_size = 512;
3788    int locs_size  = 64;
3789
3790    CodeBuffer code(name, insts_size, locs_size);
3791    OopMapSet* oop_maps  = new OopMapSet();
3792    MacroAssembler* masm = new MacroAssembler(&code);
3793
3794    address start = __ pc();
3795
3796    // This is an inlined and slightly modified version of call_VM
3797    // which has the ability to fetch the return PC out of
3798    // thread-local storage and also sets up last_Java_sp slightly
3799    // differently than the real call_VM
3800
3801    __ enter(); // Save FP and LR before call
3802
3803    assert(is_even(framesize/2), "sp not 16-byte aligned");
3804
3805    // lr and fp are already in place
3806    __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
3807
3808    int frame_complete = __ pc() - start;
3809
3810    // Set up last_Java_sp and last_Java_fp
3811    address the_pc = __ pc();
3812    __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
3813
3814    // Call runtime
3815    if (arg1 != noreg) {
3816      assert(arg2 != c_rarg1, "clobbered");
3817      __ mov(c_rarg1, arg1);
3818    }
3819    if (arg2 != noreg) {
3820      __ mov(c_rarg2, arg2);
3821    }
3822    __ mov(c_rarg0, rthread);
3823    BLOCK_COMMENT("call runtime_entry");
3824    __ mov(rscratch1, runtime_entry);
3825    __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
3826
3827    // Generate oop map
3828    OopMap* map = new OopMap(framesize, 0);
3829
3830    oop_maps->add_gc_map(the_pc - start, map);
3831
3832    __ reset_last_Java_frame(true);
3833    __ maybe_isb();
3834
3835    __ leave();
3836
3837    // check for pending exceptions
3838#ifdef ASSERT
3839    Label L;
3840    __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
3841    __ cbnz(rscratch1, L);
3842    __ should_not_reach_here();
3843    __ bind(L);
3844#endif // ASSERT
3845    __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3846
3847
3848    // codeBlob framesize is in words (not VMRegImpl::slot_size)
3849    RuntimeStub* stub =
3850      RuntimeStub::new_runtime_stub(name,
3851                                    &code,
3852                                    frame_complete,
3853                                    (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3854                                    oop_maps, false);
3855    return stub->entry_point();
3856  }
3857
3858  class MontgomeryMultiplyGenerator : public MacroAssembler {
3859
3860    Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
3861      Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
3862
3863    RegSet _toSave;
3864    bool _squaring;
3865
3866  public:
3867    MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
3868      : MacroAssembler(as->code()), _squaring(squaring) {
3869
3870      // Register allocation
3871
3872      Register reg = c_rarg0;
3873      Pa_base = reg;       // Argument registers
3874      if (squaring)
3875        Pb_base = Pa_base;
3876      else
3877        Pb_base = ++reg;
3878      Pn_base = ++reg;
3879      Rlen = ++reg;
3880      inv = ++reg;
3881      Pm_base = ++reg;
3882
3883                          // Working registers:
3884      Ra =  ++reg;        // The current digit of a, b, n, and m.
3885      Rb =  ++reg;
3886      Rm =  ++reg;
3887      Rn =  ++reg;
3888
3889      Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
3890      Pb =  ++reg;
3891      Pm =  ++reg;
3892      Pn =  ++reg;
3893
3894      t0 =  ++reg;        // Three registers which form a
3895      t1 =  ++reg;        // triple-precision accumulator.
3896      t2 =  ++reg;
3897
3898      Ri =  ++reg;        // Inner and outer loop indexes.
3899      Rj =  ++reg;
3900
3901      Rhi_ab = ++reg;     // Product registers: low and high parts
3902      Rlo_ab = ++reg;     // of a*b and m*n.
3903      Rhi_mn = ++reg;
3904      Rlo_mn = ++reg;
3905
3906      // r19 and up are callee-saved.
3907      _toSave = RegSet::range(r19, reg) + Pm_base;
3908    }
3909
3910  private:
3911    void save_regs() {
3912      push(_toSave, sp);
3913    }
3914
3915    void restore_regs() {
3916      pop(_toSave, sp);
3917    }
3918
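    // Emit the body of `block` count times, two copies per loop iteration;
    // an odd count enters at the second copy.  In C, approximately:
    //
    //   if (count & 1) goto odd;
    //   if (count == 0) goto end;
    //   loop: block();
    //   odd:  block();
    //         count -= 2;
    //         if (count > 0) goto loop;
    //   end:  ;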
3919    template <typename T>
3920    void unroll_2(Register count, T block) {
3921      Label loop, end, odd;
3922      tbnz(count, 0, odd);
3923      cbz(count, end);
3924      align(16);
3925      bind(loop);
3926      (this->*block)();
3927      bind(odd);
3928      (this->*block)();
3929      subs(count, count, 2);
3930      br(Assembler::GT, loop);
3931      bind(end);
3932    }
3933
3934    template <typename T>
3935    void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
3936      Label loop, end, odd;
3937      tbnz(count, 0, odd);
3938      cbz(count, end);
3939      align(16);
3940      bind(loop);
3941      (this->*block)(d, s, tmp);
3942      bind(odd);
3943      (this->*block)(d, s, tmp);
3944      subs(count, count, 2);
3945      br(Assembler::GT, loop);
3946      bind(end);
3947    }
3948
3949    void pre1(RegisterOrConstant i) {
3950      block_comment("pre1");
3951      // Pa = Pa_base;
3952      // Pb = Pb_base + i;
3953      // Pm = Pm_base;
3954      // Pn = Pn_base + i;
3955      // Ra = *Pa;
3956      // Rb = *Pb;
3957      // Rm = *Pm;
3958      // Rn = *Pn;
3959      ldr(Ra, Address(Pa_base));
3960      ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
3961      ldr(Rm, Address(Pm_base));
3962      ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3963      lea(Pa, Address(Pa_base));
3964      lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
3965      lea(Pm, Address(Pm_base));
3966      lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3967
3968      // Zero the m*n result.
3969      mov(Rhi_mn, zr);
3970      mov(Rlo_mn, zr);
3971    }
3972
3973    // The core multiply-accumulate step of a Montgomery
3974    // multiplication.  The idea is to schedule operations as a
3975    // pipeline so that instructions with long latencies (loads and
3976    // multiplies) have time to complete before their results are
3977    // used.  This benefits in-order implementations of the
3978    // architecture the most, but out-of-order ones also benefit.
3979    void step() {
3980      block_comment("step");
3981      // MACC(Ra, Rb, t0, t1, t2);
3982      // Ra = *++Pa;
3983      // Rb = *--Pb;
3984      umulh(Rhi_ab, Ra, Rb);
3985      mul(Rlo_ab, Ra, Rb);
3986      ldr(Ra, pre(Pa, wordSize));
3987      ldr(Rb, pre(Pb, -wordSize));
3988      acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
3989                                       // previous iteration.
3990      // MACC(Rm, Rn, t0, t1, t2);
3991      // Rm = *++Pm;
3992      // Rn = *--Pn;
3993      umulh(Rhi_mn, Rm, Rn);
3994      mul(Rlo_mn, Rm, Rn);
3995      ldr(Rm, pre(Pm, wordSize));
3996      ldr(Rn, pre(Pn, -wordSize));
3997      acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3998    }
3999
4000    void post1() {
4001      block_comment("post1");
4002
4003      // MACC(Ra, Rb, t0, t1, t2);
4004      // Ra = *++Pa;
4005      // Rb = *--Pb;
4006      umulh(Rhi_ab, Ra, Rb);
4007      mul(Rlo_ab, Ra, Rb);
4008      acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4009      acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4010
4011      // *Pm = Rm = t0 * inv;
4012      mul(Rm, t0, inv);
4013      str(Rm, Address(Pm));
4014
4015      // MACC(Rm, Rn, t0, t1, t2);
4016      // t0 = t1; t1 = t2; t2 = 0;
4017      umulh(Rhi_mn, Rm, Rn);
4018
4019#ifndef PRODUCT
4020      // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
4021      {
4022        mul(Rlo_mn, Rm, Rn);
4023        add(Rlo_mn, t0, Rlo_mn);
4024        Label ok;
4025        cbz(Rlo_mn, ok); {
4026          stop("broken Montgomery multiply");
4027        } bind(ok);
4028      }
4029#endif
4030      // We have very carefully set things up so that
4031      // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
4032      // the lower half of Rm * Rn because we know the result already:
4033      // it must be -t0.  t0 + (-t0) must generate a carry iff
4034      // t0 != 0.  So, rather than do a mul and an adds we just set
4035      // the carry flag iff t0 is nonzero.
4036      //
4037      // mul(Rlo_mn, Rm, Rn);
4038      // adds(zr, t0, Rlo_mn);
4039      subs(zr, t0, 1); // Set carry iff t0 is nonzero
4040      adcs(t0, t1, Rhi_mn);
4041      adc(t1, t2, zr);
4042      mov(t2, zr);
4043    }
4044
4045    void pre2(RegisterOrConstant i, RegisterOrConstant len) {
4046      block_comment("pre2");
4047      // Pa = Pa_base + i-len;
4048      // Pb = Pb_base + len;
4049      // Pm = Pm_base + i-len;
4050      // Pn = Pn_base + len;
4051
4052      if (i.is_register()) {
4053        sub(Rj, i.as_register(), len);
4054      } else {
4055        mov(Rj, i.as_constant());
4056        sub(Rj, Rj, len);
4057      }
4058      // Rj == i-len
4059
4060      lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
4061      lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
4062      lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
4063      lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
4064
4065      // Ra = *++Pa;
4066      // Rb = *--Pb;
4067      // Rm = *++Pm;
4068      // Rn = *--Pn;
4069      ldr(Ra, pre(Pa, wordSize));
4070      ldr(Rb, pre(Pb, -wordSize));
4071      ldr(Rm, pre(Pm, wordSize));
4072      ldr(Rn, pre(Pn, -wordSize));
4073
4074      mov(Rhi_mn, zr);
4075      mov(Rlo_mn, zr);
4076    }
4077
4078    void post2(RegisterOrConstant i, RegisterOrConstant len) {
4079      block_comment("post2");
4080      if (i.is_constant()) {
4081        mov(Rj, i.as_constant()-len.as_constant());
4082      } else {
4083        sub(Rj, i.as_register(), len);
4084      }
4085
4086      adds(t0, t0, Rlo_mn); // The pending m*n, low part
4087
4088      // As soon as we know the least significant digit of our result,
4089      // store it.
4090      // Pm_base[i-len] = t0;
4091      str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
4092
4093      // t0 = t1; t1 = t2; t2 = 0;
4094      adcs(t0, t1, Rhi_mn); // The pending m*n, high part
4095      adc(t1, t2, zr);
4096      mov(t2, zr);
4097    }
4098
4099    // A carry in t0 after Montgomery multiplication means that we
4100    // should subtract multiples of n from our result in m.  We'll
4101    // keep doing that until there is no carry.
4102    void normalize(RegisterOrConstant len) {
4103      block_comment("normalize");
4104      // while (t0)
4105      //   t0 = sub(Pm_base, Pn_base, t0, len);
4106      Label loop, post, again;
4107      Register cnt = t1, i = t2; // Re-use registers; we're done with them now
4108      cbz(t0, post); {
4109        bind(again); {
4110          mov(i, zr);
4111          mov(cnt, len);
4112          ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4113          ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4114          subs(zr, zr, zr); // set carry flag, i.e. no borrow
4115          align(16);
4116          bind(loop); {
4117            sbcs(Rm, Rm, Rn);
4118            str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4119            add(i, i, 1);
4120            ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4121            ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4122            sub(cnt, cnt, 1);
4123          } cbnz(cnt, loop);
4124          sbc(t0, t0, zr);
4125        } cbnz(t0, again);
4126      } bind(post);
4127    }
4128
4129    // Move memory at s to d, reversing words.
4130    //    Increments d to end of copied memory
4131    //    Destroys tmp1, tmp2
4132    //    Preserves len
4133    //    Leaves s pointing to the address which was in d at start
4134    void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
4135      assert(tmp1 < r19 && tmp2 < r19, "register corruption");
4136
4137      lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
4138      mov(tmp1, len);
4139      unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
4140      sub(s, d, len, ext::uxtw, LogBytesPerWord);
4141    }
4142    // where
4143    void reverse1(Register d, Register s, Register tmp) {
4144      ldr(tmp, pre(s, -wordSize));
4145      ror(tmp, tmp, 32);
4146      str(tmp, post(d, wordSize));
4147    }
4148
4149    void step_squaring() {
4150      // An extra ACC
4151      step();
4152      acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4153    }
4154
4155    void last_squaring(RegisterOrConstant i) {
4156      Label dont;
4157      // if ((i & 1) == 0) {
4158      tbnz(i.as_register(), 0, dont); {
4159        // MACC(Ra, Rb, t0, t1, t2);
4160        // Ra = *++Pa;
4161        // Rb = *--Pb;
4162        umulh(Rhi_ab, Ra, Rb);
4163        mul(Rlo_ab, Ra, Rb);
4164        acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4165      } bind(dont);
4166    }
4167
4168    void extra_step_squaring() {
4169      acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4170
4171      // MACC(Rm, Rn, t0, t1, t2);
4172      // Rm = *++Pm;
4173      // Rn = *--Pn;
4174      umulh(Rhi_mn, Rm, Rn);
4175      mul(Rlo_mn, Rm, Rn);
4176      ldr(Rm, pre(Pm, wordSize));
4177      ldr(Rn, pre(Pn, -wordSize));
4178    }
4179
4180    void post1_squaring() {
4181      acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4182
4183      // *Pm = Rm = t0 * inv;
4184      mul(Rm, t0, inv);
4185      str(Rm, Address(Pm));
4186
4187      // MACC(Rm, Rn, t0, t1, t2);
4188      // t0 = t1; t1 = t2; t2 = 0;
4189      umulh(Rhi_mn, Rm, Rn);
4190
4191#ifndef PRODUCT
4192      // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
4193      {
4194        mul(Rlo_mn, Rm, Rn);
4195        add(Rlo_mn, t0, Rlo_mn);
4196        Label ok;
4197        cbz(Rlo_mn, ok); {
4198          stop("broken Montgomery multiply");
4199        } bind(ok);
4200      }
4201#endif
4202      // We have very carefully set things up so that
4203      // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
4204      // the lower half of Rm * Rn because we know the result already:
4205      // it must be -t0.  t0 + (-t0) must generate a carry iff
4206      // t0 != 0.  So, rather than do a mul and an adds we just set
4207      // the carry flag iff t0 is nonzero.
4208      //
4209      // mul(Rlo_mn, Rm, Rn);
4210      // adds(zr, t0, Rlo_mn);
4211      subs(zr, t0, 1); // Set carry iff t0 is nonzero
4212      adcs(t0, t1, Rhi_mn);
4213      adc(t1, t2, zr);
4214      mov(t2, zr);
4215    }
4216
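    // (t2:t1:t0) += (Rhi:Rlo): add a 128-bit product into the triple-precision
    // (192-bit) accumulator, propagating the carries.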
4217    void acc(Register Rhi, Register Rlo,
4218             Register t0, Register t1, Register t2) {
4219      adds(t0, t0, Rlo);
4220      adcs(t1, t1, Rhi);
4221      adc(t2, t2, zr);
4222    }
4223
4224  public:
4225    /**
4226     * Fast Montgomery multiplication.  The derivation of the
4227     * algorithm is in A Cryptographic Library for the Motorola
4228     * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
4229     *
4230     * Arguments:
4231     *
4232     * Inputs for multiplication:
4233     *   c_rarg0   - int array elements a
4234     *   c_rarg1   - int array elements b
4235     *   c_rarg2   - int array elements n (the modulus)
4236     *   c_rarg3   - int length
4237     *   c_rarg4   - int inv
4238     *   c_rarg5   - int array elements m (the result)
4239     *
4240     * Inputs for squaring:
4241     *   c_rarg0   - int array elements a
4242     *   c_rarg1   - int array elements n (the modulus)
4243     *   c_rarg2   - int length
4244     *   c_rarg3   - int inv
4245     *   c_rarg4   - int array elements m (the result)
4246     *
4247     */
4248    address generate_multiply() {
4249      Label argh, nothing;
4250      bind(argh);
4251      stop("MontgomeryMultiply total_allocation must be <= 8192");
4252
4253      align(CodeEntryAlignment);
4254      address entry = pc();
4255
4256      cbzw(Rlen, nothing);
4257
4258      enter();
4259
4260      // Make room.
4261      cmpw(Rlen, 512);
4262      br(Assembler::HI, argh);
4263      sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
4264      andr(sp, Ra, -2 * wordSize);
4265
4266      lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
4267
4268      {
4269        // Copy input args, reversing as we go.  We use Ra as a
4270        // temporary variable.
4271        reverse(Ra, Pa_base, Rlen, t0, t1);
4272        if (!_squaring)
4273          reverse(Ra, Pb_base, Rlen, t0, t1);
4274        reverse(Ra, Pn_base, Rlen, t0, t1);
4275      }
4276
4277      // Push all call-saved registers and also Pm_base which we'll need
4278      // at the end.
4279      save_regs();
4280
4281#ifndef PRODUCT
4282      // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
4283      {
4284        ldr(Rn, Address(Pn_base, 0));
4285        mul(Rlo_mn, Rn, inv);
4286        cmp(Rlo_mn, -1);
4287        Label ok;
4288        br(EQ, ok); {
4289          stop("broken inverse in Montgomery multiply");
4290        } bind(ok);
4291      }
4292#endif
4293
4294      mov(Pm_base, Ra);
4295
4296      mov(t0, zr);
4297      mov(t1, zr);
4298      mov(t2, zr);
4299
4300      block_comment("for (int i = 0; i < len; i++) {");
4301      mov(Ri, zr); {
4302        Label loop, end;
4303        cmpw(Ri, Rlen);
4304        br(Assembler::GE, end);
4305
4306        bind(loop);
4307        pre1(Ri);
4308
4309        block_comment("  for (j = i; j; j--) {"); {
4310          movw(Rj, Ri);
4311          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
4312        } block_comment("  } // j");
4313
4314        post1();
4315        addw(Ri, Ri, 1);
4316        cmpw(Ri, Rlen);
4317        br(Assembler::LT, loop);
4318        bind(end);
4319        block_comment("} // i");
4320      }
4321
4322      block_comment("for (int i = len; i < 2*len; i++) {");
4323      mov(Ri, Rlen); {
4324        Label loop, end;
4325        cmpw(Ri, Rlen, Assembler::LSL, 1);
4326        br(Assembler::GE, end);
4327
4328        bind(loop);
4329        pre2(Ri, Rlen);
4330
4331        block_comment("  for (j = len*2-i-1; j; j--) {"); {
4332          lslw(Rj, Rlen, 1);
4333          subw(Rj, Rj, Ri);
4334          subw(Rj, Rj, 1);
4335          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
4336        } block_comment("  } // j");
4337
4338        post2(Ri, Rlen);
4339        addw(Ri, Ri, 1);
4340        cmpw(Ri, Rlen, Assembler::LSL, 1);
4341        br(Assembler::LT, loop);
4342        bind(end);
4343      }
4344      block_comment("} // i");
4345
4346      normalize(Rlen);
4347
4348      mov(Ra, Pm_base);  // Save Pm_base in Ra
4349      restore_regs();  // Restore caller's Pm_base
4350
4351      // Copy our result into caller's Pm_base
4352      reverse(Pm_base, Ra, Rlen, t0, t1);
4353
4354      leave();
4355      bind(nothing);
4356      ret(lr);
4357
4358      return entry;
4359    }
4360    // In C, approximately:
4361
4362    // void
4363    // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
4364    //                     unsigned long Pn_base[], unsigned long Pm_base[],
4365    //                     unsigned long inv, int len) {
4366    //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4367    //   unsigned long *Pa, *Pb, *Pn, *Pm;
4368    //   unsigned long Ra, Rb, Rn, Rm;
4369
4370    //   int i;
4371
4372    //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4373
4374    //   for (i = 0; i < len; i++) {
4375    //     int j;
4376
4377    //     Pa = Pa_base;
4378    //     Pb = Pb_base + i;
4379    //     Pm = Pm_base;
4380    //     Pn = Pn_base + i;
4381
4382    //     Ra = *Pa;
4383    //     Rb = *Pb;
4384    //     Rm = *Pm;
4385    //     Rn = *Pn;
4386
4387    //     int iters = i;
4388    //     for (j = 0; iters--; j++) {
4389    //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
4390    //       MACC(Ra, Rb, t0, t1, t2);
4391    //       Ra = *++Pa;
4392    //       Rb = *--Pb;
4393    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4394    //       MACC(Rm, Rn, t0, t1, t2);
4395    //       Rm = *++Pm;
4396    //       Rn = *--Pn;
4397    //     }
4398
4399    //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
4400    //     MACC(Ra, Rb, t0, t1, t2);
4401    //     *Pm = Rm = t0 * inv;
4402    //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4403    //     MACC(Rm, Rn, t0, t1, t2);
4404
4405    //     assert(t0 == 0, "broken Montgomery multiply");
4406
4407    //     t0 = t1; t1 = t2; t2 = 0;
4408    //   }
4409
4410    //   for (i = len; i < 2*len; i++) {
4411    //     int j;
4412
4413    //     Pa = Pa_base + i-len;
4414    //     Pb = Pb_base + len;
4415    //     Pm = Pm_base + i-len;
4416    //     Pn = Pn_base + len;
4417
4418    //     Ra = *++Pa;
4419    //     Rb = *--Pb;
4420    //     Rm = *++Pm;
4421    //     Rn = *--Pn;
4422
4423    //     int iters = len*2-i-1;
4424    //     for (j = i-len+1; iters--; j++) {
4425    //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
4426    //       MACC(Ra, Rb, t0, t1, t2);
4427    //       Ra = *++Pa;
4428    //       Rb = *--Pb;
4429    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4430    //       MACC(Rm, Rn, t0, t1, t2);
4431    //       Rm = *++Pm;
4432    //       Rn = *--Pn;
4433    //     }
4434
4435    //     Pm_base[i-len] = t0;
4436    //     t0 = t1; t1 = t2; t2 = 0;
4437    //   }
4438
4439    //   while (t0)
4440    //     t0 = sub(Pm_base, Pn_base, t0, len);
4441    // }
4442
4443    /**
4444     * Fast Montgomery squaring.  This uses asymptotically 25% fewer
4445     * multiplies than Montgomery multiplication so it should be up to
4446     * 25% faster.  However, its loop control is more complex and it
4447     * may actually run slower on some machines.
4448     *
4449     * Arguments:
4450     *
4451     * Inputs:
4452     *   c_rarg0   - int array elements a
4453     *   c_rarg1   - int array elements n (the modulus)
4454     *   c_rarg2   - int length
4455     *   c_rarg3   - int inv
4456     *   c_rarg4   - int array elements m (the result)
4457     *
4458     */
4459    address generate_square() {
4460      Label argh;
4461      bind(argh);
4462      stop("MontgomeryMultiply total_allocation must be <= 8192");
4463
4464      align(CodeEntryAlignment);
4465      address entry = pc();
4466
4467      enter();
4468
4469      // Make room.
4470      cmpw(Rlen, 512);
4471      br(Assembler::HI, argh);
4472      sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
4473      andr(sp, Ra, -2 * wordSize);
4474
4475      lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
4476
4477      {
4478        // Copy input args, reversing as we go.  We use Ra as a
4479        // temporary variable.
4480        reverse(Ra, Pa_base, Rlen, t0, t1);
4481        reverse(Ra, Pn_base, Rlen, t0, t1);
4482      }
4483
4484      // Push all call-saved registers and also Pm_base which we'll need
4485      // at the end.
4486      save_regs();
4487
4488      mov(Pm_base, Ra);
4489
4490      mov(t0, zr);
4491      mov(t1, zr);
4492      mov(t2, zr);
4493
4494      block_comment("for (int i = 0; i < len; i++) {");
4495      mov(Ri, zr); {
4496        Label loop, end;
4497        bind(loop);
4498        cmp(Ri, Rlen);
4499        br(Assembler::GE, end);
4500
4501        pre1(Ri);
4502
4503        block_comment("for (j = (i+1)/2; j; j--) {"); {
4504          add(Rj, Ri, 1);
4505          lsr(Rj, Rj, 1);
4506          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4507        } block_comment("  } // j");
4508
4509        last_squaring(Ri);
4510
4511        block_comment("  for (j = i/2; j; j--) {"); {
4512          lsr(Rj, Ri, 1);
4513          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4514        } block_comment("  } // j");
4515
4516        post1_squaring();
4517        add(Ri, Ri, 1);
4518        cmp(Ri, Rlen);
4519        br(Assembler::LT, loop);
4520
4521        bind(end);
4522        block_comment("} // i");
4523      }
4524
4525      block_comment("for (int i = len; i < 2*len; i++) {");
4526      mov(Ri, Rlen); {
4527        Label loop, end;
4528        bind(loop);
4529        cmp(Ri, Rlen, Assembler::LSL, 1);
4530        br(Assembler::GE, end);
4531
4532        pre2(Ri, Rlen);
4533
4534        block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
4535          lsl(Rj, Rlen, 1);
4536          sub(Rj, Rj, Ri);
4537          sub(Rj, Rj, 1);
4538          lsr(Rj, Rj, 1);
4539          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4540        } block_comment("  } // j");
4541
4542        last_squaring(Ri);
4543
4544        block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
4545          lsl(Rj, Rlen, 1);
4546          sub(Rj, Rj, Ri);
4547          lsr(Rj, Rj, 1);
4548          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4549        } block_comment("  } // j");
4550
4551        post2(Ri, Rlen);
4552        add(Ri, Ri, 1);
4553        cmp(Ri, Rlen, Assembler::LSL, 1);
4554
4555        br(Assembler::LT, loop);
4556        bind(end);
4557        block_comment("} // i");
4558      }
4559
4560      normalize(Rlen);
4561
4562      mov(Ra, Pm_base);  // Save Pm_base in Ra
4563      restore_regs();  // Restore caller's Pm_base
4564
4565      // Copy our result into caller's Pm_base
4566      reverse(Pm_base, Ra, Rlen, t0, t1);
4567
4568      leave();
4569      ret(lr);
4570
4571      return entry;
4572    }
4573    // In C, approximately:
4574
4575    // void
4576    // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
4577    //                   unsigned long Pm_base[], unsigned long inv, int len) {
4578    //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4579    //   unsigned long *Pa, *Pb, *Pn, *Pm;
4580    //   unsigned long Ra, Rb, Rn, Rm;
4581
4582    //   int i;
4583
4584    //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4585
4586    //   for (i = 0; i < len; i++) {
4587    //     int j;
4588
4589    //     Pa = Pa_base;
4590    //     Pb = Pa_base + i;
4591    //     Pm = Pm_base;
4592    //     Pn = Pn_base + i;
4593
4594    //     Ra = *Pa;
4595    //     Rb = *Pb;
4596    //     Rm = *Pm;
4597    //     Rn = *Pn;
4598
4599    //     int iters = (i+1)/2;
4600    //     for (j = 0; iters--; j++) {
4601    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4602    //       MACC2(Ra, Rb, t0, t1, t2);
4603    //       Ra = *++Pa;
4604    //       Rb = *--Pb;
4605    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4606    //       MACC(Rm, Rn, t0, t1, t2);
4607    //       Rm = *++Pm;
4608    //       Rn = *--Pn;
4609    //     }
4610    //     if ((i & 1) == 0) {
4611    //       assert(Ra == Pa_base[j], "must be");
4612    //       MACC(Ra, Ra, t0, t1, t2);
4613    //     }
4614    //     iters = i/2;
4615    //     assert(iters == i-j, "must be");
4616    //     for (; iters--; j++) {
4617    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4618    //       MACC(Rm, Rn, t0, t1, t2);
4619    //       Rm = *++Pm;
4620    //       Rn = *--Pn;
4621    //     }
4622
4623    //     *Pm = Rm = t0 * inv;
4624    //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4625    //     MACC(Rm, Rn, t0, t1, t2);
4626
4627    //     assert(t0 == 0, "broken Montgomery multiply");
4628
4629    //     t0 = t1; t1 = t2; t2 = 0;
4630    //   }
4631
4632    //   for (i = len; i < 2*len; i++) {
4633    //     int start = i-len+1;
4634    //     int end = start + (len - start)/2;
4635    //     int j;
4636
4637    //     Pa = Pa_base + i-len;
4638    //     Pb = Pa_base + len;
4639    //     Pm = Pm_base + i-len;
4640    //     Pn = Pn_base + len;
4641
4642    //     Ra = *++Pa;
4643    //     Rb = *--Pb;
4644    //     Rm = *++Pm;
4645    //     Rn = *--Pn;
4646
4647    //     int iters = (2*len-i-1)/2;
4648    //     assert(iters == end-start, "must be");
4649    //     for (j = start; iters--; j++) {
4650    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4651    //       MACC2(Ra, Rb, t0, t1, t2);
4652    //       Ra = *++Pa;
4653    //       Rb = *--Pb;
4654    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4655    //       MACC(Rm, Rn, t0, t1, t2);
4656    //       Rm = *++Pm;
4657    //       Rn = *--Pn;
4658    //     }
4659    //     if ((i & 1) == 0) {
4660    //       assert(Ra == Pa_base[j], "must be");
4661    //       MACC(Ra, Ra, t0, t1, t2);
4662    //     }
4663    //     iters = (2*len-i)/2;
4664    //     assert(iters == len-j, "must be");
4665    //     for (; iters--; j++) {
4666    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4667    //       MACC(Rm, Rn, t0, t1, t2);
4668    //       Rm = *++Pm;
4669    //       Rn = *--Pn;
4670    //     }
4671    //     Pm_base[i-len] = t0;
4672    //     t0 = t1; t1 = t2; t2 = 0;
4673    //   }
4674
4675    //   while (t0)
4676    //     t0 = sub(Pm_base, Pn_base, t0, len);
4677    // }
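
    // In the sketch above, MACC(A, B, t0, t1, t2) accumulates the 128-bit
    // product A*B into the triple-precision accumulator {t2:t1:t0}, and
    // MACC2 accumulates 2*A*B (each off-diagonal term of the square is
    // visited only once).  A rough sketch of MACC, assuming GCC's
    // unsigned __int128 and C++ references:

    //   static inline void MACC(unsigned long A, unsigned long B,
    //                           unsigned long &t0, unsigned long &t1,
    //                           unsigned long &t2) {
    //     unsigned __int128 p = (unsigned __int128)A * B;
    //     unsigned __int128 s = (unsigned __int128)t0 + (unsigned long)p;
    //     t0 = (unsigned long)s;
    //     s = (s >> 64) + t1 + (unsigned long)(p >> 64);
    //     t1 = (unsigned long)s;
    //     t2 += (unsigned long)(s >> 64);
    //   }

    // The generated code forms the same 128-bit product with mul/umulh and
    // folds it into t0, t1 and t2 with an adds/adcs/adc sequence.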
4678  };
4679
4680  // Initialization
4681  void generate_initial() {
4682    // Generates initial stubs and initializes the entry points
4683
4684    // Entry points that exist on all platforms.  Note: This is code
4685    // that could be shared among different platforms - however the
4686    // benefit seems to be smaller than the disadvantage of having a
4687    // much more complicated generator structure. See also the comment
4688    // in stubRoutines.hpp.
4689
4690    StubRoutines::_forward_exception_entry = generate_forward_exception();
4691
4692    StubRoutines::_call_stub_entry =
4693      generate_call_stub(StubRoutines::_call_stub_return_address);
4694
4695    // This entry point is referenced by megamorphic calls.
4696    StubRoutines::_catch_exception_entry = generate_catch_exception();
4697
4698    // Build this early so it's available for the interpreter.
4699    StubRoutines::_throw_StackOverflowError_entry =
4700      generate_throw_exception("StackOverflowError throw_exception",
4701                               CAST_FROM_FN_PTR(address,
4702                                                SharedRuntime::throw_StackOverflowError));
4703    StubRoutines::_throw_delayed_StackOverflowError_entry =
4704      generate_throw_exception("delayed StackOverflowError throw_exception",
4705                               CAST_FROM_FN_PTR(address,
4706                                                SharedRuntime::throw_delayed_StackOverflowError));
4707    if (UseCRC32Intrinsics) {
4708      // Set the CRC table address before generating the stub, which uses it.
4709      StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
4710      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
4711    }
4712  }
4713
4714  void generate_all() {
4715    // support for verify_oop (must happen after universe_init)
4716    StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
4717    StubRoutines::_throw_AbstractMethodError_entry =
4718      generate_throw_exception("AbstractMethodError throw_exception",
4719                               CAST_FROM_FN_PTR(address,
4720                                                SharedRuntime::
4721                                                throw_AbstractMethodError));
4722
4723    StubRoutines::_throw_IncompatibleClassChangeError_entry =
4724      generate_throw_exception("IncompatibleClassChangeError throw_exception",
4725                               CAST_FROM_FN_PTR(address,
4726                                                SharedRuntime::
4727                                                throw_IncompatibleClassChangeError));
4728
4729    StubRoutines::_throw_NullPointerException_at_call_entry =
4730      generate_throw_exception("NullPointerException at call throw_exception",
4731                               CAST_FROM_FN_PTR(address,
4732                                                SharedRuntime::
4733                                                throw_NullPointerException_at_call));
4734
4735    // arraycopy stubs used by compilers
4736    generate_arraycopy_stubs();
4737
4738    if (UseMultiplyToLenIntrinsic) {
4739      StubRoutines::_multiplyToLen = generate_multiplyToLen();
4740    }
4741
4742    if (UseMontgomeryMultiplyIntrinsic) {
4743      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
4744      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
4745      StubRoutines::_montgomeryMultiply = g.generate_multiply();
4746    }
4747
4748    if (UseMontgomerySquareIntrinsic) {
4749      StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
4750      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
4751      // We use generate_multiply() rather than generate_square()
4752      // because it's faster for the sizes of modulus we care about.
4753      StubRoutines::_montgomerySquare = g.generate_multiply();
4754    }
4755
4756#ifndef BUILTIN_SIM
4757    // generate GHASH intrinsics code
4758    if (UseGHASHIntrinsics) {
4759      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
4760    }
4761
4762    if (UseAESIntrinsics) {
4763      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
4764      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
4765      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
4766      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
4767    }
4768
4769    if (UseSHA1Intrinsics) {
4770      StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
4771      StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
4772    }
4773    if (UseSHA256Intrinsics) {
4774      StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
4775      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
4776    }
4777
4778    if (UseCRC32CIntrinsics) {
4779      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
4780    }
4781
4782    // generate Adler32 intrinsics code
4783    if (UseAdler32Intrinsics) {
4784      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
4785    }
4786
4787    // Safefetch stubs.
4788    generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
4789                                                       &StubRoutines::_safefetch32_fault_pc,
4790                                                       &StubRoutines::_safefetch32_continuation_pc);
4791    generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
4792                                                       &StubRoutines::_safefetchN_fault_pc,
4793                                                       &StubRoutines::_safefetchN_continuation_pc);
4794#endif
4795    StubRoutines::aarch64::set_completed();
4796  }
4797
4798 public:
4799  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
4800    if (all) {
4801      generate_all();
4802    } else {
4803      generate_initial();
4804    }
4805  }
4806}; // end class declaration
4807
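// StubGenerator_generate is expected to be called twice during VM startup
// (from StubRoutines::initialize1() and initialize2() in stubRoutines.cpp):
// first with all == false to build the initial stubs the interpreter
// depends on, then with all == true, after universe_init, to build the
// remaining stubs.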
4808void StubGenerator_generate(CodeBuffer* code, bool all) {
4809  StubGenerator g(code, all);
4810}
4811