1/*
2 * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26#include "precompiled.hpp"
27#include "asm/macroAssembler.hpp"
28#include "asm/macroAssembler.inline.hpp"
29#include "interpreter/interpreter.hpp"
30#include "nativeInst_aarch64.hpp"
31#include "oops/instanceOop.hpp"
32#include "oops/method.hpp"
33#include "oops/objArrayKlass.hpp"
34#include "oops/oop.inline.hpp"
35#include "prims/methodHandles.hpp"
36#include "runtime/frame.inline.hpp"
37#include "runtime/handles.inline.hpp"
38#include "runtime/sharedRuntime.hpp"
39#include "runtime/stubCodeGenerator.hpp"
40#include "runtime/stubRoutines.hpp"
41#include "runtime/thread.inline.hpp"
42#include "utilities/align.hpp"
43#ifdef COMPILER2
44#include "opto/runtime.hpp"
45#endif
46
47#ifdef BUILTIN_SIM
48#include "../../../../../../simulator/simulator.hpp"
49#endif
50
51// Declaration and definition of StubGenerator (no .hpp file).
52// For a more detailed description of the stub routine structure
53// see the comment in stubRoutines.hpp
54
55#undef __
56#define __ _masm->
57#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
58
59#ifdef PRODUCT
60#define BLOCK_COMMENT(str) /* nothing */
61#else
62#define BLOCK_COMMENT(str) __ block_comment(str)
63#endif
64
65#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
66
67// Stub Code definitions
68
69class StubGenerator: public StubCodeGenerator {
70 private:
71
72#ifdef PRODUCT
73#define inc_counter_np(counter) ((void)0)
74#else
75  void inc_counter_np_(int& counter) {
76    __ lea(rscratch2, ExternalAddress((address)&counter));
77    __ ldrw(rscratch1, Address(rscratch2));
78    __ addw(rscratch1, rscratch1, 1);
79    __ strw(rscratch1, Address(rscratch2));
80  }
81#define inc_counter_np(counter) \
82  BLOCK_COMMENT("inc_counter " #counter); \
83  inc_counter_np_(counter);
84#endif
85
86  // Call stubs are used to call Java from C
87  //
88  // Arguments:
89  //    c_rarg0:   call wrapper address                   address
90  //    c_rarg1:   result                                 address
91  //    c_rarg2:   result type                            BasicType
92  //    c_rarg3:   method                                 Method*
93  //    c_rarg4:   (interpreter) entry point              address
94  //    c_rarg5:   parameters                             intptr_t*
95  //    c_rarg6:   parameter size (in words)              int
96  //    c_rarg7:   thread                                 Thread*
97  //
98  // There is no return from the stub itself as any Java result
99  // is written to result
100  //
101  // we save r30 (lr) as the return PC at the base of the frame and
102  // link r29 (fp) below it as the frame pointer installing sp (r31)
103  // into fp.
104  //
105  // we save r0-r7, which accounts for all the c arguments.
106  //
107  // TODO: strictly do we need to save them all? they are treated as
108  // volatile by C so could we omit saving the ones we are going to
109  // place in global registers (thread? method?) or those we only use
110  // during setup of the Java call?
111  //
112  // we don't need to save r8 which C uses as an indirect result location
113  // return register.
114  //
115  // we don't need to save r9-r15 which both C and Java treat as
116  // volatile
117  //
118  // we don't need to save r16-18 because Java does not use them
119  //
120  // we save r19-r28 which Java uses as scratch registers and C
121  // expects to be callee-save
122  //
123  // we save the bottom 64 bits of each value stored in v8-v15; it is
124  // the responsibility of the caller to preserve larger values.
125  //
126  // so the stub frame looks like this when we enter Java code
127  //
128  //     [ return_from_Java     ] <--- sp
129  //     [ argument word n      ]
130  //      ...
131  // -27 [ argument word 1      ]
132  // -26 [ saved v15            ] <--- sp_after_call
133  // -25 [ saved v14            ]
134  // -24 [ saved v13            ]
135  // -23 [ saved v12            ]
136  // -22 [ saved v11            ]
137  // -21 [ saved v10            ]
138  // -20 [ saved v9             ]
139  // -19 [ saved v8             ]
140  // -18 [ saved r28            ]
141  // -17 [ saved r27            ]
142  // -16 [ saved r26            ]
143  // -15 [ saved r25            ]
144  // -14 [ saved r24            ]
145  // -13 [ saved r23            ]
146  // -12 [ saved r22            ]
147  // -11 [ saved r21            ]
148  // -10 [ saved r20            ]
149  //  -9 [ saved r19            ]
150  //  -8 [ call wrapper    (r0) ]
151  //  -7 [ result          (r1) ]
152  //  -6 [ result type     (r2) ]
153  //  -5 [ method          (r3) ]
154  //  -4 [ entry point     (r4) ]
155  //  -3 [ parameters      (r5) ]
156  //  -2 [ parameter size  (r6) ]
157  //  -1 [ thread (r7)          ]
158  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
159  //   1 [ saved lr       (r30) ]
160
161  // Call stub stack layout word offsets from fp
162  enum call_stub_layout {
163    sp_after_call_off = -26,
164
165    d15_off            = -26,
166    d13_off            = -24,
167    d11_off            = -22,
168    d9_off             = -20,
169
170    r28_off            = -18,
171    r26_off            = -16,
172    r24_off            = -14,
173    r22_off            = -12,
174    r20_off            = -10,
175    call_wrapper_off   =  -8,
176    result_off         =  -7,
177    result_type_off    =  -6,
178    method_off         =  -5,
179    entry_point_off    =  -4,
180    parameter_size_off =  -2,
181    thread_off         =  -1,
182    fp_f               =   0,
183    retaddr_off        =   1,
184  };
185
186  address generate_call_stub(address& return_address) {
187    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
188           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
189           "adjust this code");
190
191    StubCodeMark mark(this, "StubRoutines", "call_stub");
192    address start = __ pc();
193
194    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
195
196    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
197    const Address result        (rfp, result_off         * wordSize);
198    const Address result_type   (rfp, result_type_off    * wordSize);
199    const Address method        (rfp, method_off         * wordSize);
200    const Address entry_point   (rfp, entry_point_off    * wordSize);
201    const Address parameter_size(rfp, parameter_size_off * wordSize);
202
203    const Address thread        (rfp, thread_off         * wordSize);
204
205    const Address d15_save      (rfp, d15_off * wordSize);
206    const Address d13_save      (rfp, d13_off * wordSize);
207    const Address d11_save      (rfp, d11_off * wordSize);
208    const Address d9_save       (rfp, d9_off * wordSize);
209
210    const Address r28_save      (rfp, r28_off * wordSize);
211    const Address r26_save      (rfp, r26_off * wordSize);
212    const Address r24_save      (rfp, r24_off * wordSize);
213    const Address r22_save      (rfp, r22_off * wordSize);
214    const Address r20_save      (rfp, r20_off * wordSize);
215
216    // stub code
217
218    // we need a C prolog to bootstrap the x86 caller into the sim
219    __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);
220
221    address aarch64_entry = __ pc();
222
223#ifdef BUILTIN_SIM
224    // Save sender's SP for stack traces.
225    __ mov(rscratch1, sp);
226    __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
227#endif
228    // set up frame and move sp to end of save area
229    __ enter();
230    __ sub(sp, rfp, -sp_after_call_off * wordSize);
231
232    // save register parameters and Java scratch/global registers
233    // n.b. we save thread even though it gets installed in
234    // rthread because we want to sanity check rthread later
235    __ str(c_rarg7,  thread);
236    __ strw(c_rarg6, parameter_size);
237    __ stp(c_rarg4, c_rarg5,  entry_point);
238    __ stp(c_rarg2, c_rarg3,  result_type);
239    __ stp(c_rarg0, c_rarg1,  call_wrapper);
240
241    __ stp(r20, r19,   r20_save);
242    __ stp(r22, r21,   r22_save);
243    __ stp(r24, r23,   r24_save);
244    __ stp(r26, r25,   r26_save);
245    __ stp(r28, r27,   r28_save);
246
247    __ stpd(v9,  v8,   d9_save);
248    __ stpd(v11, v10,  d11_save);
249    __ stpd(v13, v12,  d13_save);
250    __ stpd(v15, v14,  d15_save);
251
252    // install Java thread in global register now we have saved
253    // whatever value it held
254    __ mov(rthread, c_rarg7);
255    // And method
256    __ mov(rmethod, c_rarg3);
257
258    // set up the heapbase register
259    __ reinit_heapbase();
260
261#ifdef ASSERT
262    // make sure we have no pending exceptions
263    {
264      Label L;
265      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
266      __ cmp(rscratch1, (unsigned)NULL_WORD);
267      __ br(Assembler::EQ, L);
268      __ stop("StubRoutines::call_stub: entered with pending exception");
269      __ BIND(L);
270    }
271#endif
272    // pass parameters if any
273    __ mov(esp, sp);
274    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
275    __ andr(sp, rscratch1, -2 * wordSize);
276
277    BLOCK_COMMENT("pass parameters if any");
278    Label parameters_done;
279    // parameter count is still in c_rarg6
280    // and parameter pointer identifying param 1 is in c_rarg5
281    __ cbzw(c_rarg6, parameters_done);
282
283    address loop = __ pc();
284    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
285    __ subsw(c_rarg6, c_rarg6, 1);
286    __ push(rscratch1);
287    __ br(Assembler::GT, loop);
288
289    __ BIND(parameters_done);
290
291    // call Java entry -- passing methdoOop, and current sp
292    //      rmethod: Method*
293    //      r13: sender sp
294    BLOCK_COMMENT("call Java function");
295    __ mov(r13, sp);
296    __ blr(c_rarg4);
297
298    // tell the simulator we have returned to the stub
299
300    // we do this here because the notify will already have been done
301    // if we get to the next instruction via an exception
302    //
303    // n.b. adding this instruction here affects the calculation of
304    // whether or not a routine returns to the call stub (used when
305    // doing stack walks) since the normal test is to check the return
306    // pc against the address saved below. so we may need to allow for
307    // this extra instruction in the check.
308
309    if (NotifySimulator) {
310      __ notify(Assembler::method_reentry);
311    }
312    // save current address for use by exception handling code
313
314    return_address = __ pc();
315
316    // store result depending on type (everything that is not
317    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
318    // n.b. this assumes Java returns an integral result in r0
319    // and a floating result in j_farg0
320    __ ldr(j_rarg2, result);
321    Label is_long, is_float, is_double, exit;
322    __ ldr(j_rarg1, result_type);
323    __ cmp(j_rarg1, T_OBJECT);
324    __ br(Assembler::EQ, is_long);
325    __ cmp(j_rarg1, T_LONG);
326    __ br(Assembler::EQ, is_long);
327    __ cmp(j_rarg1, T_FLOAT);
328    __ br(Assembler::EQ, is_float);
329    __ cmp(j_rarg1, T_DOUBLE);
330    __ br(Assembler::EQ, is_double);
331
332    // handle T_INT case
333    __ strw(r0, Address(j_rarg2));
334
335    __ BIND(exit);
336
337    // pop parameters
338    __ sub(esp, rfp, -sp_after_call_off * wordSize);
339
340#ifdef ASSERT
341    // verify that threads correspond
342    {
343      Label L, S;
344      __ ldr(rscratch1, thread);
345      __ cmp(rthread, rscratch1);
346      __ br(Assembler::NE, S);
347      __ get_thread(rscratch1);
348      __ cmp(rthread, rscratch1);
349      __ br(Assembler::EQ, L);
350      __ BIND(S);
351      __ stop("StubRoutines::call_stub: threads must correspond");
352      __ BIND(L);
353    }
354#endif
355
356    // restore callee-save registers
357    __ ldpd(v15, v14,  d15_save);
358    __ ldpd(v13, v12,  d13_save);
359    __ ldpd(v11, v10,  d11_save);
360    __ ldpd(v9,  v8,   d9_save);
361
362    __ ldp(r28, r27,   r28_save);
363    __ ldp(r26, r25,   r26_save);
364    __ ldp(r24, r23,   r24_save);
365    __ ldp(r22, r21,   r22_save);
366    __ ldp(r20, r19,   r20_save);
367
368    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
369    __ ldrw(c_rarg2, result_type);
370    __ ldr(c_rarg3,  method);
371    __ ldp(c_rarg4, c_rarg5,  entry_point);
372    __ ldp(c_rarg6, c_rarg7,  parameter_size);
373
374#ifndef PRODUCT
375    // tell the simulator we are about to end Java execution
376    if (NotifySimulator) {
377      __ notify(Assembler::method_exit);
378    }
379#endif
380    // leave frame and return to caller
381    __ leave();
382    __ ret(lr);
383
384    // handle return types different from T_INT
385
386    __ BIND(is_long);
387    __ str(r0, Address(j_rarg2, 0));
388    __ br(Assembler::AL, exit);
389
390    __ BIND(is_float);
391    __ strs(j_farg0, Address(j_rarg2, 0));
392    __ br(Assembler::AL, exit);
393
394    __ BIND(is_double);
395    __ strd(j_farg0, Address(j_rarg2, 0));
396    __ br(Assembler::AL, exit);
397
398    return start;
399  }
400
401  // Return point for a Java call if there's an exception thrown in
402  // Java code.  The exception is caught and transformed into a
403  // pending exception stored in JavaThread that can be tested from
404  // within the VM.
405  //
406  // Note: Usually the parameters are removed by the callee. In case
407  // of an exception crossing an activation frame boundary, that is
408  // not the case if the callee is compiled code => need to setup the
409  // rsp.
410  //
411  // r0: exception oop
412
413  // NOTE: this is used as a target from the signal handler so it
414  // needs an x86 prolog which returns into the current simulator
415  // executing the generated catch_exception code. so the prolog
416  // needs to install rax in a sim register and adjust the sim's
417  // restart pc to enter the generated code at the start position
418  // then return from native to simulated execution.
419
420  address generate_catch_exception() {
421    StubCodeMark mark(this, "StubRoutines", "catch_exception");
422    address start = __ pc();
423
424    // same as in generate_call_stub():
425    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
426    const Address thread        (rfp, thread_off         * wordSize);
427
428#ifdef ASSERT
429    // verify that threads correspond
430    {
431      Label L, S;
432      __ ldr(rscratch1, thread);
433      __ cmp(rthread, rscratch1);
434      __ br(Assembler::NE, S);
435      __ get_thread(rscratch1);
436      __ cmp(rthread, rscratch1);
437      __ br(Assembler::EQ, L);
438      __ bind(S);
439      __ stop("StubRoutines::catch_exception: threads must correspond");
440      __ bind(L);
441    }
442#endif
443
444    // set pending exception
445    __ verify_oop(r0);
446
447    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
448    __ mov(rscratch1, (address)__FILE__);
449    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
450    __ movw(rscratch1, (int)__LINE__);
451    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
452
453    // complete return to VM
454    assert(StubRoutines::_call_stub_return_address != NULL,
455           "_call_stub_return_address must have been generated before");
456    __ b(StubRoutines::_call_stub_return_address);
457
458    return start;
459  }
460
461  // Continuation point for runtime calls returning with a pending
462  // exception.  The pending exception check happened in the runtime
463  // or native call stub.  The pending exception in Thread is
464  // converted into a Java-level exception.
465  //
466  // Contract with Java-level exception handlers:
467  // r0: exception
468  // r3: throwing pc
469  //
470  // NOTE: At entry of this stub, exception-pc must be in LR !!
471
472  // NOTE: this is always used as a jump target within generated code
473  // so it just needs to be generated code wiht no x86 prolog
474
475  address generate_forward_exception() {
476    StubCodeMark mark(this, "StubRoutines", "forward exception");
477    address start = __ pc();
478
479    // Upon entry, LR points to the return address returning into
480    // Java (interpreted or compiled) code; i.e., the return address
481    // becomes the throwing pc.
482    //
483    // Arguments pushed before the runtime call are still on the stack
484    // but the exception handler will reset the stack pointer ->
485    // ignore them.  A potential result in registers can be ignored as
486    // well.
487
488#ifdef ASSERT
489    // make sure this code is only executed if there is a pending exception
490    {
491      Label L;
492      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
493      __ cbnz(rscratch1, L);
494      __ stop("StubRoutines::forward exception: no pending exception (1)");
495      __ bind(L);
496    }
497#endif
498
499    // compute exception handler into r19
500
501    // call the VM to find the handler address associated with the
502    // caller address. pass thread in r0 and caller pc (ret address)
503    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
504    // the stack.
505    __ mov(c_rarg1, lr);
506    // lr will be trashed by the VM call so we move it to R19
507    // (callee-saved) because we also need to pass it to the handler
508    // returned by this call.
509    __ mov(r19, lr);
510    BLOCK_COMMENT("call exception_handler_for_return_address");
511    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
512                         SharedRuntime::exception_handler_for_return_address),
513                    rthread, c_rarg1);
514    // we should not really care that lr is no longer the callee
515    // address. we saved the value the handler needs in r19 so we can
516    // just copy it to r3. however, the C2 handler will push its own
517    // frame and then calls into the VM and the VM code asserts that
518    // the PC for the frame above the handler belongs to a compiled
519    // Java method. So, we restore lr here to satisfy that assert.
520    __ mov(lr, r19);
521    // setup r0 & r3 & clear pending exception
522    __ mov(r3, r19);
523    __ mov(r19, r0);
524    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
525    __ str(zr, Address(rthread, Thread::pending_exception_offset()));
526
527#ifdef ASSERT
528    // make sure exception is set
529    {
530      Label L;
531      __ cbnz(r0, L);
532      __ stop("StubRoutines::forward exception: no pending exception (2)");
533      __ bind(L);
534    }
535#endif
536
537    // continue at exception handler
538    // r0: exception
539    // r3: throwing pc
540    // r19: exception handler
541    __ verify_oop(r0);
542    __ br(r19);
543
544    return start;
545  }
546
547  // Non-destructive plausibility checks for oops
548  //
549  // Arguments:
550  //    r0: oop to verify
551  //    rscratch1: error message
552  //
553  // Stack after saving c_rarg3:
554  //    [tos + 0]: saved c_rarg3
555  //    [tos + 1]: saved c_rarg2
556  //    [tos + 2]: saved lr
557  //    [tos + 3]: saved rscratch2
558  //    [tos + 4]: saved r0
559  //    [tos + 5]: saved rscratch1
560  address generate_verify_oop() {
561
562    StubCodeMark mark(this, "StubRoutines", "verify_oop");
563    address start = __ pc();
564
565    Label exit, error;
566
567    // save c_rarg2 and c_rarg3
568    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
569
570    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
571    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
572    __ ldr(c_rarg3, Address(c_rarg2));
573    __ add(c_rarg3, c_rarg3, 1);
574    __ str(c_rarg3, Address(c_rarg2));
575
576    // object is in r0
577    // make sure object is 'reasonable'
578    __ cbz(r0, exit); // if obj is NULL it is OK
579
580    // Check if the oop is in the right area of memory
581    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
582    __ andr(c_rarg2, r0, c_rarg3);
583    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
584
585    // Compare c_rarg2 and c_rarg3.  We don't use a compare
586    // instruction here because the flags register is live.
587    __ eor(c_rarg2, c_rarg2, c_rarg3);
588    __ cbnz(c_rarg2, error);
589
590    // make sure klass is 'reasonable', which is not zero.
591    __ load_klass(r0, r0);  // get klass
592    __ cbz(r0, error);      // if klass is NULL it is broken
593
594    // return if everything seems ok
595    __ bind(exit);
596
597    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
598    __ ret(lr);
599
600    // handle errors
601    __ bind(error);
602    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
603
604    __ push(RegSet::range(r0, r29), sp);
605    // debug(char* msg, int64_t pc, int64_t regs[])
606    __ mov(c_rarg0, rscratch1);      // pass address of error message
607    __ mov(c_rarg1, lr);             // pass return address
608    __ mov(c_rarg2, sp);             // pass address of regs on stack
609#ifndef PRODUCT
610    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
611#endif
612    BLOCK_COMMENT("call MacroAssembler::debug");
613    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
614    __ blrt(rscratch1, 3, 0, 1);
615
616    return start;
617  }
618
619  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
620
621  // Generate code for an array write pre barrier
622  //
623  //     addr       - starting address
624  //     count      - element count
625  //     tmp        - scratch register
626  //     saved_regs - registers to be saved before calling static_write_ref_array_pre
627  //
628  //     Callers must specify which registers to preserve in saved_regs.
629  //     Clobbers: r0-r18, v0-v7, v16-v31, except saved_regs.
630  //
631  void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized, RegSet saved_regs) {
632    BarrierSet* bs = Universe::heap()->barrier_set();
633    switch (bs->kind()) {
634    case BarrierSet::G1SATBCTLogging:
635      // With G1, don't generate the call if we statically know that the target in uninitialized
636      if (!dest_uninitialized) {
637        __ push(saved_regs, sp);
638        if (count == c_rarg0) {
639          if (addr == c_rarg1) {
640            // exactly backwards!!
641            __ mov(rscratch1, c_rarg0);
642            __ mov(c_rarg0, c_rarg1);
643            __ mov(c_rarg1, rscratch1);
644          } else {
645            __ mov(c_rarg1, count);
646            __ mov(c_rarg0, addr);
647          }
648        } else {
649          __ mov(c_rarg0, addr);
650          __ mov(c_rarg1, count);
651        }
652        __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
653        __ pop(saved_regs, sp);
654        break;
655      case BarrierSet::CardTableForRS:
656      case BarrierSet::CardTableExtension:
657      case BarrierSet::ModRef:
658        break;
659      default:
660        ShouldNotReachHere();
661
662      }
663    }
664  }
665
666  //
667  // Generate code for an array write post barrier
668  //
669  //  Input:
670  //     start      - register containing starting address of destination array
671  //     end        - register containing ending address of destination array
672  //     scratch    - scratch register
673  //     saved_regs - registers to be saved before calling static_write_ref_array_post
674  //
675  //  The input registers are overwritten.
676  //  The ending address is inclusive.
677  //  Callers must specify which registers to preserve in saved_regs.
678  //  Clobbers: r0-r18, v0-v7, v16-v31, except saved_regs.
679  void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch, RegSet saved_regs) {
680    assert_different_registers(start, end, scratch);
681    BarrierSet* bs = Universe::heap()->barrier_set();
682    switch (bs->kind()) {
683      case BarrierSet::G1SATBCTLogging:
684
685        {
686          __ push(saved_regs, sp);
687          // must compute element count unless barrier set interface is changed (other platforms supply count)
688          assert_different_registers(start, end, scratch);
689          __ lea(scratch, Address(end, BytesPerHeapOop));
690          __ sub(scratch, scratch, start);               // subtract start to get #bytes
691          __ lsr(scratch, scratch, LogBytesPerHeapOop);  // convert to element count
692          __ mov(c_rarg0, start);
693          __ mov(c_rarg1, scratch);
694          __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
695          __ pop(saved_regs, sp);
696        }
697        break;
698      case BarrierSet::CardTableForRS:
699      case BarrierSet::CardTableExtension:
700        {
701          CardTableModRefBS* ct = (CardTableModRefBS*)bs;
702          assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
703
704          Label L_loop;
705
706           __ lsr(start, start, CardTableModRefBS::card_shift);
707           __ lsr(end, end, CardTableModRefBS::card_shift);
708           __ sub(end, end, start); // number of bytes to copy
709
710          const Register count = end; // 'end' register contains bytes count now
711          __ load_byte_map_base(scratch);
712          __ add(start, start, scratch);
713          if (UseConcMarkSweepGC) {
714            __ membar(__ StoreStore);
715          }
716          __ BIND(L_loop);
717          __ strb(zr, Address(start, count));
718          __ subs(count, count, 1);
719          __ br(Assembler::GE, L_loop);
720        }
721        break;
722      default:
723        ShouldNotReachHere();
724
725    }
726  }
727
728  // The inner part of zero_words().  This is the bulk operation,
729  // zeroing words in blocks, possibly using DC ZVA to do it.  The
730  // caller is responsible for zeroing the last few words.
731  //
732  // Inputs:
733  // r10: the HeapWord-aligned base address of an array to zero.
734  // r11: the count in HeapWords, r11 > 0.
735  //
736  // Returns r10 and r11, adjusted for the caller to clear.
737  // r10: the base address of the tail of words left to clear.
738  // r11: the number of words in the tail.
739  //      r11 < MacroAssembler::zero_words_block_size.
740
741  address generate_zero_blocks() {
742    Label store_pair, loop_store_pair, done;
743    Label base_aligned;
744
745    Register base = r10, cnt = r11;
746
747    __ align(CodeEntryAlignment);
748    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
749    address start = __ pc();
750
751    if (UseBlockZeroing) {
752      int zva_length = VM_Version::zva_length();
753
754      // Ensure ZVA length can be divided by 16. This is required by
755      // the subsequent operations.
756      assert (zva_length % 16 == 0, "Unexpected ZVA Length");
757
758      __ tbz(base, 3, base_aligned);
759      __ str(zr, Address(__ post(base, 8)));
760      __ sub(cnt, cnt, 1);
761      __ bind(base_aligned);
762
763      // Ensure count >= zva_length * 2 so that it still deserves a zva after
764      // alignment.
765      Label small;
766      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
767      __ subs(rscratch1, cnt, low_limit >> 3);
768      __ br(Assembler::LT, small);
769      __ zero_dcache_blocks(base, cnt);
770      __ bind(small);
771    }
772
773    {
774      // Number of stp instructions we'll unroll
775      const int unroll =
776        MacroAssembler::zero_words_block_size / 2;
777      // Clear the remaining blocks.
778      Label loop;
779      __ subs(cnt, cnt, unroll * 2);
780      __ br(Assembler::LT, done);
781      __ bind(loop);
782      for (int i = 0; i < unroll; i++)
783        __ stp(zr, zr, __ post(base, 16));
784      __ subs(cnt, cnt, unroll * 2);
785      __ br(Assembler::GE, loop);
786      __ bind(done);
787      __ add(cnt, cnt, unroll * 2);
788    }
789
790    __ ret(lr);
791
792    return start;
793  }
794
795
796  typedef enum {
797    copy_forwards = 1,
798    copy_backwards = -1
799  } copy_direction;
800
801  // Bulk copy of blocks of 8 words.
802  //
803  // count is a count of words.
804  //
805  // Precondition: count >= 8
806  //
807  // Postconditions:
808  //
809  // The least significant bit of count contains the remaining count
810  // of words to copy.  The rest of count is trash.
811  //
812  // s and d are adjusted to point to the remaining words to copy
813  //
814  void generate_copy_longs(Label &start, Register s, Register d, Register count,
815                           copy_direction direction) {
816    int unit = wordSize * direction;
817    int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
818
819    int offset;
820    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
821      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
822    const Register stride = r13;
823
824    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
825    assert_different_registers(s, d, count, rscratch1);
826
827    Label again, drain;
828    const char *stub_name;
829    if (direction == copy_forwards)
830      stub_name = "forward_copy_longs";
831    else
832      stub_name = "backward_copy_longs";
833    StubCodeMark mark(this, "StubRoutines", stub_name);
834    __ align(CodeEntryAlignment);
835    __ bind(start);
836
837    Label unaligned_copy_long;
838    if (AvoidUnalignedAccesses) {
839      __ tbnz(d, 3, unaligned_copy_long);
840    }
841
842    if (direction == copy_forwards) {
843      __ sub(s, s, bias);
844      __ sub(d, d, bias);
845    }
846
847#ifdef ASSERT
848    // Make sure we are never given < 8 words
849    {
850      Label L;
851      __ cmp(count, 8);
852      __ br(Assembler::GE, L);
853      __ stop("genrate_copy_longs called with < 8 words");
854      __ bind(L);
855    }
856#endif
857
858    // Fill 8 registers
859    if (UseSIMDForMemoryOps) {
860      __ ldpq(v0, v1, Address(s, 4 * unit));
861      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
862    } else {
863      __ ldp(t0, t1, Address(s, 2 * unit));
864      __ ldp(t2, t3, Address(s, 4 * unit));
865      __ ldp(t4, t5, Address(s, 6 * unit));
866      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
867    }
868
869    __ subs(count, count, 16);
870    __ br(Assembler::LO, drain);
871
872    int prefetch = PrefetchCopyIntervalInBytes;
873    bool use_stride = false;
874    if (direction == copy_backwards) {
875       use_stride = prefetch > 256;
876       prefetch = -prefetch;
877       if (use_stride) __ mov(stride, prefetch);
878    }
879
880    __ bind(again);
881
882    if (PrefetchCopyIntervalInBytes > 0)
883      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
884
885    if (UseSIMDForMemoryOps) {
886      __ stpq(v0, v1, Address(d, 4 * unit));
887      __ ldpq(v0, v1, Address(s, 4 * unit));
888      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
889      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
890    } else {
891      __ stp(t0, t1, Address(d, 2 * unit));
892      __ ldp(t0, t1, Address(s, 2 * unit));
893      __ stp(t2, t3, Address(d, 4 * unit));
894      __ ldp(t2, t3, Address(s, 4 * unit));
895      __ stp(t4, t5, Address(d, 6 * unit));
896      __ ldp(t4, t5, Address(s, 6 * unit));
897      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
898      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
899    }
900
901    __ subs(count, count, 8);
902    __ br(Assembler::HS, again);
903
904    // Drain
905    __ bind(drain);
906    if (UseSIMDForMemoryOps) {
907      __ stpq(v0, v1, Address(d, 4 * unit));
908      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
909    } else {
910      __ stp(t0, t1, Address(d, 2 * unit));
911      __ stp(t2, t3, Address(d, 4 * unit));
912      __ stp(t4, t5, Address(d, 6 * unit));
913      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
914    }
915
916    {
917      Label L1, L2;
918      __ tbz(count, exact_log2(4), L1);
919      if (UseSIMDForMemoryOps) {
920        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
921        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
922      } else {
923        __ ldp(t0, t1, Address(s, 2 * unit));
924        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
925        __ stp(t0, t1, Address(d, 2 * unit));
926        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
927      }
928      __ bind(L1);
929
930      if (direction == copy_forwards) {
931        __ add(s, s, bias);
932        __ add(d, d, bias);
933      }
934
935      __ tbz(count, 1, L2);
936      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
937      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
938      __ bind(L2);
939    }
940
941    __ ret(lr);
942
943    if (AvoidUnalignedAccesses) {
944      Label drain, again;
945      // Register order for storing. Order is different for backward copy.
946
947      __ bind(unaligned_copy_long);
948
949      // source address is even aligned, target odd aligned
950      //
951      // when forward copying word pairs we read long pairs at offsets
952      // {0, 2, 4, 6} (in long words). when backwards copying we read
953      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
954      // address by -2 in the forwards case so we can compute the
955      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
956      // or -1.
957      //
958      // when forward copying we need to store 1 word, 3 pairs and
959      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather thna use a
960      // zero offset We adjust the destination by -1 which means we
961      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
962      //
963      // When backwards copyng we need to store 1 word, 3 pairs and
964      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
965      // offsets {1, 3, 5, 7, 8} * unit.
966
967      if (direction == copy_forwards) {
968        __ sub(s, s, 16);
969        __ sub(d, d, 8);
970      }
971
972      // Fill 8 registers
973      //
974      // for forwards copy s was offset by -16 from the original input
975      // value of s so the register contents are at these offsets
976      // relative to the 64 bit block addressed by that original input
977      // and so on for each successive 64 byte block when s is updated
978      //
979      // t0 at offset 0,  t1 at offset 8
980      // t2 at offset 16, t3 at offset 24
981      // t4 at offset 32, t5 at offset 40
982      // t6 at offset 48, t7 at offset 56
983
984      // for backwards copy s was not offset so the register contents
985      // are at these offsets into the preceding 64 byte block
986      // relative to that original input and so on for each successive
987      // preceding 64 byte block when s is updated. this explains the
988      // slightly counter-intuitive looking pattern of register usage
989      // in the stp instructions for backwards copy.
990      //
991      // t0 at offset -16, t1 at offset -8
992      // t2 at offset -32, t3 at offset -24
993      // t4 at offset -48, t5 at offset -40
994      // t6 at offset -64, t7 at offset -56
995
996      __ ldp(t0, t1, Address(s, 2 * unit));
997      __ ldp(t2, t3, Address(s, 4 * unit));
998      __ ldp(t4, t5, Address(s, 6 * unit));
999      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
1000
1001      __ subs(count, count, 16);
1002      __ br(Assembler::LO, drain);
1003
1004      int prefetch = PrefetchCopyIntervalInBytes;
1005      bool use_stride = false;
1006      if (direction == copy_backwards) {
1007         use_stride = prefetch > 256;
1008         prefetch = -prefetch;
1009         if (use_stride) __ mov(stride, prefetch);
1010      }
1011
1012      __ bind(again);
1013
1014      if (PrefetchCopyIntervalInBytes > 0)
1015        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
1016
1017      if (direction == copy_forwards) {
1018       // allowing for the offset of -8 the store instructions place
1019       // registers into the target 64 bit block at the following
1020       // offsets
1021       //
1022       // t0 at offset 0
1023       // t1 at offset 8,  t2 at offset 16
1024       // t3 at offset 24, t4 at offset 32
1025       // t5 at offset 40, t6 at offset 48
1026       // t7 at offset 56
1027
1028        __ str(t0, Address(d, 1 * unit));
1029        __ stp(t1, t2, Address(d, 2 * unit));
1030        __ ldp(t0, t1, Address(s, 2 * unit));
1031        __ stp(t3, t4, Address(d, 4 * unit));
1032        __ ldp(t2, t3, Address(s, 4 * unit));
1033        __ stp(t5, t6, Address(d, 6 * unit));
1034        __ ldp(t4, t5, Address(s, 6 * unit));
1035        __ str(t7, Address(__ pre(d, 8 * unit)));
1036        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
1037      } else {
1038       // d was not offset when we started so the registers are
1039       // written into the 64 bit block preceding d with the following
1040       // offsets
1041       //
1042       // t1 at offset -8
1043       // t3 at offset -24, t0 at offset -16
1044       // t5 at offset -48, t2 at offset -32
1045       // t7 at offset -56, t4 at offset -48
1046       //                   t6 at offset -64
1047       //
1048       // note that this matches the offsets previously noted for the
1049       // loads
1050
1051        __ str(t1, Address(d, 1 * unit));
1052        __ stp(t3, t0, Address(d, 3 * unit));
1053        __ ldp(t0, t1, Address(s, 2 * unit));
1054        __ stp(t5, t2, Address(d, 5 * unit));
1055        __ ldp(t2, t3, Address(s, 4 * unit));
1056        __ stp(t7, t4, Address(d, 7 * unit));
1057        __ ldp(t4, t5, Address(s, 6 * unit));
1058        __ str(t6, Address(__ pre(d, 8 * unit)));
1059        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
1060      }
1061
1062      __ subs(count, count, 8);
1063      __ br(Assembler::HS, again);
1064
1065      // Drain
1066      //
1067      // this uses the same pattern of offsets and register arguments
1068      // as above
1069      __ bind(drain);
1070      if (direction == copy_forwards) {
1071        __ str(t0, Address(d, 1 * unit));
1072        __ stp(t1, t2, Address(d, 2 * unit));
1073        __ stp(t3, t4, Address(d, 4 * unit));
1074        __ stp(t5, t6, Address(d, 6 * unit));
1075        __ str(t7, Address(__ pre(d, 8 * unit)));
1076      } else {
1077        __ str(t1, Address(d, 1 * unit));
1078        __ stp(t3, t0, Address(d, 3 * unit));
1079        __ stp(t5, t2, Address(d, 5 * unit));
1080        __ stp(t7, t4, Address(d, 7 * unit));
1081        __ str(t6, Address(__ pre(d, 8 * unit)));
1082      }
1083      // now we need to copy any remaining part block which may
1084      // include a 4 word block subblock and/or a 2 word subblock.
1085      // bits 2 and 1 in the count are the tell-tale for whetehr we
1086      // have each such subblock
1087      {
1088        Label L1, L2;
1089        __ tbz(count, exact_log2(4), L1);
1090       // this is the same as above but copying only 4 longs hence
1091       // with ony one intervening stp between the str instructions
1092       // but note that the offsets and registers still follow the
1093       // same pattern
1094        __ ldp(t0, t1, Address(s, 2 * unit));
1095        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
1096        if (direction == copy_forwards) {
1097          __ str(t0, Address(d, 1 * unit));
1098          __ stp(t1, t2, Address(d, 2 * unit));
1099          __ str(t3, Address(__ pre(d, 4 * unit)));
1100        } else {
1101          __ str(t1, Address(d, 1 * unit));
1102          __ stp(t3, t0, Address(d, 3 * unit));
1103          __ str(t2, Address(__ pre(d, 4 * unit)));
1104        }
1105        __ bind(L1);
1106
1107        __ tbz(count, 1, L2);
1108       // this is the same as above but copying only 2 longs hence
1109       // there is no intervening stp between the str instructions
1110       // but note that the offset and register patterns are still
1111       // the same
1112        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1113        if (direction == copy_forwards) {
1114          __ str(t0, Address(d, 1 * unit));
1115          __ str(t1, Address(__ pre(d, 2 * unit)));
1116        } else {
1117          __ str(t1, Address(d, 1 * unit));
1118          __ str(t0, Address(__ pre(d, 2 * unit)));
1119        }
1120        __ bind(L2);
1121
1122       // for forwards copy we need to re-adjust the offsets we
1123       // applied so that s and d are follow the last words written
1124
1125       if (direction == copy_forwards) {
1126         __ add(s, s, 16);
1127         __ add(d, d, 8);
1128       }
1129
1130      }
1131
1132      __ ret(lr);
1133      }
1134  }
1135
1136  // Small copy: less than 16 bytes.
1137  //
1138  // NB: Ignores all of the bits of count which represent more than 15
1139  // bytes, so a caller doesn't have to mask them.
1140
1141  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1142    bool is_backwards = step < 0;
1143    size_t granularity = uabs(step);
1144    int direction = is_backwards ? -1 : 1;
1145    int unit = wordSize * direction;
1146
1147    Label Lpair, Lword, Lint, Lshort, Lbyte;
1148
1149    assert(granularity
1150           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1151
1152    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1153
1154    // ??? I don't know if this bit-test-and-branch is the right thing
1155    // to do.  It does a lot of jumping, resulting in several
1156    // mispredicted branches.  It might make more sense to do this
1157    // with something like Duff's device with a single computed branch.
1158
1159    __ tbz(count, 3 - exact_log2(granularity), Lword);
1160    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1161    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1162    __ bind(Lword);
1163
1164    if (granularity <= sizeof (jint)) {
1165      __ tbz(count, 2 - exact_log2(granularity), Lint);
1166      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1167      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1168      __ bind(Lint);
1169    }
1170
1171    if (granularity <= sizeof (jshort)) {
1172      __ tbz(count, 1 - exact_log2(granularity), Lshort);
1173      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1174      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1175      __ bind(Lshort);
1176    }
1177
1178    if (granularity <= sizeof (jbyte)) {
1179      __ tbz(count, 0, Lbyte);
1180      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1181      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1182      __ bind(Lbyte);
1183    }
1184  }
1185
1186  Label copy_f, copy_b;
1187
1188  // All-singing all-dancing memory copy.
1189  //
1190  // Copy count units of memory from s to d.  The size of a unit is
1191  // step, which can be positive or negative depending on the direction
1192  // of copy.  If is_aligned is false, we align the source address.
1193  //
1194
1195  void copy_memory(bool is_aligned, Register s, Register d,
1196                   Register count, Register tmp, int step) {
1197    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1198    bool is_backwards = step < 0;
1199    int granularity = uabs(step);
1200    const Register t0 = r3, t1 = r4;
1201
1202    // <= 96 bytes do inline. Direction doesn't matter because we always
1203    // load all the data before writing anything
1204    Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
1205    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1206    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1207    const Register send = r17, dend = r18;
1208
1209    if (PrefetchCopyIntervalInBytes > 0)
1210      __ prfm(Address(s, 0), PLDL1KEEP);
1211    __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
1212    __ br(Assembler::HI, copy_big);
1213
1214    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1215    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1216
1217    __ cmp(count, 16/granularity);
1218    __ br(Assembler::LS, copy16);
1219
1220    __ cmp(count, 64/granularity);
1221    __ br(Assembler::HI, copy80);
1222
1223    __ cmp(count, 32/granularity);
1224    __ br(Assembler::LS, copy32);
1225
1226    // 33..64 bytes
1227    if (UseSIMDForMemoryOps) {
1228      __ ldpq(v0, v1, Address(s, 0));
1229      __ ldpq(v2, v3, Address(send, -32));
1230      __ stpq(v0, v1, Address(d, 0));
1231      __ stpq(v2, v3, Address(dend, -32));
1232    } else {
1233      __ ldp(t0, t1, Address(s, 0));
1234      __ ldp(t2, t3, Address(s, 16));
1235      __ ldp(t4, t5, Address(send, -32));
1236      __ ldp(t6, t7, Address(send, -16));
1237
1238      __ stp(t0, t1, Address(d, 0));
1239      __ stp(t2, t3, Address(d, 16));
1240      __ stp(t4, t5, Address(dend, -32));
1241      __ stp(t6, t7, Address(dend, -16));
1242    }
1243    __ b(finish);
1244
1245    // 17..32 bytes
1246    __ bind(copy32);
1247    __ ldp(t0, t1, Address(s, 0));
1248    __ ldp(t2, t3, Address(send, -16));
1249    __ stp(t0, t1, Address(d, 0));
1250    __ stp(t2, t3, Address(dend, -16));
1251    __ b(finish);
1252
1253    // 65..80/96 bytes
1254    // (96 bytes if SIMD because we do 32 byes per instruction)
1255    __ bind(copy80);
1256    if (UseSIMDForMemoryOps) {
1257      __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1258      __ ldpq(v4, v5, Address(send, -32));
1259      __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1260      __ stpq(v4, v5, Address(dend, -32));
1261    } else {
1262      __ ldp(t0, t1, Address(s, 0));
1263      __ ldp(t2, t3, Address(s, 16));
1264      __ ldp(t4, t5, Address(s, 32));
1265      __ ldp(t6, t7, Address(s, 48));
1266      __ ldp(t8, t9, Address(send, -16));
1267
1268      __ stp(t0, t1, Address(d, 0));
1269      __ stp(t2, t3, Address(d, 16));
1270      __ stp(t4, t5, Address(d, 32));
1271      __ stp(t6, t7, Address(d, 48));
1272      __ stp(t8, t9, Address(dend, -16));
1273    }
1274    __ b(finish);
1275
1276    // 0..16 bytes
1277    __ bind(copy16);
1278    __ cmp(count, 8/granularity);
1279    __ br(Assembler::LO, copy8);
1280
1281    // 8..16 bytes
1282    __ ldr(t0, Address(s, 0));
1283    __ ldr(t1, Address(send, -8));
1284    __ str(t0, Address(d, 0));
1285    __ str(t1, Address(dend, -8));
1286    __ b(finish);
1287
1288    if (granularity < 8) {
1289      // 4..7 bytes
1290      __ bind(copy8);
1291      __ tbz(count, 2 - exact_log2(granularity), copy4);
1292      __ ldrw(t0, Address(s, 0));
1293      __ ldrw(t1, Address(send, -4));
1294      __ strw(t0, Address(d, 0));
1295      __ strw(t1, Address(dend, -4));
1296      __ b(finish);
1297      if (granularity < 4) {
1298        // 0..3 bytes
1299        __ bind(copy4);
1300        __ cbz(count, finish); // get rid of 0 case
1301        if (granularity == 2) {
1302          __ ldrh(t0, Address(s, 0));
1303          __ strh(t0, Address(d, 0));
1304        } else { // granularity == 1
1305          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1306          // the first and last byte.
1307          // Handle the 3 byte case by loading and storing base + count/2
1308          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1309          // This does means in the 1 byte case we load/store the same
1310          // byte 3 times.
1311          __ lsr(count, count, 1);
1312          __ ldrb(t0, Address(s, 0));
1313          __ ldrb(t1, Address(send, -1));
1314          __ ldrb(t2, Address(s, count));
1315          __ strb(t0, Address(d, 0));
1316          __ strb(t1, Address(dend, -1));
1317          __ strb(t2, Address(d, count));
1318        }
1319        __ b(finish);
1320      }
1321    }
1322
1323    __ bind(copy_big);
1324    if (is_backwards) {
1325      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1326      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1327    }
1328
1329    // Now we've got the small case out of the way we can align the
1330    // source address on a 2-word boundary.
1331
1332    Label aligned;
1333
1334    if (is_aligned) {
1335      // We may have to adjust by 1 word to get s 2-word-aligned.
1336      __ tbz(s, exact_log2(wordSize), aligned);
1337      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1338      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1339      __ sub(count, count, wordSize/granularity);
1340    } else {
1341      if (is_backwards) {
1342        __ andr(rscratch2, s, 2 * wordSize - 1);
1343      } else {
1344        __ neg(rscratch2, s);
1345        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1346      }
1347      // rscratch2 is the byte adjustment needed to align s.
1348      __ cbz(rscratch2, aligned);
1349      int shift = exact_log2(granularity);
1350      if (shift)  __ lsr(rscratch2, rscratch2, shift);
1351      __ sub(count, count, rscratch2);
1352
1353#if 0
1354      // ?? This code is only correct for a disjoint copy.  It may or
1355      // may not make sense to use it in that case.
1356
1357      // Copy the first pair; s and d may not be aligned.
1358      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1359      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1360
1361      // Align s and d, adjust count
1362      if (is_backwards) {
1363        __ sub(s, s, rscratch2);
1364        __ sub(d, d, rscratch2);
1365      } else {
1366        __ add(s, s, rscratch2);
1367        __ add(d, d, rscratch2);
1368      }
1369#else
1370      copy_memory_small(s, d, rscratch2, rscratch1, step);
1371#endif
1372    }
1373
1374    __ bind(aligned);
1375
1376    // s is now 2-word-aligned.
1377
1378    // We have a count of units and some trailing bytes.  Adjust the
1379    // count and do a bulk copy of words.
1380    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1381    if (direction == copy_forwards)
1382      __ bl(copy_f);
1383    else
1384      __ bl(copy_b);
1385
1386    // And the tail.
1387    copy_memory_small(s, d, count, tmp, step);
1388
1389    if (granularity >= 8) __ bind(copy8);
1390    if (granularity >= 4) __ bind(copy4);
1391    __ bind(finish);
1392  }
1393
1394
1395  void clobber_registers() {
1396#ifdef ASSERT
1397    __ mov(rscratch1, (uint64_t)0xdeadbeef);
1398    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1399    for (Register r = r3; r <= r18; r++)
1400      if (r != rscratch1) __ mov(r, rscratch1);
1401#endif
1402  }
1403
1404  // Scan over array at a for count oops, verifying each one.
1405  // Preserves a and count, clobbers rscratch1 and rscratch2.
1406  void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1407    Label loop, end;
1408    __ mov(rscratch1, a);
1409    __ mov(rscratch2, zr);
1410    __ bind(loop);
1411    __ cmp(rscratch2, count);
1412    __ br(Assembler::HS, end);
1413    if (size == (size_t)wordSize) {
1414      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1415      __ verify_oop(temp);
1416    } else {
1417      __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1418      __ decode_heap_oop(temp); // calls verify_oop
1419    }
1420    __ add(rscratch2, rscratch2, size);
1421    __ b(loop);
1422    __ bind(end);
1423  }
1424
1425  // Arguments:
1426  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1427  //             ignored
1428  //   is_oop  - true => oop array, so generate store check code
1429  //   name    - stub name string
1430  //
1431  // Inputs:
1432  //   c_rarg0   - source array address
1433  //   c_rarg1   - destination array address
1434  //   c_rarg2   - element count, treated as ssize_t, can be zero
1435  //
1436  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1437  // the hardware handle it.  The two dwords within qwords that span
1438  // cache line boundaries will still be loaded and stored atomicly.
1439  //
1440  // Side Effects:
1441  //   disjoint_int_copy_entry is set to the no-overlap entry point
1442  //   used by generate_conjoint_int_oop_copy().
1443  //
1444  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1445                                  const char *name, bool dest_uninitialized = false) {
1446    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1447    RegSet saved_reg = RegSet::of(s, d, count);
1448    __ align(CodeEntryAlignment);
1449    StubCodeMark mark(this, "StubRoutines", name);
1450    address start = __ pc();
1451    __ enter();
1452
1453    if (entry != NULL) {
1454      *entry = __ pc();
1455      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1456      BLOCK_COMMENT("Entry:");
1457    }
1458
1459    if (is_oop) {
1460      gen_write_ref_array_pre_barrier(d, count, dest_uninitialized, saved_reg);
1461      // save regs before copy_memory
1462      __ push(RegSet::of(d, count), sp);
1463    }
1464    copy_memory(aligned, s, d, count, rscratch1, size);
1465    if (is_oop) {
1466      __ pop(RegSet::of(d, count), sp);
1467      if (VerifyOops)
1468        verify_oop_array(size, d, count, r16);
1469      __ sub(count, count, 1); // make an inclusive end pointer
1470      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1471      gen_write_ref_array_post_barrier(d, count, rscratch1, RegSet());
1472    }
1473    __ leave();
1474    __ mov(r0, zr); // return 0
1475    __ ret(lr);
1476#ifdef BUILTIN_SIM
1477    {
1478      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1479      sim->notifyCompile(const_cast<char*>(name), start);
1480    }
1481#endif
1482    return start;
1483  }
1484
1485  // Arguments:
1486  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1487  //             ignored
1488  //   is_oop  - true => oop array, so generate store check code
1489  //   name    - stub name string
1490  //
1491  // Inputs:
1492  //   c_rarg0   - source array address
1493  //   c_rarg1   - destination array address
1494  //   c_rarg2   - element count, treated as ssize_t, can be zero
1495  //
1496  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1497  // the hardware handle it.  The two dwords within qwords that span
1498  // cache line boundaries will still be loaded and stored atomicly.
1499  //
1500  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1501                                 address *entry, const char *name,
1502                                 bool dest_uninitialized = false) {
1503    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1504    RegSet saved_regs = RegSet::of(s, d, count);
1505    StubCodeMark mark(this, "StubRoutines", name);
1506    address start = __ pc();
1507    __ enter();
1508
1509    if (entry != NULL) {
1510      *entry = __ pc();
1511      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1512      BLOCK_COMMENT("Entry:");
1513    }
1514
1515    // use fwd copy when (d-s) above_equal (count*size)
1516    __ sub(rscratch1, d, s);
1517    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1518    __ br(Assembler::HS, nooverlap_target);
1519
1520    if (is_oop) {
1521      gen_write_ref_array_pre_barrier(d, count, dest_uninitialized, saved_regs);
1522      // save regs before copy_memory
1523      __ push(RegSet::of(d, count), sp);
1524    }
1525    copy_memory(aligned, s, d, count, rscratch1, -size);
1526    if (is_oop) {
1527      __ pop(RegSet::of(d, count), sp);
1528      if (VerifyOops)
1529        verify_oop_array(size, d, count, r16);
1530      __ sub(count, count, 1); // make an inclusive end pointer
1531      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1532      gen_write_ref_array_post_barrier(d, count, rscratch1, RegSet());
1533    }
1534    __ leave();
1535    __ mov(r0, zr); // return 0
1536    __ ret(lr);
1537#ifdef BUILTIN_SIM
1538    {
1539      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1540      sim->notifyCompile(const_cast<char*>(name), start);
1541    }
1542#endif
1543    return start;
1544}
1545
1546  // Arguments:
1547  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1548  //             ignored
1549  //   name    - stub name string
1550  //
1551  // Inputs:
1552  //   c_rarg0   - source array address
1553  //   c_rarg1   - destination array address
1554  //   c_rarg2   - element count, treated as ssize_t, can be zero
1555  //
1556  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1557  // we let the hardware handle it.  The one to eight bytes within words,
1558  // dwords or qwords that span cache line boundaries will still be loaded
1559  // and stored atomically.
1560  //
1561  // Side Effects:
1562  //   disjoint_byte_copy_entry is set to the no-overlap entry point  //
1563  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1564  // we let the hardware handle it.  The one to eight bytes within words,
1565  // dwords or qwords that span cache line boundaries will still be loaded
1566  // and stored atomically.
1567  //
1568  // Side Effects:
1569  //   disjoint_byte_copy_entry is set to the no-overlap entry point
1570  //   used by generate_conjoint_byte_copy().
1571  //
1572  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1573    const bool not_oop = false;
1574    return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1575  }
1576
1577  // Arguments:
1578  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1579  //             ignored
1580  //   name    - stub name string
1581  //
1582  // Inputs:
1583  //   c_rarg0   - source array address
1584  //   c_rarg1   - destination array address
1585  //   c_rarg2   - element count, treated as ssize_t, can be zero
1586  //
1587  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1588  // we let the hardware handle it.  The one to eight bytes within words,
1589  // dwords or qwords that span cache line boundaries will still be loaded
1590  // and stored atomically.
1591  //
1592  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1593                                      address* entry, const char *name) {
1594    const bool not_oop = false;
1595    return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1596  }
1597
1598  // Arguments:
1599  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1600  //             ignored
1601  //   name    - stub name string
1602  //
1603  // Inputs:
1604  //   c_rarg0   - source array address
1605  //   c_rarg1   - destination array address
1606  //   c_rarg2   - element count, treated as ssize_t, can be zero
1607  //
1608  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1609  // let the hardware handle it.  The two or four words within dwords
1610  // or qwords that span cache line boundaries will still be loaded
1611  // and stored atomically.
1612  //
1613  // Side Effects:
1614  //   disjoint_short_copy_entry is set to the no-overlap entry point
1615  //   used by generate_conjoint_short_copy().
1616  //
1617  address generate_disjoint_short_copy(bool aligned,
1618                                       address* entry, const char *name) {
1619    const bool not_oop = false;
1620    return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1621  }
1622
1623  // Arguments:
1624  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1625  //             ignored
1626  //   name    - stub name string
1627  //
1628  // Inputs:
1629  //   c_rarg0   - source array address
1630  //   c_rarg1   - destination array address
1631  //   c_rarg2   - element count, treated as ssize_t, can be zero
1632  //
1633  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1634  // let the hardware handle it.  The two or four words within dwords
1635  // or qwords that span cache line boundaries will still be loaded
1636  // and stored atomically.
1637  //
1638  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1639                                       address *entry, const char *name) {
1640    const bool not_oop = false;
1641    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1642
1643  }
1644  // Arguments:
1645  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1646  //             ignored
1647  //   name    - stub name string
1648  //
1649  // Inputs:
1650  //   c_rarg0   - source array address
1651  //   c_rarg1   - destination array address
1652  //   c_rarg2   - element count, treated as ssize_t, can be zero
1653  //
1654  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1655  // the hardware handle it.  The two dwords within qwords that span
1656  // cache line boundaries will still be loaded and stored atomicly.
1657  //
1658  // Side Effects:
1659  //   disjoint_int_copy_entry is set to the no-overlap entry point
1660  //   used by generate_conjoint_int_oop_copy().
1661  //
1662  address generate_disjoint_int_copy(bool aligned, address *entry,
1663                                         const char *name, bool dest_uninitialized = false) {
1664    const bool not_oop = false;
1665    return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1666  }
1667
1668  // Arguments:
1669  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1670  //             ignored
1671  //   name    - stub name string
1672  //
1673  // Inputs:
1674  //   c_rarg0   - source array address
1675  //   c_rarg1   - destination array address
1676  //   c_rarg2   - element count, treated as ssize_t, can be zero
1677  //
1678  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1679  // the hardware handle it.  The two dwords within qwords that span
1680  // cache line boundaries will still be loaded and stored atomicly.
1681  //
1682  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1683                                     address *entry, const char *name,
1684                                     bool dest_uninitialized = false) {
1685    const bool not_oop = false;
1686    return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1687  }
1688
1689
1690  // Arguments:
1691  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1692  //             ignored
1693  //   name    - stub name string
1694  //
1695  // Inputs:
1696  //   c_rarg0   - source array address
1697  //   c_rarg1   - destination array address
1698  //   c_rarg2   - element count, treated as size_t, can be zero
1699  //
1700  // Side Effects:
1701  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1702  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1703  //
1704  address generate_disjoint_long_copy(bool aligned, address *entry,
1705                                          const char *name, bool dest_uninitialized = false) {
1706    const bool not_oop = false;
1707    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1708  }
1709
1710  // Arguments:
1711  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1712  //             ignored
1713  //   name    - stub name string
1714  //
1715  // Inputs:
1716  //   c_rarg0   - source array address
1717  //   c_rarg1   - destination array address
1718  //   c_rarg2   - element count, treated as size_t, can be zero
1719  //
1720  address generate_conjoint_long_copy(bool aligned,
1721                                      address nooverlap_target, address *entry,
1722                                      const char *name, bool dest_uninitialized = false) {
1723    const bool not_oop = false;
1724    return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1725  }
1726
1727  // Arguments:
1728  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1729  //             ignored
1730  //   name    - stub name string
1731  //
1732  // Inputs:
1733  //   c_rarg0   - source array address
1734  //   c_rarg1   - destination array address
1735  //   c_rarg2   - element count, treated as size_t, can be zero
1736  //
1737  // Side Effects:
1738  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1739  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1740  //
1741  address generate_disjoint_oop_copy(bool aligned, address *entry,
1742                                     const char *name, bool dest_uninitialized) {
1743    const bool is_oop = true;
1744    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1745    return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1746  }
1747
1748  // Arguments:
1749  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1750  //             ignored
1751  //   name    - stub name string
1752  //
1753  // Inputs:
1754  //   c_rarg0   - source array address
1755  //   c_rarg1   - destination array address
1756  //   c_rarg2   - element count, treated as size_t, can be zero
1757  //
1758  address generate_conjoint_oop_copy(bool aligned,
1759                                     address nooverlap_target, address *entry,
1760                                     const char *name, bool dest_uninitialized) {
1761    const bool is_oop = true;
1762    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1763    return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1764                                  name, dest_uninitialized);
1765  }
1766
1767
1768  // Helper for generating a dynamic type check.
1769  // Smashes rscratch1.
1770  void generate_type_check(Register sub_klass,
1771                           Register super_check_offset,
1772                           Register super_klass,
1773                           Label& L_success) {
1774    assert_different_registers(sub_klass, super_check_offset, super_klass);
1775
1776    BLOCK_COMMENT("type_check:");
1777
1778    Label L_miss;
1779
1780    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1781                                     super_check_offset);
1782    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1783
1784    // Fall through on failure!
1785    __ BIND(L_miss);
1786  }
1787
1788  //
1789  //  Generate checkcasting array copy stub
1790  //
1791  //  Input:
1792  //    c_rarg0   - source array address
1793  //    c_rarg1   - destination array address
1794  //    c_rarg2   - element count, treated as ssize_t, can be zero
1795  //    c_rarg3   - size_t ckoff (super_check_offset)
1796  //    c_rarg4   - oop ckval (super_klass)
1797  //
1798  //  Output:
1799  //    r0 ==  0  -  success
1800  //    r0 == -1^K - failure, where K is partial transfer count
1801  //
1802  address generate_checkcast_copy(const char *name, address *entry,
1803                                  bool dest_uninitialized = false) {
1804
1805    Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1806
1807    // Input registers (after setup_arg_regs)
1808    const Register from        = c_rarg0;   // source array address
1809    const Register to          = c_rarg1;   // destination array address
1810    const Register count       = c_rarg2;   // elementscount
1811    const Register ckoff       = c_rarg3;   // super_check_offset
1812    const Register ckval       = c_rarg4;   // super_klass
1813
1814    RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1815    RegSet wb_post_saved_regs = RegSet::of(count);
1816
1817    // Registers used as temps (r18, r19, r20 are save-on-entry)
1818    const Register count_save  = r21;       // orig elementscount
1819    const Register start_to    = r20;       // destination array start address
1820    const Register copied_oop  = r18;       // actual oop copied
1821    const Register r19_klass   = r19;       // oop._klass
1822
1823    //---------------------------------------------------------------
1824    // Assembler stub will be used for this call to arraycopy
1825    // if the two arrays are subtypes of Object[] but the
1826    // destination array type is not equal to or a supertype
1827    // of the source type.  Each element must be separately
1828    // checked.
1829
1830    assert_different_registers(from, to, count, ckoff, ckval, start_to,
1831                               copied_oop, r19_klass, count_save);
1832
1833    __ align(CodeEntryAlignment);
1834    StubCodeMark mark(this, "StubRoutines", name);
1835    address start = __ pc();
1836
1837    __ enter(); // required for proper stackwalking of RuntimeStub frame
1838
1839#ifdef ASSERT
1840    // caller guarantees that the arrays really are different
1841    // otherwise, we would have to make conjoint checks
1842    { Label L;
1843      array_overlap_test(L, TIMES_OOP);
1844      __ stop("checkcast_copy within a single array");
1845      __ bind(L);
1846    }
1847#endif //ASSERT
1848
1849    // Caller of this entry point must set up the argument registers.
1850    if (entry != NULL) {
1851      *entry = __ pc();
1852      BLOCK_COMMENT("Entry:");
1853    }
1854
1855     // Empty array:  Nothing to do.
1856    __ cbz(count, L_done);
1857
1858    __ push(RegSet::of(r18, r19, r20, r21), sp);
1859
1860#ifdef ASSERT
1861    BLOCK_COMMENT("assert consistent ckoff/ckval");
1862    // The ckoff and ckval must be mutually consistent,
1863    // even though caller generates both.
1864    { Label L;
1865      int sco_offset = in_bytes(Klass::super_check_offset_offset());
1866      __ ldrw(start_to, Address(ckval, sco_offset));
1867      __ cmpw(ckoff, start_to);
1868      __ br(Assembler::EQ, L);
1869      __ stop("super_check_offset inconsistent");
1870      __ bind(L);
1871    }
1872#endif //ASSERT
1873
1874    gen_write_ref_array_pre_barrier(to, count, dest_uninitialized, wb_pre_saved_regs);
1875
1876    // save the original count
1877    __ mov(count_save, count);
1878
1879    // Copy from low to high addresses
1880    __ mov(start_to, to);              // Save destination array start address
1881    __ b(L_load_element);
1882
1883    // ======== begin loop ========
1884    // (Loop is rotated; its entry is L_load_element.)
1885    // Loop control:
1886    //   for (; count != 0; count--) {
1887    //     copied_oop = load_heap_oop(from++);
1888    //     ... generate_type_check ...;
1889    //     store_heap_oop(to++, copied_oop);
1890    //   }
1891    __ align(OptoLoopAlignment);
1892
1893    __ BIND(L_store_element);
1894    __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop);  // store the oop
1895    __ sub(count, count, 1);
1896    __ cbz(count, L_do_card_marks);
1897
1898    // ======== loop entry is here ========
1899    __ BIND(L_load_element);
1900    __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
1901    __ cbz(copied_oop, L_store_element);
1902
1903    __ load_klass(r19_klass, copied_oop);// query the object klass
1904    generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1905    // ======== end loop ========
1906
1907    // It was a real error; we must depend on the caller to finish the job.
1908    // Register count = remaining oops, count_orig = total oops.
1909    // Emit GC store barriers for the oops we have copied and report
1910    // their number to the caller.
1911
1912    __ subs(count, count_save, count);     // K = partially copied oop count
1913    __ eon(count, count, zr);                   // report (-1^K) to caller
1914    __ br(Assembler::EQ, L_done_pop);
1915
1916    __ BIND(L_do_card_marks);
1917    __ add(to, to, -heapOopSize);         // make an inclusive end pointer
1918    gen_write_ref_array_post_barrier(start_to, to, rscratch1, wb_post_saved_regs);
1919
1920    __ bind(L_done_pop);
1921    __ pop(RegSet::of(r18, r19, r20, r21), sp);
1922    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1923
1924    __ bind(L_done);
1925    __ mov(r0, count);
1926    __ leave();
1927    __ ret(lr);
1928
1929    return start;
1930  }
1931
1932  // Perform range checks on the proposed arraycopy.
1933  // Kills temp, but nothing else.
1934  // Also, clean the sign bits of src_pos and dst_pos.
1935  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1936                              Register src_pos, // source position (c_rarg1)
1937                              Register dst,     // destination array oo (c_rarg2)
1938                              Register dst_pos, // destination position (c_rarg3)
1939                              Register length,
1940                              Register temp,
1941                              Label& L_failed) {
1942    BLOCK_COMMENT("arraycopy_range_checks:");
1943
1944    assert_different_registers(rscratch1, temp);
1945
1946    //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1947    __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1948    __ addw(temp, length, src_pos);
1949    __ cmpw(temp, rscratch1);
1950    __ br(Assembler::HI, L_failed);
1951
1952    //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1953    __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1954    __ addw(temp, length, dst_pos);
1955    __ cmpw(temp, rscratch1);
1956    __ br(Assembler::HI, L_failed);
1957
1958    // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1959    __ movw(src_pos, src_pos);
1960    __ movw(dst_pos, dst_pos);
1961
1962    BLOCK_COMMENT("arraycopy_range_checks done");
1963  }
1964
1965  // These stubs get called from some dumb test routine.
1966  // I'll write them properly when they're called from
1967  // something that's actually doing something.
1968  static void fake_arraycopy_stub(address src, address dst, int count) {
1969    assert(count == 0, "huh?");
1970  }
1971
1972
1973  //
1974  //  Generate 'unsafe' array copy stub
1975  //  Though just as safe as the other stubs, it takes an unscaled
1976  //  size_t argument instead of an element count.
1977  //
1978  //  Input:
1979  //    c_rarg0   - source array address
1980  //    c_rarg1   - destination array address
1981  //    c_rarg2   - byte count, treated as ssize_t, can be zero
1982  //
1983  // Examines the alignment of the operands and dispatches
1984  // to a long, int, short, or byte copy loop.
1985  //
1986  address generate_unsafe_copy(const char *name,
1987                               address byte_copy_entry,
1988                               address short_copy_entry,
1989                               address int_copy_entry,
1990                               address long_copy_entry) {
1991    Label L_long_aligned, L_int_aligned, L_short_aligned;
1992    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1993
1994    __ align(CodeEntryAlignment);
1995    StubCodeMark mark(this, "StubRoutines", name);
1996    address start = __ pc();
1997    __ enter(); // required for proper stackwalking of RuntimeStub frame
1998
1999    // bump this on entry, not on exit:
2000    inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2001
2002    __ orr(rscratch1, s, d);
2003    __ orr(rscratch1, rscratch1, count);
2004
2005    __ andr(rscratch1, rscratch1, BytesPerLong-1);
2006    __ cbz(rscratch1, L_long_aligned);
2007    __ andr(rscratch1, rscratch1, BytesPerInt-1);
2008    __ cbz(rscratch1, L_int_aligned);
2009    __ tbz(rscratch1, 0, L_short_aligned);
2010    __ b(RuntimeAddress(byte_copy_entry));
2011
2012    __ BIND(L_short_aligned);
2013    __ lsr(count, count, LogBytesPerShort);  // size => short_count
2014    __ b(RuntimeAddress(short_copy_entry));
2015    __ BIND(L_int_aligned);
2016    __ lsr(count, count, LogBytesPerInt);    // size => int_count
2017    __ b(RuntimeAddress(int_copy_entry));
2018    __ BIND(L_long_aligned);
2019    __ lsr(count, count, LogBytesPerLong);   // size => long_count
2020    __ b(RuntimeAddress(long_copy_entry));
2021
2022    return start;
2023  }
2024
2025  //
2026  //  Generate generic array copy stubs
2027  //
2028  //  Input:
2029  //    c_rarg0    -  src oop
2030  //    c_rarg1    -  src_pos (32-bits)
2031  //    c_rarg2    -  dst oop
2032  //    c_rarg3    -  dst_pos (32-bits)
2033  //    c_rarg4    -  element count (32-bits)
2034  //
2035  //  Output:
2036  //    r0 ==  0  -  success
2037  //    r0 == -1^K - failure, where K is partial transfer count
2038  //
2039  address generate_generic_copy(const char *name,
2040                                address byte_copy_entry, address short_copy_entry,
2041                                address int_copy_entry, address oop_copy_entry,
2042                                address long_copy_entry, address checkcast_copy_entry) {
2043
2044    Label L_failed, L_failed_0, L_objArray;
2045    Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2046
2047    // Input registers
2048    const Register src        = c_rarg0;  // source array oop
2049    const Register src_pos    = c_rarg1;  // source position
2050    const Register dst        = c_rarg2;  // destination array oop
2051    const Register dst_pos    = c_rarg3;  // destination position
2052    const Register length     = c_rarg4;
2053
2054    StubCodeMark mark(this, "StubRoutines", name);
2055
2056    __ align(CodeEntryAlignment);
2057    address start = __ pc();
2058
2059    __ enter(); // required for proper stackwalking of RuntimeStub frame
2060
2061    // bump this on entry, not on exit:
2062    inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2063
2064    //-----------------------------------------------------------------------
2065    // Assembler stub will be used for this call to arraycopy
2066    // if the following conditions are met:
2067    //
2068    // (1) src and dst must not be null.
2069    // (2) src_pos must not be negative.
2070    // (3) dst_pos must not be negative.
2071    // (4) length  must not be negative.
2072    // (5) src klass and dst klass should be the same and not NULL.
2073    // (6) src and dst should be arrays.
2074    // (7) src_pos + length must not exceed length of src.
2075    // (8) dst_pos + length must not exceed length of dst.
2076    //
2077
2078    //  if (src == NULL) return -1;
2079    __ cbz(src, L_failed);
2080
2081    //  if (src_pos < 0) return -1;
2082    __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2083
2084    //  if (dst == NULL) return -1;
2085    __ cbz(dst, L_failed);
2086
2087    //  if (dst_pos < 0) return -1;
2088    __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2089
2090    // registers used as temp
2091    const Register scratch_length    = r16; // elements count to copy
2092    const Register scratch_src_klass = r17; // array klass
2093    const Register lh                = r18; // layout helper
2094
2095    //  if (length < 0) return -1;
2096    __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2097    __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2098
2099    __ load_klass(scratch_src_klass, src);
2100#ifdef ASSERT
2101    //  assert(src->klass() != NULL);
2102    {
2103      BLOCK_COMMENT("assert klasses not null {");
2104      Label L1, L2;
2105      __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2106      __ bind(L1);
2107      __ stop("broken null klass");
2108      __ bind(L2);
2109      __ load_klass(rscratch1, dst);
2110      __ cbz(rscratch1, L1);     // this would be broken also
2111      BLOCK_COMMENT("} assert klasses not null done");
2112    }
2113#endif
2114
2115    // Load layout helper (32-bits)
2116    //
2117    //  |array_tag|     | header_size | element_type |     |log2_element_size|
2118    // 32        30    24            16              8     2                 0
2119    //
2120    //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2121    //
2122
2123    const int lh_offset = in_bytes(Klass::layout_helper_offset());
2124
2125    // Handle objArrays completely differently...
2126    const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2127    __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2128    __ movw(rscratch1, objArray_lh);
2129    __ eorw(rscratch2, lh, rscratch1);
2130    __ cbzw(rscratch2, L_objArray);
2131
2132    //  if (src->klass() != dst->klass()) return -1;
2133    __ load_klass(rscratch2, dst);
2134    __ eor(rscratch2, rscratch2, scratch_src_klass);
2135    __ cbnz(rscratch2, L_failed);
2136
2137    //  if (!src->is_Array()) return -1;
2138    __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2139
2140    // At this point, it is known to be a typeArray (array_tag 0x3).
2141#ifdef ASSERT
2142    {
2143      BLOCK_COMMENT("assert primitive array {");
2144      Label L;
2145      __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2146      __ cmpw(lh, rscratch2);
2147      __ br(Assembler::GE, L);
2148      __ stop("must be a primitive array");
2149      __ bind(L);
2150      BLOCK_COMMENT("} assert primitive array done");
2151    }
2152#endif
2153
2154    arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2155                           rscratch2, L_failed);
2156
2157    // TypeArrayKlass
2158    //
2159    // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2160    // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2161    //
2162
2163    const Register rscratch1_offset = rscratch1;    // array offset
2164    const Register r18_elsize = lh; // element size
2165
2166    __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2167           exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2168    __ add(src, src, rscratch1_offset);           // src array offset
2169    __ add(dst, dst, rscratch1_offset);           // dst array offset
2170    BLOCK_COMMENT("choose copy loop based on element size");
2171
2172    // next registers should be set before the jump to corresponding stub
2173    const Register from     = c_rarg0;  // source array address
2174    const Register to       = c_rarg1;  // destination array address
2175    const Register count    = c_rarg2;  // elements count
2176
2177    // 'from', 'to', 'count' registers should be set in such order
2178    // since they are the same as 'src', 'src_pos', 'dst'.
2179
2180    assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2181
2182    // The possible values of elsize are 0-3, i.e. exact_log2(element
2183    // size in bytes).  We do a simple bitwise binary search.
2184  __ BIND(L_copy_bytes);
2185    __ tbnz(r18_elsize, 1, L_copy_ints);
2186    __ tbnz(r18_elsize, 0, L_copy_shorts);
2187    __ lea(from, Address(src, src_pos));// src_addr
2188    __ lea(to,   Address(dst, dst_pos));// dst_addr
2189    __ movw(count, scratch_length); // length
2190    __ b(RuntimeAddress(byte_copy_entry));
2191
2192  __ BIND(L_copy_shorts);
2193    __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2194    __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2195    __ movw(count, scratch_length); // length
2196    __ b(RuntimeAddress(short_copy_entry));
2197
2198  __ BIND(L_copy_ints);
2199    __ tbnz(r18_elsize, 0, L_copy_longs);
2200    __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2201    __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2202    __ movw(count, scratch_length); // length
2203    __ b(RuntimeAddress(int_copy_entry));
2204
2205  __ BIND(L_copy_longs);
2206#ifdef ASSERT
2207    {
2208      BLOCK_COMMENT("assert long copy {");
2209      Label L;
2210      __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2211      __ cmpw(r18_elsize, LogBytesPerLong);
2212      __ br(Assembler::EQ, L);
2213      __ stop("must be long copy, but elsize is wrong");
2214      __ bind(L);
2215      BLOCK_COMMENT("} assert long copy done");
2216    }
2217#endif
2218    __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2219    __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2220    __ movw(count, scratch_length); // length
2221    __ b(RuntimeAddress(long_copy_entry));
2222
2223    // ObjArrayKlass
2224  __ BIND(L_objArray);
2225    // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2226
2227    Label L_plain_copy, L_checkcast_copy;
2228    //  test array classes for subtyping
2229    __ load_klass(r18, dst);
2230    __ cmp(scratch_src_klass, r18); // usual case is exact equality
2231    __ br(Assembler::NE, L_checkcast_copy);
2232
2233    // Identically typed arrays can be copied without element-wise checks.
2234    arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2235                           rscratch2, L_failed);
2236
2237    __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2238    __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2239    __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2240    __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2241    __ movw(count, scratch_length); // length
2242  __ BIND(L_plain_copy);
2243    __ b(RuntimeAddress(oop_copy_entry));
2244
2245  __ BIND(L_checkcast_copy);
2246    // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2247    {
2248      // Before looking at dst.length, make sure dst is also an objArray.
2249      __ ldrw(rscratch1, Address(r18, lh_offset));
2250      __ movw(rscratch2, objArray_lh);
2251      __ eorw(rscratch1, rscratch1, rscratch2);
2252      __ cbnzw(rscratch1, L_failed);
2253
2254      // It is safe to examine both src.length and dst.length.
2255      arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2256                             r18, L_failed);
2257
2258      const Register rscratch2_dst_klass = rscratch2;
2259      __ load_klass(rscratch2_dst_klass, dst); // reload
2260
2261      // Marshal the base address arguments now, freeing registers.
2262      __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2263      __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2264      __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2265      __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2266      __ movw(count, length);           // length (reloaded)
2267      Register sco_temp = c_rarg3;      // this register is free now
2268      assert_different_registers(from, to, count, sco_temp,
2269                                 rscratch2_dst_klass, scratch_src_klass);
2270      // assert_clean_int(count, sco_temp);
2271
2272      // Generate the type check.
2273      const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2274      __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2275      // assert_clean_int(sco_temp, r18);
2276      generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);
2277
2278      // Fetch destination element klass from the ObjArrayKlass header.
2279      int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2280      __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
2281      __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2282
2283      // the checkcast_copy loop needs two extra arguments:
2284      assert(c_rarg3 == sco_temp, "#3 already in place");
2285      // Set up arguments for checkcast_copy_entry.
2286      __ mov(c_rarg4, rscratch2_dst_klass);  // dst.klass.element_klass
2287      __ b(RuntimeAddress(checkcast_copy_entry));
2288    }
2289
2290  __ BIND(L_failed);
2291    __ mov(r0, -1);
2292    __ leave();   // required for proper stackwalking of RuntimeStub frame
2293    __ ret(lr);
2294
2295    return start;
2296  }
2297
2298  //
2299  // Generate stub for array fill. If "aligned" is true, the
2300  // "to" address is assumed to be heapword aligned.
2301  //
2302  // Arguments for generated stub:
2303  //   to:    c_rarg0
2304  //   value: c_rarg1
2305  //   count: c_rarg2 treated as signed
2306  //
2307  address generate_fill(BasicType t, bool aligned, const char *name) {
2308    __ align(CodeEntryAlignment);
2309    StubCodeMark mark(this, "StubRoutines", name);
2310    address start = __ pc();
2311
2312    BLOCK_COMMENT("Entry:");
2313
2314    const Register to        = c_rarg0;  // source array address
2315    const Register value     = c_rarg1;  // value
2316    const Register count     = c_rarg2;  // elements count
2317
2318    const Register bz_base = r10;        // base for block_zero routine
2319    const Register cnt_words = r11;      // temp register
2320
2321    __ enter();
2322
2323    Label L_fill_elements, L_exit1;
2324
2325    int shift = -1;
2326    switch (t) {
2327      case T_BYTE:
2328        shift = 0;
2329        __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2330        __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2331        __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2332        __ br(Assembler::LO, L_fill_elements);
2333        break;
2334      case T_SHORT:
2335        shift = 1;
2336        __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2337        __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2338        __ br(Assembler::LO, L_fill_elements);
2339        break;
2340      case T_INT:
2341        shift = 2;
2342        __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2343        __ br(Assembler::LO, L_fill_elements);
2344        break;
2345      default: ShouldNotReachHere();
2346    }
2347
2348    // Align source address at 8 bytes address boundary.
2349    Label L_skip_align1, L_skip_align2, L_skip_align4;
2350    if (!aligned) {
2351      switch (t) {
2352        case T_BYTE:
2353          // One byte misalignment happens only for byte arrays.
2354          __ tbz(to, 0, L_skip_align1);
2355          __ strb(value, Address(__ post(to, 1)));
2356          __ subw(count, count, 1);
2357          __ bind(L_skip_align1);
2358          // Fallthrough
2359        case T_SHORT:
2360          // Two bytes misalignment happens only for byte and short (char) arrays.
2361          __ tbz(to, 1, L_skip_align2);
2362          __ strh(value, Address(__ post(to, 2)));
2363          __ subw(count, count, 2 >> shift);
2364          __ bind(L_skip_align2);
2365          // Fallthrough
2366        case T_INT:
2367          // Align to 8 bytes, we know we are 4 byte aligned to start.
2368          __ tbz(to, 2, L_skip_align4);
2369          __ strw(value, Address(__ post(to, 4)));
2370          __ subw(count, count, 4 >> shift);
2371          __ bind(L_skip_align4);
2372          break;
2373        default: ShouldNotReachHere();
2374      }
2375    }
2376
2377    //
2378    //  Fill large chunks
2379    //
2380    __ lsrw(cnt_words, count, 3 - shift); // number of words
2381    __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2382    __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2383    if (UseBlockZeroing) {
2384      Label non_block_zeroing, rest;
2385      // If the fill value is zero we can use the fast zero_words().
2386      __ cbnz(value, non_block_zeroing);
2387      __ mov(bz_base, to);
2388      __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2389      __ zero_words(bz_base, cnt_words);
2390      __ b(rest);
2391      __ bind(non_block_zeroing);
2392      __ fill_words(to, cnt_words, value);
2393      __ bind(rest);
2394    } else {
2395      __ fill_words(to, cnt_words, value);
2396    }
2397
2398    // Remaining count is less than 8 bytes. Fill it by a single store.
2399    // Note that the total length is no less than 8 bytes.
2400    if (t == T_BYTE || t == T_SHORT) {
2401      Label L_exit1;
2402      __ cbzw(count, L_exit1);
2403      __ add(to, to, count, Assembler::LSL, shift); // points to the end
2404      __ str(value, Address(to, -8));    // overwrite some elements
2405      __ bind(L_exit1);
2406      __ leave();
2407      __ ret(lr);
2408    }
2409
2410    // Handle copies less than 8 bytes.
2411    Label L_fill_2, L_fill_4, L_exit2;
2412    __ bind(L_fill_elements);
2413    switch (t) {
2414      case T_BYTE:
2415        __ tbz(count, 0, L_fill_2);
2416        __ strb(value, Address(__ post(to, 1)));
2417        __ bind(L_fill_2);
2418        __ tbz(count, 1, L_fill_4);
2419        __ strh(value, Address(__ post(to, 2)));
2420        __ bind(L_fill_4);
2421        __ tbz(count, 2, L_exit2);
2422        __ strw(value, Address(to));
2423        break;
2424      case T_SHORT:
2425        __ tbz(count, 0, L_fill_4);
2426        __ strh(value, Address(__ post(to, 2)));
2427        __ bind(L_fill_4);
2428        __ tbz(count, 1, L_exit2);
2429        __ strw(value, Address(to));
2430        break;
2431      case T_INT:
2432        __ cbzw(count, L_exit2);
2433        __ strw(value, Address(to));
2434        break;
2435      default: ShouldNotReachHere();
2436    }
2437    __ bind(L_exit2);
2438    __ leave();
2439    __ ret(lr);
2440    return start;
2441  }
2442
2443  void generate_arraycopy_stubs() {
2444    address entry;
2445    address entry_jbyte_arraycopy;
2446    address entry_jshort_arraycopy;
2447    address entry_jint_arraycopy;
2448    address entry_oop_arraycopy;
2449    address entry_jlong_arraycopy;
2450    address entry_checkcast_arraycopy;
2451
2452    generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2453    generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2454
2455    StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2456
2457    //*** jbyte
2458    // Always need aligned and unaligned versions
2459    StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2460                                                                                  "jbyte_disjoint_arraycopy");
2461    StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2462                                                                                  &entry_jbyte_arraycopy,
2463                                                                                  "jbyte_arraycopy");
2464    StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2465                                                                                  "arrayof_jbyte_disjoint_arraycopy");
2466    StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2467                                                                                  "arrayof_jbyte_arraycopy");
2468
2469    //*** jshort
2470    // Always need aligned and unaligned versions
2471    StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2472                                                                                    "jshort_disjoint_arraycopy");
2473    StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2474                                                                                    &entry_jshort_arraycopy,
2475                                                                                    "jshort_arraycopy");
2476    StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2477                                                                                    "arrayof_jshort_disjoint_arraycopy");
2478    StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2479                                                                                    "arrayof_jshort_arraycopy");
2480
2481    //*** jint
2482    // Aligned versions
2483    StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2484                                                                                "arrayof_jint_disjoint_arraycopy");
2485    StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2486                                                                                "arrayof_jint_arraycopy");
2487    // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2488    // entry_jint_arraycopy always points to the unaligned version
2489    StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2490                                                                                "jint_disjoint_arraycopy");
2491    StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2492                                                                                &entry_jint_arraycopy,
2493                                                                                "jint_arraycopy");
2494
2495    //*** jlong
2496    // It is always aligned
2497    StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2498                                                                                  "arrayof_jlong_disjoint_arraycopy");
2499    StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2500                                                                                  "arrayof_jlong_arraycopy");
2501    StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2502    StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2503
2504    //*** oops
2505    {
2506      // With compressed oops we need unaligned versions; notice that
2507      // we overwrite entry_oop_arraycopy.
2508      bool aligned = !UseCompressedOops;
2509
2510      StubRoutines::_arrayof_oop_disjoint_arraycopy
2511        = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2512                                     /*dest_uninitialized*/false);
2513      StubRoutines::_arrayof_oop_arraycopy
2514        = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2515                                     /*dest_uninitialized*/false);
2516      // Aligned versions without pre-barriers
2517      StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2518        = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2519                                     /*dest_uninitialized*/true);
2520      StubRoutines::_arrayof_oop_arraycopy_uninit
2521        = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2522                                     /*dest_uninitialized*/true);
2523    }
2524
2525    StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2526    StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2527    StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2528    StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2529
2530    StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2531    StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2532                                                                        /*dest_uninitialized*/true);
2533
2534    StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2535                                                              entry_jbyte_arraycopy,
2536                                                              entry_jshort_arraycopy,
2537                                                              entry_jint_arraycopy,
2538                                                              entry_jlong_arraycopy);
2539
2540    StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2541                                                               entry_jbyte_arraycopy,
2542                                                               entry_jshort_arraycopy,
2543                                                               entry_jint_arraycopy,
2544                                                               entry_oop_arraycopy,
2545                                                               entry_jlong_arraycopy,
2546                                                               entry_checkcast_arraycopy);
2547
2548    StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2549    StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2550    StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2551    StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2552    StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2553    StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2554  }
2555
2556  void generate_math_stubs() { Unimplemented(); }
2557
2558  // Arguments:
2559  //
2560  // Inputs:
2561  //   c_rarg0   - source byte array address
2562  //   c_rarg1   - destination byte array address
2563  //   c_rarg2   - K (key) in little endian int array
2564  //
2565  address generate_aescrypt_encryptBlock() {
2566    __ align(CodeEntryAlignment);
2567    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2568
2569    Label L_doLast;
2570
2571    const Register from        = c_rarg0;  // source array address
2572    const Register to          = c_rarg1;  // destination array address
2573    const Register key         = c_rarg2;  // key array address
2574    const Register keylen      = rscratch1;
2575
2576    address start = __ pc();
2577    __ enter();
2578
2579    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2580
2581    __ ld1(v0, __ T16B, from); // get 16 bytes of input
2582
2583    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2584    __ rev32(v1, __ T16B, v1);
2585    __ rev32(v2, __ T16B, v2);
2586    __ rev32(v3, __ T16B, v3);
2587    __ rev32(v4, __ T16B, v4);
2588    __ aese(v0, v1);
2589    __ aesmc(v0, v0);
2590    __ aese(v0, v2);
2591    __ aesmc(v0, v0);
2592    __ aese(v0, v3);
2593    __ aesmc(v0, v0);
2594    __ aese(v0, v4);
2595    __ aesmc(v0, v0);
2596
2597    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2598    __ rev32(v1, __ T16B, v1);
2599    __ rev32(v2, __ T16B, v2);
2600    __ rev32(v3, __ T16B, v3);
2601    __ rev32(v4, __ T16B, v4);
2602    __ aese(v0, v1);
2603    __ aesmc(v0, v0);
2604    __ aese(v0, v2);
2605    __ aesmc(v0, v0);
2606    __ aese(v0, v3);
2607    __ aesmc(v0, v0);
2608    __ aese(v0, v4);
2609    __ aesmc(v0, v0);
2610
2611    __ ld1(v1, v2, __ T16B, __ post(key, 32));
2612    __ rev32(v1, __ T16B, v1);
2613    __ rev32(v2, __ T16B, v2);
2614
2615    __ cmpw(keylen, 44);
2616    __ br(Assembler::EQ, L_doLast);
2617
2618    __ aese(v0, v1);
2619    __ aesmc(v0, v0);
2620    __ aese(v0, v2);
2621    __ aesmc(v0, v0);
2622
2623    __ ld1(v1, v2, __ T16B, __ post(key, 32));
2624    __ rev32(v1, __ T16B, v1);
2625    __ rev32(v2, __ T16B, v2);
2626
2627    __ cmpw(keylen, 52);
2628    __ br(Assembler::EQ, L_doLast);
2629
2630    __ aese(v0, v1);
2631    __ aesmc(v0, v0);
2632    __ aese(v0, v2);
2633    __ aesmc(v0, v0);
2634
2635    __ ld1(v1, v2, __ T16B, __ post(key, 32));
2636    __ rev32(v1, __ T16B, v1);
2637    __ rev32(v2, __ T16B, v2);
2638
2639    __ BIND(L_doLast);
2640
2641    __ aese(v0, v1);
2642    __ aesmc(v0, v0);
2643    __ aese(v0, v2);
2644
2645    __ ld1(v1, __ T16B, key);
2646    __ rev32(v1, __ T16B, v1);
2647    __ eor(v0, __ T16B, v0, v1);
2648
2649    __ st1(v0, __ T16B, to);
2650
2651    __ mov(r0, 0);
2652
2653    __ leave();
2654    __ ret(lr);
2655
2656    return start;
2657  }
2658
2659  // Arguments:
2660  //
2661  // Inputs:
2662  //   c_rarg0   - source byte array address
2663  //   c_rarg1   - destination byte array address
2664  //   c_rarg2   - K (key) in little endian int array
2665  //
2666  address generate_aescrypt_decryptBlock() {
2667    assert(UseAES, "need AES instructions and misaligned SSE support");
2668    __ align(CodeEntryAlignment);
2669    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2670    Label L_doLast;
2671
2672    const Register from        = c_rarg0;  // source array address
2673    const Register to          = c_rarg1;  // destination array address
2674    const Register key         = c_rarg2;  // key array address
2675    const Register keylen      = rscratch1;
2676
2677    address start = __ pc();
2678    __ enter(); // required for proper stackwalking of RuntimeStub frame
2679
2680    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2681
2682    __ ld1(v0, __ T16B, from); // get 16 bytes of input
2683
2684    __ ld1(v5, __ T16B, __ post(key, 16));
2685    __ rev32(v5, __ T16B, v5);
2686
2687    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2688    __ rev32(v1, __ T16B, v1);
2689    __ rev32(v2, __ T16B, v2);
2690    __ rev32(v3, __ T16B, v3);
2691    __ rev32(v4, __ T16B, v4);
2692    __ aesd(v0, v1);
2693    __ aesimc(v0, v0);
2694    __ aesd(v0, v2);
2695    __ aesimc(v0, v0);
2696    __ aesd(v0, v3);
2697    __ aesimc(v0, v0);
2698    __ aesd(v0, v4);
2699    __ aesimc(v0, v0);
2700
2701    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2702    __ rev32(v1, __ T16B, v1);
2703    __ rev32(v2, __ T16B, v2);
2704    __ rev32(v3, __ T16B, v3);
2705    __ rev32(v4, __ T16B, v4);
2706    __ aesd(v0, v1);
2707    __ aesimc(v0, v0);
2708    __ aesd(v0, v2);
2709    __ aesimc(v0, v0);
2710    __ aesd(v0, v3);
2711    __ aesimc(v0, v0);
2712    __ aesd(v0, v4);
2713    __ aesimc(v0, v0);
2714
2715    __ ld1(v1, v2, __ T16B, __ post(key, 32));
2716    __ rev32(v1, __ T16B, v1);
2717    __ rev32(v2, __ T16B, v2);
2718
2719    __ cmpw(keylen, 44);
2720    __ br(Assembler::EQ, L_doLast);
2721
2722    __ aesd(v0, v1);
2723    __ aesimc(v0, v0);
2724    __ aesd(v0, v2);
2725    __ aesimc(v0, v0);
2726
2727    __ ld1(v1, v2, __ T16B, __ post(key, 32));
2728    __ rev32(v1, __ T16B, v1);
2729    __ rev32(v2, __ T16B, v2);
2730
2731    __ cmpw(keylen, 52);
2732    __ br(Assembler::EQ, L_doLast);
2733
2734    __ aesd(v0, v1);
2735    __ aesimc(v0, v0);
2736    __ aesd(v0, v2);
2737    __ aesimc(v0, v0);
2738
2739    __ ld1(v1, v2, __ T16B, __ post(key, 32));
2740    __ rev32(v1, __ T16B, v1);
2741    __ rev32(v2, __ T16B, v2);
2742
2743    __ BIND(L_doLast);
2744
2745    __ aesd(v0, v1);
2746    __ aesimc(v0, v0);
2747    __ aesd(v0, v2);
2748
2749    __ eor(v0, __ T16B, v0, v5);
2750
2751    __ st1(v0, __ T16B, to);
2752
2753    __ mov(r0, 0);
2754
2755    __ leave();
2756    __ ret(lr);
2757
2758    return start;
2759  }
2760
2761  // Arguments:
2762  //
2763  // Inputs:
2764  //   c_rarg0   - source byte array address
2765  //   c_rarg1   - destination byte array address
2766  //   c_rarg2   - K (key) in little endian int array
2767  //   c_rarg3   - r vector byte array address
2768  //   c_rarg4   - input length
2769  //
2770  // Output:
2771  //   x0        - input length
2772  //
2773  address generate_cipherBlockChaining_encryptAESCrypt() {
2774    assert(UseAES, "need AES instructions and misaligned SSE support");
2775    __ align(CodeEntryAlignment);
2776    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2777
2778    Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2779
2780    const Register from        = c_rarg0;  // source array address
2781    const Register to          = c_rarg1;  // destination array address
2782    const Register key         = c_rarg2;  // key array address
2783    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2784                                           // and left with the results of the last encryption block
2785    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2786    const Register keylen      = rscratch1;
2787
2788    address start = __ pc();
2789
2790      __ enter();
2791
2792      __ movw(rscratch2, len_reg);
2793
2794      __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2795
2796      __ ld1(v0, __ T16B, rvec);
2797
2798      __ cmpw(keylen, 52);
2799      __ br(Assembler::CC, L_loadkeys_44);
2800      __ br(Assembler::EQ, L_loadkeys_52);
2801
2802      __ ld1(v17, v18, __ T16B, __ post(key, 32));
2803      __ rev32(v17, __ T16B, v17);
2804      __ rev32(v18, __ T16B, v18);
2805    __ BIND(L_loadkeys_52);
2806      __ ld1(v19, v20, __ T16B, __ post(key, 32));
2807      __ rev32(v19, __ T16B, v19);
2808      __ rev32(v20, __ T16B, v20);
2809    __ BIND(L_loadkeys_44);
2810      __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2811      __ rev32(v21, __ T16B, v21);
2812      __ rev32(v22, __ T16B, v22);
2813      __ rev32(v23, __ T16B, v23);
2814      __ rev32(v24, __ T16B, v24);
2815      __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2816      __ rev32(v25, __ T16B, v25);
2817      __ rev32(v26, __ T16B, v26);
2818      __ rev32(v27, __ T16B, v27);
2819      __ rev32(v28, __ T16B, v28);
2820      __ ld1(v29, v30, v31, __ T16B, key);
2821      __ rev32(v29, __ T16B, v29);
2822      __ rev32(v30, __ T16B, v30);
2823      __ rev32(v31, __ T16B, v31);
2824
2825    __ BIND(L_aes_loop);
2826      __ ld1(v1, __ T16B, __ post(from, 16));
2827      __ eor(v0, __ T16B, v0, v1);
2828
2829      __ br(Assembler::CC, L_rounds_44);
2830      __ br(Assembler::EQ, L_rounds_52);
2831
2832      __ aese(v0, v17); __ aesmc(v0, v0);
2833      __ aese(v0, v18); __ aesmc(v0, v0);
2834    __ BIND(L_rounds_52);
2835      __ aese(v0, v19); __ aesmc(v0, v0);
2836      __ aese(v0, v20); __ aesmc(v0, v0);
2837    __ BIND(L_rounds_44);
2838      __ aese(v0, v21); __ aesmc(v0, v0);
2839      __ aese(v0, v22); __ aesmc(v0, v0);
2840      __ aese(v0, v23); __ aesmc(v0, v0);
2841      __ aese(v0, v24); __ aesmc(v0, v0);
2842      __ aese(v0, v25); __ aesmc(v0, v0);
2843      __ aese(v0, v26); __ aesmc(v0, v0);
2844      __ aese(v0, v27); __ aesmc(v0, v0);
2845      __ aese(v0, v28); __ aesmc(v0, v0);
2846      __ aese(v0, v29); __ aesmc(v0, v0);
2847      __ aese(v0, v30);
2848      __ eor(v0, __ T16B, v0, v31);
2849
2850      __ st1(v0, __ T16B, __ post(to, 16));
2851
2852      __ subw(len_reg, len_reg, 16);
2853      __ cbnzw(len_reg, L_aes_loop);
2854
2855      __ st1(v0, __ T16B, rvec);
2856
2857      __ mov(r0, rscratch2);
2858
2859      __ leave();
2860      __ ret(lr);
2861
2862      return start;
2863  }
2864
2865  // Arguments:
2866  //
2867  // Inputs:
2868  //   c_rarg0   - source byte array address
2869  //   c_rarg1   - destination byte array address
2870  //   c_rarg2   - K (key) in little endian int array
2871  //   c_rarg3   - r vector byte array address
2872  //   c_rarg4   - input length
2873  //
2874  // Output:
2875  //   r0        - input length
2876  //
2877  address generate_cipherBlockChaining_decryptAESCrypt() {
2878    assert(UseAES, "need AES instructions and misaligned SSE support");
2879    __ align(CodeEntryAlignment);
2880    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2881
2882    Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2883
2884    const Register from        = c_rarg0;  // source array address
2885    const Register to          = c_rarg1;  // destination array address
2886    const Register key         = c_rarg2;  // key array address
2887    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2888                                           // and left with the results of the last encryption block
2889    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2890    const Register keylen      = rscratch1;
2891
2892    address start = __ pc();
2893
2894      __ enter();
2895
2896      __ movw(rscratch2, len_reg);
2897
2898      __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2899
2900      __ ld1(v2, __ T16B, rvec);
2901
2902      __ ld1(v31, __ T16B, __ post(key, 16));
2903      __ rev32(v31, __ T16B, v31);
2904
2905      __ cmpw(keylen, 52);
2906      __ br(Assembler::CC, L_loadkeys_44);
2907      __ br(Assembler::EQ, L_loadkeys_52);
2908
2909      __ ld1(v17, v18, __ T16B, __ post(key, 32));
2910      __ rev32(v17, __ T16B, v17);
2911      __ rev32(v18, __ T16B, v18);
2912    __ BIND(L_loadkeys_52);
2913      __ ld1(v19, v20, __ T16B, __ post(key, 32));
2914      __ rev32(v19, __ T16B, v19);
2915      __ rev32(v20, __ T16B, v20);
2916    __ BIND(L_loadkeys_44);
2917      __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2918      __ rev32(v21, __ T16B, v21);
2919      __ rev32(v22, __ T16B, v22);
2920      __ rev32(v23, __ T16B, v23);
2921      __ rev32(v24, __ T16B, v24);
2922      __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2923      __ rev32(v25, __ T16B, v25);
2924      __ rev32(v26, __ T16B, v26);
2925      __ rev32(v27, __ T16B, v27);
2926      __ rev32(v28, __ T16B, v28);
2927      __ ld1(v29, v30, __ T16B, key);
2928      __ rev32(v29, __ T16B, v29);
2929      __ rev32(v30, __ T16B, v30);
2930
2931    __ BIND(L_aes_loop);
2932      __ ld1(v0, __ T16B, __ post(from, 16));
2933      __ orr(v1, __ T16B, v0, v0);
2934
2935      __ br(Assembler::CC, L_rounds_44);
2936      __ br(Assembler::EQ, L_rounds_52);
2937
2938      __ aesd(v0, v17); __ aesimc(v0, v0);
2939      __ aesd(v0, v18); __ aesimc(v0, v0);
2940    __ BIND(L_rounds_52);
2941      __ aesd(v0, v19); __ aesimc(v0, v0);
2942      __ aesd(v0, v20); __ aesimc(v0, v0);
2943    __ BIND(L_rounds_44);
2944      __ aesd(v0, v21); __ aesimc(v0, v0);
2945      __ aesd(v0, v22); __ aesimc(v0, v0);
2946      __ aesd(v0, v23); __ aesimc(v0, v0);
2947      __ aesd(v0, v24); __ aesimc(v0, v0);
2948      __ aesd(v0, v25); __ aesimc(v0, v0);
2949      __ aesd(v0, v26); __ aesimc(v0, v0);
2950      __ aesd(v0, v27); __ aesimc(v0, v0);
2951      __ aesd(v0, v28); __ aesimc(v0, v0);
2952      __ aesd(v0, v29); __ aesimc(v0, v0);
2953      __ aesd(v0, v30);
2954      __ eor(v0, __ T16B, v0, v31);
2955      __ eor(v0, __ T16B, v0, v2);
2956
2957      __ st1(v0, __ T16B, __ post(to, 16));
2958      __ orr(v2, __ T16B, v1, v1);
2959
2960      __ subw(len_reg, len_reg, 16);
2961      __ cbnzw(len_reg, L_aes_loop);
2962
2963      __ st1(v2, __ T16B, rvec);
2964
2965      __ mov(r0, rscratch2);
2966
2967      __ leave();
2968      __ ret(lr);
2969
2970    return start;
2971  }
2972
2973  // Arguments:
2974  //
2975  // Inputs:
2976  //   c_rarg0   - byte[]  source+offset
2977  //   c_rarg1   - int[]   SHA.state
2978  //   c_rarg2   - int     offset
2979  //   c_rarg3   - int     limit
2980  //
2981  address generate_sha1_implCompress(bool multi_block, const char *name) {
2982    __ align(CodeEntryAlignment);
2983    StubCodeMark mark(this, "StubRoutines", name);
2984    address start = __ pc();
2985
2986    Register buf   = c_rarg0;
2987    Register state = c_rarg1;
2988    Register ofs   = c_rarg2;
2989    Register limit = c_rarg3;
2990
2991    Label keys;
2992    Label sha1_loop;
2993
2994    // load the keys into v0..v3
2995    __ adr(rscratch1, keys);
2996    __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2997    // load 5 words state into v6, v7
2998    __ ldrq(v6, Address(state, 0));
2999    __ ldrs(v7, Address(state, 16));
3000
3001
3002    __ BIND(sha1_loop);
3003    // load 64 bytes of data into v16..v19
3004    __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3005    __ rev32(v16, __ T16B, v16);
3006    __ rev32(v17, __ T16B, v17);
3007    __ rev32(v18, __ T16B, v18);
3008    __ rev32(v19, __ T16B, v19);
3009
3010    // do the sha1
3011    __ addv(v4, __ T4S, v16, v0);
3012    __ orr(v20, __ T16B, v6, v6);
3013
3014    FloatRegister d0 = v16;
3015    FloatRegister d1 = v17;
3016    FloatRegister d2 = v18;
3017    FloatRegister d3 = v19;
3018
3019    for (int round = 0; round < 20; round++) {
3020      FloatRegister tmp1 = (round & 1) ? v4 : v5;
3021      FloatRegister tmp2 = (round & 1) ? v21 : v22;
3022      FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3023      FloatRegister tmp4 = (round & 1) ? v5 : v4;
3024      FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3025
3026      if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3027      if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3028      __ sha1h(tmp2, __ T4S, v20);
3029      if (round < 5)
3030        __ sha1c(v20, __ T4S, tmp3, tmp4);
3031      else if (round < 10 || round >= 15)
3032        __ sha1p(v20, __ T4S, tmp3, tmp4);
3033      else
3034        __ sha1m(v20, __ T4S, tmp3, tmp4);
3035      if (round < 16) __ sha1su1(d0, __ T4S, d3);
3036
3037      tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3038    }
3039
3040    __ addv(v7, __ T2S, v7, v21);
3041    __ addv(v6, __ T4S, v6, v20);
3042
3043    if (multi_block) {
3044      __ add(ofs, ofs, 64);
3045      __ cmp(ofs, limit);
3046      __ br(Assembler::LE, sha1_loop);
3047      __ mov(c_rarg0, ofs); // return ofs
3048    }
3049
3050    __ strq(v6, Address(state, 0));
3051    __ strs(v7, Address(state, 16));
3052
3053    __ ret(lr);
3054
3055    __ bind(keys);
3056    __ emit_int32(0x5a827999);
3057    __ emit_int32(0x6ed9eba1);
3058    __ emit_int32(0x8f1bbcdc);
3059    __ emit_int32(0xca62c1d6);
3060
3061    return start;
3062  }
3063
3064
3065  // Arguments:
3066  //
3067  // Inputs:
3068  //   c_rarg0   - byte[]  source+offset
3069  //   c_rarg1   - int[]   SHA.state
3070  //   c_rarg2   - int     offset
3071  //   c_rarg3   - int     limit
3072  //
3073  address generate_sha256_implCompress(bool multi_block, const char *name) {
3074    static const uint32_t round_consts[64] = {
3075      0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3076      0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3077      0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3078      0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3079      0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3080      0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3081      0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3082      0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3083      0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3084      0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3085      0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3086      0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3087      0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3088      0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3089      0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3090      0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3091    };
3092    __ align(CodeEntryAlignment);
3093    StubCodeMark mark(this, "StubRoutines", name);
3094    address start = __ pc();
3095
3096    Register buf   = c_rarg0;
3097    Register state = c_rarg1;
3098    Register ofs   = c_rarg2;
3099    Register limit = c_rarg3;
3100
3101    Label sha1_loop;
3102
3103    __ stpd(v8, v9, __ pre(sp, -32));
3104    __ stpd(v10, v11, Address(sp, 16));
3105
3106// dga == v0
3107// dgb == v1
3108// dg0 == v2
3109// dg1 == v3
3110// dg2 == v4
3111// t0 == v6
3112// t1 == v7
3113
3114    // load 16 keys to v16..v31
3115    __ lea(rscratch1, ExternalAddress((address)round_consts));
3116    __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3117    __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3118    __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3119    __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3120
3121    // load 8 words (256 bits) state
3122    __ ldpq(v0, v1, state);
3123
3124    __ BIND(sha1_loop);
3125    // load 64 bytes of data into v8..v11
3126    __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3127    __ rev32(v8, __ T16B, v8);
3128    __ rev32(v9, __ T16B, v9);
3129    __ rev32(v10, __ T16B, v10);
3130    __ rev32(v11, __ T16B, v11);
3131
3132    __ addv(v6, __ T4S, v8, v16);
3133    __ orr(v2, __ T16B, v0, v0);
3134    __ orr(v3, __ T16B, v1, v1);
3135
3136    FloatRegister d0 = v8;
3137    FloatRegister d1 = v9;
3138    FloatRegister d2 = v10;
3139    FloatRegister d3 = v11;
3140
3141
3142    for (int round = 0; round < 16; round++) {
3143      FloatRegister tmp1 = (round & 1) ? v6 : v7;
3144      FloatRegister tmp2 = (round & 1) ? v7 : v6;
3145      FloatRegister tmp3 = (round & 1) ? v2 : v4;
3146      FloatRegister tmp4 = (round & 1) ? v4 : v2;
3147
3148      if (round < 12) __ sha256su0(d0, __ T4S, d1);
3149       __ orr(v4, __ T16B, v2, v2);
3150      if (round < 15)
3151        __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3152      __ sha256h(v2, __ T4S, v3, tmp2);
3153      __ sha256h2(v3, __ T4S, v4, tmp2);
3154      if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3155
3156      tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3157    }
3158
3159    __ addv(v0, __ T4S, v0, v2);
3160    __ addv(v1, __ T4S, v1, v3);
3161
3162    if (multi_block) {
3163      __ add(ofs, ofs, 64);
3164      __ cmp(ofs, limit);
3165      __ br(Assembler::LE, sha1_loop);
3166      __ mov(c_rarg0, ofs); // return ofs
3167    }
3168
3169    __ ldpd(v10, v11, Address(sp, 16));
3170    __ ldpd(v8, v9, __ post(sp, 32));
3171
3172    __ stpq(v0, v1, state);
3173
3174    __ ret(lr);
3175
3176    return start;
3177  }
3178
3179#ifndef BUILTIN_SIM
3180  // Safefetch stubs.
3181  void generate_safefetch(const char* name, int size, address* entry,
3182                          address* fault_pc, address* continuation_pc) {
3183    // safefetch signatures:
3184    //   int      SafeFetch32(int*      adr, int      errValue);
3185    //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3186    //
3187    // arguments:
3188    //   c_rarg0 = adr
3189    //   c_rarg1 = errValue
3190    //
3191    // result:
3192    //   PPC_RET  = *adr or errValue
3193
3194    StubCodeMark mark(this, "StubRoutines", name);
3195
3196    // Entry point, pc or function descriptor.
3197    *entry = __ pc();
3198
3199    // Load *adr into c_rarg1, may fault.
3200    *fault_pc = __ pc();
3201    switch (size) {
3202      case 4:
3203        // int32_t
3204        __ ldrw(c_rarg1, Address(c_rarg0, 0));
3205        break;
3206      case 8:
3207        // int64_t
3208        __ ldr(c_rarg1, Address(c_rarg0, 0));
3209        break;
3210      default:
3211        ShouldNotReachHere();
3212    }
3213
3214    // return errValue or *adr
3215    *continuation_pc = __ pc();
3216    __ mov(r0, c_rarg1);
3217    __ ret(lr);
3218  }
3219#endif
3220
3221  /**
3222   *  Arguments:
3223   *
3224   * Inputs:
3225   *   c_rarg0   - int crc
3226   *   c_rarg1   - byte* buf
3227   *   c_rarg2   - int length
3228   *
3229   * Ouput:
3230   *       rax   - int crc result
3231   */
3232  address generate_updateBytesCRC32() {
3233    assert(UseCRC32Intrinsics, "what are we doing here?");
3234
3235    __ align(CodeEntryAlignment);
3236    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3237
3238    address start = __ pc();
3239
3240    const Register crc   = c_rarg0;  // crc
3241    const Register buf   = c_rarg1;  // source java byte array address
3242    const Register len   = c_rarg2;  // length
3243    const Register table0 = c_rarg3; // crc_table address
3244    const Register table1 = c_rarg4;
3245    const Register table2 = c_rarg5;
3246    const Register table3 = c_rarg6;
3247    const Register tmp3 = c_rarg7;
3248
3249    BLOCK_COMMENT("Entry:");
3250    __ enter(); // required for proper stackwalking of RuntimeStub frame
3251
3252    __ kernel_crc32(crc, buf, len,
3253              table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3254
3255    __ leave(); // required for proper stackwalking of RuntimeStub frame
3256    __ ret(lr);
3257
3258    return start;
3259  }
3260
3261  /**
3262   *  Arguments:
3263   *
3264   * Inputs:
3265   *   c_rarg0   - int crc
3266   *   c_rarg1   - byte* buf
3267   *   c_rarg2   - int length
3268   *   c_rarg3   - int* table
3269   *
3270   * Ouput:
3271   *       r0   - int crc result
3272   */
3273  address generate_updateBytesCRC32C() {
3274    assert(UseCRC32CIntrinsics, "what are we doing here?");
3275
3276    __ align(CodeEntryAlignment);
3277    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3278
3279    address start = __ pc();
3280
3281    const Register crc   = c_rarg0;  // crc
3282    const Register buf   = c_rarg1;  // source java byte array address
3283    const Register len   = c_rarg2;  // length
3284    const Register table0 = c_rarg3; // crc_table address
3285    const Register table1 = c_rarg4;
3286    const Register table2 = c_rarg5;
3287    const Register table3 = c_rarg6;
3288    const Register tmp3 = c_rarg7;
3289
3290    BLOCK_COMMENT("Entry:");
3291    __ enter(); // required for proper stackwalking of RuntimeStub frame
3292
3293    __ kernel_crc32c(crc, buf, len,
3294              table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3295
3296    __ leave(); // required for proper stackwalking of RuntimeStub frame
3297    __ ret(lr);
3298
3299    return start;
3300  }
3301
3302  /***
3303   *  Arguments:
3304   *
3305   *  Inputs:
3306   *   c_rarg0   - int   adler
3307   *   c_rarg1   - byte* buff
3308   *   c_rarg2   - int   len
3309   *
3310   * Output:
3311   *   c_rarg0   - int adler result
3312   */
3313  address generate_updateBytesAdler32() {
3314    __ align(CodeEntryAlignment);
3315    StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3316    address start = __ pc();
3317
3318    Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3319
3320    // Aliases
3321    Register adler  = c_rarg0;
3322    Register s1     = c_rarg0;
3323    Register s2     = c_rarg3;
3324    Register buff   = c_rarg1;
3325    Register len    = c_rarg2;
3326    Register nmax  = r4;
3327    Register base = r5;
3328    Register count = r6;
3329    Register temp0 = rscratch1;
3330    Register temp1 = rscratch2;
3331    Register temp2 = r7;
3332
3333    // Max number of bytes we can process before having to take the mod
3334    // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3335    unsigned long BASE = 0xfff1;
3336    unsigned long NMAX = 0x15B0;
3337
3338    __ mov(base, BASE);
3339    __ mov(nmax, NMAX);
3340
3341    // s1 is initialized to the lower 16 bits of adler
3342    // s2 is initialized to the upper 16 bits of adler
3343    __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3344    __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3345
3346    // The pipelined loop needs at least 16 elements for 1 iteration
3347    // It does check this, but it is more effective to skip to the cleanup loop
3348    __ cmp(len, 16);
3349    __ br(Assembler::HS, L_nmax);
3350    __ cbz(len, L_combine);
3351
3352    __ bind(L_simple_by1_loop);
3353    __ ldrb(temp0, Address(__ post(buff, 1)));
3354    __ add(s1, s1, temp0);
3355    __ add(s2, s2, s1);
3356    __ subs(len, len, 1);
3357    __ br(Assembler::HI, L_simple_by1_loop);
3358
3359    // s1 = s1 % BASE
3360    __ subs(temp0, s1, base);
3361    __ csel(s1, temp0, s1, Assembler::HS);
3362
3363    // s2 = s2 % BASE
3364    __ lsr(temp0, s2, 16);
3365    __ lsl(temp1, temp0, 4);
3366    __ sub(temp1, temp1, temp0);
3367    __ add(s2, temp1, s2, ext::uxth);
3368
3369    __ subs(temp0, s2, base);
3370    __ csel(s2, temp0, s2, Assembler::HS);
3371
3372    __ b(L_combine);
3373
3374    __ bind(L_nmax);
3375    __ subs(len, len, nmax);
3376    __ sub(count, nmax, 16);
3377    __ br(Assembler::LO, L_by16);
3378
3379    __ bind(L_nmax_loop);
3380
3381    __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3382
3383    __ add(s1, s1, temp0, ext::uxtb);
3384    __ ubfx(temp2, temp0, 8, 8);
3385    __ add(s2, s2, s1);
3386    __ add(s1, s1, temp2);
3387    __ ubfx(temp2, temp0, 16, 8);
3388    __ add(s2, s2, s1);
3389    __ add(s1, s1, temp2);
3390    __ ubfx(temp2, temp0, 24, 8);
3391    __ add(s2, s2, s1);
3392    __ add(s1, s1, temp2);
3393    __ ubfx(temp2, temp0, 32, 8);
3394    __ add(s2, s2, s1);
3395    __ add(s1, s1, temp2);
3396    __ ubfx(temp2, temp0, 40, 8);
3397    __ add(s2, s2, s1);
3398    __ add(s1, s1, temp2);
3399    __ ubfx(temp2, temp0, 48, 8);
3400    __ add(s2, s2, s1);
3401    __ add(s1, s1, temp2);
3402    __ add(s2, s2, s1);
3403    __ add(s1, s1, temp0, Assembler::LSR, 56);
3404    __ add(s2, s2, s1);
3405
3406    __ add(s1, s1, temp1, ext::uxtb);
3407    __ ubfx(temp2, temp1, 8, 8);
3408    __ add(s2, s2, s1);
3409    __ add(s1, s1, temp2);
3410    __ ubfx(temp2, temp1, 16, 8);
3411    __ add(s2, s2, s1);
3412    __ add(s1, s1, temp2);
3413    __ ubfx(temp2, temp1, 24, 8);
3414    __ add(s2, s2, s1);
3415    __ add(s1, s1, temp2);
3416    __ ubfx(temp2, temp1, 32, 8);
3417    __ add(s2, s2, s1);
3418    __ add(s1, s1, temp2);
3419    __ ubfx(temp2, temp1, 40, 8);
3420    __ add(s2, s2, s1);
3421    __ add(s1, s1, temp2);
3422    __ ubfx(temp2, temp1, 48, 8);
3423    __ add(s2, s2, s1);
3424    __ add(s1, s1, temp2);
3425    __ add(s2, s2, s1);
3426    __ add(s1, s1, temp1, Assembler::LSR, 56);
3427    __ add(s2, s2, s1);
3428
3429    __ subs(count, count, 16);
3430    __ br(Assembler::HS, L_nmax_loop);
3431
3432    // s1 = s1 % BASE
3433    __ lsr(temp0, s1, 16);
3434    __ lsl(temp1, temp0, 4);
3435    __ sub(temp1, temp1, temp0);
3436    __ add(temp1, temp1, s1, ext::uxth);
3437
3438    __ lsr(temp0, temp1, 16);
3439    __ lsl(s1, temp0, 4);
3440    __ sub(s1, s1, temp0);
3441    __ add(s1, s1, temp1, ext:: uxth);
3442
3443    __ subs(temp0, s1, base);
3444    __ csel(s1, temp0, s1, Assembler::HS);
3445
3446    // s2 = s2 % BASE
3447    __ lsr(temp0, s2, 16);
3448    __ lsl(temp1, temp0, 4);
3449    __ sub(temp1, temp1, temp0);
3450    __ add(temp1, temp1, s2, ext::uxth);
3451
3452    __ lsr(temp0, temp1, 16);
3453    __ lsl(s2, temp0, 4);
3454    __ sub(s2, s2, temp0);
3455    __ add(s2, s2, temp1, ext:: uxth);
3456
3457    __ subs(temp0, s2, base);
3458    __ csel(s2, temp0, s2, Assembler::HS);
3459
3460    __ subs(len, len, nmax);
3461    __ sub(count, nmax, 16);
3462    __ br(Assembler::HS, L_nmax_loop);
3463
3464    __ bind(L_by16);
3465    __ adds(len, len, count);
3466    __ br(Assembler::LO, L_by1);
3467
3468    __ bind(L_by16_loop);
3469
3470    __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3471
3472    __ add(s1, s1, temp0, ext::uxtb);
3473    __ ubfx(temp2, temp0, 8, 8);
3474    __ add(s2, s2, s1);
3475    __ add(s1, s1, temp2);
3476    __ ubfx(temp2, temp0, 16, 8);
3477    __ add(s2, s2, s1);
3478    __ add(s1, s1, temp2);
3479    __ ubfx(temp2, temp0, 24, 8);
3480    __ add(s2, s2, s1);
3481    __ add(s1, s1, temp2);
3482    __ ubfx(temp2, temp0, 32, 8);
3483    __ add(s2, s2, s1);
3484    __ add(s1, s1, temp2);
3485    __ ubfx(temp2, temp0, 40, 8);
3486    __ add(s2, s2, s1);
3487    __ add(s1, s1, temp2);
3488    __ ubfx(temp2, temp0, 48, 8);
3489    __ add(s2, s2, s1);
3490    __ add(s1, s1, temp2);
3491    __ add(s2, s2, s1);
3492    __ add(s1, s1, temp0, Assembler::LSR, 56);
3493    __ add(s2, s2, s1);
3494
3495    __ add(s1, s1, temp1, ext::uxtb);
3496    __ ubfx(temp2, temp1, 8, 8);
3497    __ add(s2, s2, s1);
3498    __ add(s1, s1, temp2);
3499    __ ubfx(temp2, temp1, 16, 8);
3500    __ add(s2, s2, s1);
3501    __ add(s1, s1, temp2);
3502    __ ubfx(temp2, temp1, 24, 8);
3503    __ add(s2, s2, s1);
3504    __ add(s1, s1, temp2);
3505    __ ubfx(temp2, temp1, 32, 8);
3506    __ add(s2, s2, s1);
3507    __ add(s1, s1, temp2);
3508    __ ubfx(temp2, temp1, 40, 8);
3509    __ add(s2, s2, s1);
3510    __ add(s1, s1, temp2);
3511    __ ubfx(temp2, temp1, 48, 8);
3512    __ add(s2, s2, s1);
3513    __ add(s1, s1, temp2);
3514    __ add(s2, s2, s1);
3515    __ add(s1, s1, temp1, Assembler::LSR, 56);
3516    __ add(s2, s2, s1);
3517
3518    __ subs(len, len, 16);
3519    __ br(Assembler::HS, L_by16_loop);
3520
3521    __ bind(L_by1);
3522    __ adds(len, len, 15);
3523    __ br(Assembler::LO, L_do_mod);
3524
3525    __ bind(L_by1_loop);
3526    __ ldrb(temp0, Address(__ post(buff, 1)));
3527    __ add(s1, temp0, s1);
3528    __ add(s2, s2, s1);
3529    __ subs(len, len, 1);
3530    __ br(Assembler::HS, L_by1_loop);
3531
3532    __ bind(L_do_mod);
3533    // s1 = s1 % BASE
3534    __ lsr(temp0, s1, 16);
3535    __ lsl(temp1, temp0, 4);
3536    __ sub(temp1, temp1, temp0);
3537    __ add(temp1, temp1, s1, ext::uxth);
3538
3539    __ lsr(temp0, temp1, 16);
3540    __ lsl(s1, temp0, 4);
3541    __ sub(s1, s1, temp0);
3542    __ add(s1, s1, temp1, ext:: uxth);
3543
3544    __ subs(temp0, s1, base);
3545    __ csel(s1, temp0, s1, Assembler::HS);
3546
3547    // s2 = s2 % BASE
3548    __ lsr(temp0, s2, 16);
3549    __ lsl(temp1, temp0, 4);
3550    __ sub(temp1, temp1, temp0);
3551    __ add(temp1, temp1, s2, ext::uxth);
3552
3553    __ lsr(temp0, temp1, 16);
3554    __ lsl(s2, temp0, 4);
3555    __ sub(s2, s2, temp0);
3556    __ add(s2, s2, temp1, ext:: uxth);
3557
3558    __ subs(temp0, s2, base);
3559    __ csel(s2, temp0, s2, Assembler::HS);
3560
3561    // Combine lower bits and higher bits
3562    __ bind(L_combine);
3563    __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3564
3565    __ ret(lr);
3566
3567    return start;
3568  }
3569
3570  /**
3571   *  Arguments:
3572   *
3573   *  Input:
3574   *    c_rarg0   - x address
3575   *    c_rarg1   - x length
3576   *    c_rarg2   - y address
3577   *    c_rarg3   - y lenth
3578   *    c_rarg4   - z address
3579   *    c_rarg5   - z length
3580   */
3581  address generate_multiplyToLen() {
3582    __ align(CodeEntryAlignment);
3583    StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3584
3585    address start = __ pc();
3586    const Register x     = r0;
3587    const Register xlen  = r1;
3588    const Register y     = r2;
3589    const Register ylen  = r3;
3590    const Register z     = r4;
3591    const Register zlen  = r5;
3592
3593    const Register tmp1  = r10;
3594    const Register tmp2  = r11;
3595    const Register tmp3  = r12;
3596    const Register tmp4  = r13;
3597    const Register tmp5  = r14;
3598    const Register tmp6  = r15;
3599    const Register tmp7  = r16;
3600
3601    BLOCK_COMMENT("Entry:");
3602    __ enter(); // required for proper stackwalking of RuntimeStub frame
3603    __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3604    __ leave(); // required for proper stackwalking of RuntimeStub frame
3605    __ ret(lr);
3606
3607    return start;
3608  }
3609
3610  void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3611                      FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3612                      FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3613    // Karatsuba multiplication performs a 128*128 -> 256-bit
3614    // multiplication in three 128-bit multiplications and a few
3615    // additions.
3616    //
3617    // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3618    // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3619    //
3620    // Inputs:
3621    //
3622    // A0 in a.d[0]     (subkey)
3623    // A1 in a.d[1]
3624    // (A1+A0) in a1_xor_a0.d[0]
3625    //
3626    // B0 in b.d[0]     (state)
3627    // B1 in b.d[1]
3628
3629    __ ext(tmp1, __ T16B, b, b, 0x08);
3630    __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3631    __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3632    __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3633    __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3634
3635    __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3636    __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3637    __ eor(tmp2, __ T16B, tmp2, tmp4);
3638    __ eor(tmp2, __ T16B, tmp2, tmp3);
3639
3640    // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3641    __ ins(result_hi, __ D, tmp2, 0, 1);
3642    __ ins(result_lo, __ D, tmp2, 1, 0);
3643  }
3644
3645  void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3646                    FloatRegister p, FloatRegister z, FloatRegister t1) {
3647    const FloatRegister t0 = result;
3648
3649    // The GCM field polynomial f is z^128 + p(z), where p =
3650    // z^7+z^2+z+1.
3651    //
3652    //    z^128 === -p(z)  (mod (z^128 + p(z)))
3653    //
3654    // so, given that the product we're reducing is
3655    //    a == lo + hi * z^128
3656    // substituting,
3657    //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3658    //
3659    // we reduce by multiplying hi by p(z) and subtracting the result
3660    // from (i.e. XORing it with) lo.  Because p has no nonzero high
3661    // bits we can do this with two 64-bit multiplications, lo*p and
3662    // hi*p.
3663
3664    __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3665    __ ext(t1, __ T16B, t0, z, 8);
3666    __ eor(hi, __ T16B, hi, t1);
3667    __ ext(t1, __ T16B, z, t0, 8);
3668    __ eor(lo, __ T16B, lo, t1);
3669    __ pmull(t0, __ T1Q, hi, p, __ T1D);
3670    __ eor(result, __ T16B, lo, t0);
3671  }
3672
3673  address generate_has_negatives(address &has_negatives_long) {
3674    StubCodeMark mark(this, "StubRoutines", "has_negatives");
3675    const int large_loop_size = 64;
3676    const uint64_t UPPER_BIT_MASK=0x8080808080808080;
3677    int dcache_line = VM_Version::dcache_line_size();
3678
3679    Register ary1 = r1, len = r2, result = r0;
3680
3681    __ align(CodeEntryAlignment);
3682    address entry = __ pc();
3683
3684    __ enter();
3685
3686  Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
3687        LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
3688
3689  __ cmp(len, 15);
3690  __ br(Assembler::GT, LEN_OVER_15);
3691  // The only case when execution falls into this code is when pointer is near
3692  // the end of memory page and we have to avoid reading next page
3693  __ add(ary1, ary1, len);
3694  __ subs(len, len, 8);
3695  __ br(Assembler::GT, LEN_OVER_8);
3696  __ ldr(rscratch2, Address(ary1, -8));
3697  __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
3698  __ lsrv(rscratch2, rscratch2, rscratch1);
3699  __ tst(rscratch2, UPPER_BIT_MASK);
3700  __ cset(result, Assembler::NE);
3701  __ leave();
3702  __ ret(lr);
3703  __ bind(LEN_OVER_8);
3704  __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3705  __ sub(len, len, 8); // no data dep., then sub can be executed while loading
3706  __ tst(rscratch2, UPPER_BIT_MASK);
3707  __ br(Assembler::NE, RET_TRUE_NO_POP);
3708  __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3709  __ lsrv(rscratch1, rscratch1, rscratch2);
3710  __ tst(rscratch1, UPPER_BIT_MASK);
3711  __ cset(result, Assembler::NE);
3712  __ leave();
3713  __ ret(lr);
3714
3715  Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3716  const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3717
3718  has_negatives_long = __ pc(); // 2nd entry point
3719
3720  __ enter();
3721
3722  __ bind(LEN_OVER_15);
3723    __ push(spilled_regs, sp);
3724    __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3725    __ cbz(rscratch2, ALIGNED);
3726    __ ldp(tmp6, tmp1, Address(ary1));
3727    __ mov(tmp5, 16);
3728    __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
3729    __ add(ary1, ary1, rscratch1);
3730    __ sub(len, len, rscratch1);
3731    __ orr(tmp6, tmp6, tmp1);
3732    __ tst(tmp6, UPPER_BIT_MASK);
3733    __ br(Assembler::NE, RET_TRUE);
3734
3735  __ bind(ALIGNED);
3736    __ cmp(len, large_loop_size);
3737    __ br(Assembler::LT, CHECK_16);
3738    // Perform 16-byte load as early return in pre-loop to handle situation
3739    // when initially aligned large array has negative values at starting bytes,
3740    // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is
3741    // slower. Cases with negative bytes further ahead won't be affected that
3742    // much. In fact, it'll be faster due to early loads, less instructions and
3743    // less branches in LARGE_LOOP.
3744    __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3745    __ sub(len, len, 16);
3746    __ orr(tmp6, tmp6, tmp1);
3747    __ tst(tmp6, UPPER_BIT_MASK);
3748    __ br(Assembler::NE, RET_TRUE);
3749    __ cmp(len, large_loop_size);
3750    __ br(Assembler::LT, CHECK_16);
3751
3752    if (SoftwarePrefetchHintDistance >= 0
3753        && SoftwarePrefetchHintDistance >= dcache_line) {
3754      // initial prefetch
3755      __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3756    }
3757  __ bind(LARGE_LOOP);
3758    if (SoftwarePrefetchHintDistance >= 0) {
3759      __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3760    }
3761    // Issue load instructions first, since it can save few CPU/MEM cycles, also
3762    // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp)
3763    // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3
3764    // instructions per cycle and have less branches, but this approach disables
3765    // early return, thus, all 64 bytes are loaded and checked every time.
3766    __ ldp(tmp2, tmp3, Address(ary1));
3767    __ ldp(tmp4, tmp5, Address(ary1, 16));
3768    __ ldp(rscratch1, rscratch2, Address(ary1, 32));
3769    __ ldp(tmp6, tmp1, Address(ary1, 48));
3770    __ add(ary1, ary1, large_loop_size);
3771    __ sub(len, len, large_loop_size);
3772    __ orr(tmp2, tmp2, tmp3);
3773    __ orr(tmp4, tmp4, tmp5);
3774    __ orr(rscratch1, rscratch1, rscratch2);
3775    __ orr(tmp6, tmp6, tmp1);
3776    __ orr(tmp2, tmp2, tmp4);
3777    __ orr(rscratch1, rscratch1, tmp6);
3778    __ orr(tmp2, tmp2, rscratch1);
3779    __ tst(tmp2, UPPER_BIT_MASK);
3780    __ br(Assembler::NE, RET_TRUE);
3781    __ cmp(len, large_loop_size);
3782    __ br(Assembler::GE, LARGE_LOOP);
3783
3784  __ bind(CHECK_16); // small 16-byte load pre-loop
3785    __ cmp(len, 16);
3786    __ br(Assembler::LT, POST_LOOP16);
3787
3788  __ bind(LOOP16); // small 16-byte load loop
3789    __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
3790    __ sub(len, len, 16);
3791    __ orr(tmp2, tmp2, tmp3);
3792    __ tst(tmp2, UPPER_BIT_MASK);
3793    __ br(Assembler::NE, RET_TRUE);
3794    __ cmp(len, 16);
3795    __ br(Assembler::GE, LOOP16); // 16-byte load loop end
3796
3797  __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
3798    __ cmp(len, 8);
3799    __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
3800    __ ldr(tmp3, Address(__ post(ary1, 8)));
3801    __ sub(len, len, 8);
3802    __ tst(tmp3, UPPER_BIT_MASK);
3803    __ br(Assembler::NE, RET_TRUE);
3804
3805  __ bind(POST_LOOP16_LOAD_TAIL);
3806    __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
3807    __ ldr(tmp1, Address(ary1));
3808    __ mov(tmp2, 64);
3809    __ sub(tmp4, tmp2, len, __ LSL, 3);
3810    __ lslv(tmp1, tmp1, tmp4);
3811    __ tst(tmp1, UPPER_BIT_MASK);
3812    __ br(Assembler::NE, RET_TRUE);
3813    // Fallthrough
3814
3815  __ bind(RET_FALSE);
3816    __ pop(spilled_regs, sp);
3817    __ leave();
3818    __ mov(result, zr);
3819    __ ret(lr);
3820
3821  __ bind(RET_TRUE);
3822    __ pop(spilled_regs, sp);
3823  __ bind(RET_TRUE_NO_POP);
3824    __ leave();
3825    __ mov(result, 1);
3826    __ ret(lr);
3827
3828  __ bind(DONE);
3829    __ pop(spilled_regs, sp);
3830    __ leave();
3831    __ ret(lr);
3832    return entry;
3833  }
3834  /**
3835   *  Arguments:
3836   *
3837   *  Input:
3838   *  c_rarg0   - current state address
3839   *  c_rarg1   - H key address
3840   *  c_rarg2   - data address
3841   *  c_rarg3   - number of blocks
3842   *
3843   *  Output:
3844   *  Updated state at c_rarg0
3845   */
3846  address generate_ghash_processBlocks() {
3847    // Bafflingly, GCM uses little-endian for the byte order, but
3848    // big-endian for the bit order.  For example, the polynomial 1 is
3849    // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
3850    //
3851    // So, we must either reverse the bytes in each word and do
3852    // everything big-endian or reverse the bits in each byte and do
3853    // it little-endian.  On AArch64 it's more idiomatic to reverse
3854    // the bits in each byte (we have an instruction, RBIT, to do
3855    // that) and keep the data in little-endian bit order throught the
3856    // calculation, bit-reversing the inputs and outputs.
3857
3858    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
3859    __ align(wordSize * 2);
3860    address p = __ pc();
3861    __ emit_int64(0x87);  // The low-order bits of the field
3862                          // polynomial (i.e. p = z^7+z^2+z+1)
3863                          // repeated in the low and high parts of a
3864                          // 128-bit vector
3865    __ emit_int64(0x87);
3866
3867    __ align(CodeEntryAlignment);
3868    address start = __ pc();
3869
3870    Register state   = c_rarg0;
3871    Register subkeyH = c_rarg1;
3872    Register data    = c_rarg2;
3873    Register blocks  = c_rarg3;
3874
3875    FloatRegister vzr = v30;
3876    __ eor(vzr, __ T16B, vzr, vzr); // zero register
3877
3878    __ ldrq(v0, Address(state));
3879    __ ldrq(v1, Address(subkeyH));
3880
3881    __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
3882    __ rbit(v0, __ T16B, v0);
3883    __ rev64(v1, __ T16B, v1);
3884    __ rbit(v1, __ T16B, v1);
3885
3886    __ ldrq(v26, p);
3887
3888    __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
3889    __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
3890
3891    {
3892      Label L_ghash_loop;
3893      __ bind(L_ghash_loop);
3894
3895      __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
3896                                                 // reversing each byte
3897      __ rbit(v2, __ T16B, v2);
3898      __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
3899
3900      // Multiply state in v2 by subkey in v1
3901      ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
3902                     /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
3903                     /*temps*/v6, v20, v18, v21);
3904      // Reduce v7:v5 by the field polynomial
3905      ghash_reduce(v0, v5, v7, v26, vzr, v20);
3906
3907      __ sub(blocks, blocks, 1);
3908      __ cbnz(blocks, L_ghash_loop);
3909    }
3910
3911    // The bit-reversed result is at this point in v0
3912    __ rev64(v1, __ T16B, v0);
3913    __ rbit(v1, __ T16B, v1);
3914
3915    __ st1(v1, __ T16B, state);
3916    __ ret(lr);
3917
3918    return start;
3919  }
3920
3921  // Continuation point for throwing of implicit exceptions that are
3922  // not handled in the current activation. Fabricates an exception
3923  // oop and initiates normal exception dispatching in this
3924  // frame. Since we need to preserve callee-saved values (currently
3925  // only for C2, but done for C1 as well) we need a callee-saved oop
3926  // map and therefore have to make these stubs into RuntimeStubs
3927  // rather than BufferBlobs.  If the compiler needs all registers to
3928  // be preserved between the fault point and the exception handler
3929  // then it must assume responsibility for that in
3930  // AbstractCompiler::continuation_for_implicit_null_exception or
3931  // continuation_for_implicit_division_by_zero_exception. All other
3932  // implicit exceptions (e.g., NullPointerException or
3933  // AbstractMethodError on entry) are either at call sites or
3934  // otherwise assume that stack unwinding will be initiated, so
3935  // caller saved registers were assumed volatile in the compiler.
3936
3937#undef __
3938#define __ masm->
3939
3940  address generate_throw_exception(const char* name,
3941                                   address runtime_entry,
3942                                   Register arg1 = noreg,
3943                                   Register arg2 = noreg) {
3944    // Information about frame layout at time of blocking runtime call.
3945    // Note that we only have to preserve callee-saved registers since
3946    // the compilers are responsible for supplying a continuation point
3947    // if they expect all registers to be preserved.
3948    // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
3949    enum layout {
3950      rfp_off = 0,
3951      rfp_off2,
3952      return_off,
3953      return_off2,
3954      framesize // inclusive of return address
3955    };
3956
3957    int insts_size = 512;
3958    int locs_size  = 64;
3959
3960    CodeBuffer code(name, insts_size, locs_size);
3961    OopMapSet* oop_maps  = new OopMapSet();
3962    MacroAssembler* masm = new MacroAssembler(&code);
3963
3964    address start = __ pc();
3965
3966    // This is an inlined and slightly modified version of call_VM
3967    // which has the ability to fetch the return PC out of
3968    // thread-local storage and also sets up last_Java_sp slightly
3969    // differently than the real call_VM
3970
3971    __ enter(); // Save FP and LR before call
3972
3973    assert(is_even(framesize/2), "sp not 16-byte aligned");
3974
3975    // lr and fp are already in place
3976    __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
3977
3978    int frame_complete = __ pc() - start;
3979
3980    // Set up last_Java_sp and last_Java_fp
3981    address the_pc = __ pc();
3982    __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
3983
3984    // Call runtime
3985    if (arg1 != noreg) {
3986      assert(arg2 != c_rarg1, "clobbered");
3987      __ mov(c_rarg1, arg1);
3988    }
3989    if (arg2 != noreg) {
3990      __ mov(c_rarg2, arg2);
3991    }
3992    __ mov(c_rarg0, rthread);
3993    BLOCK_COMMENT("call runtime_entry");
3994    __ mov(rscratch1, runtime_entry);
3995    __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
3996
3997    // Generate oop map
3998    OopMap* map = new OopMap(framesize, 0);
3999
4000    oop_maps->add_gc_map(the_pc - start, map);
4001
4002    __ reset_last_Java_frame(true);
4003    __ maybe_isb();
4004
4005    __ leave();
4006
4007    // check for pending exceptions
4008#ifdef ASSERT
4009    Label L;
4010    __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
4011    __ cbnz(rscratch1, L);
4012    __ should_not_reach_here();
4013    __ bind(L);
4014#endif // ASSERT
4015    __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
4016
4017
4018    // codeBlob framesize is in words (not VMRegImpl::slot_size)
4019    RuntimeStub* stub =
4020      RuntimeStub::new_runtime_stub(name,
4021                                    &code,
4022                                    frame_complete,
4023                                    (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4024                                    oop_maps, false);
4025    return stub->entry_point();
4026  }
4027
4028  class MontgomeryMultiplyGenerator : public MacroAssembler {
4029
4030    Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
4031      Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
4032
4033    RegSet _toSave;
4034    bool _squaring;
4035
4036  public:
4037    MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
4038      : MacroAssembler(as->code()), _squaring(squaring) {
4039
4040      // Register allocation
4041
4042      Register reg = c_rarg0;
4043      Pa_base = reg;       // Argument registers
4044      if (squaring)
4045        Pb_base = Pa_base;
4046      else
4047        Pb_base = ++reg;
4048      Pn_base = ++reg;
4049      Rlen= ++reg;
4050      inv = ++reg;
4051      Pm_base = ++reg;
4052
4053                          // Working registers:
4054      Ra =  ++reg;        // The current digit of a, b, n, and m.
4055      Rb =  ++reg;
4056      Rm =  ++reg;
4057      Rn =  ++reg;
4058
4059      Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
4060      Pb =  ++reg;
4061      Pm =  ++reg;
4062      Pn =  ++reg;
4063
4064      t0 =  ++reg;        // Three registers which form a
4065      t1 =  ++reg;        // triple-precision accumuator.
4066      t2 =  ++reg;
4067
4068      Ri =  ++reg;        // Inner and outer loop indexes.
4069      Rj =  ++reg;
4070
4071      Rhi_ab = ++reg;     // Product registers: low and high parts
4072      Rlo_ab = ++reg;     // of a*b and m*n.
4073      Rhi_mn = ++reg;
4074      Rlo_mn = ++reg;
4075
4076      // r19 and up are callee-saved.
4077      _toSave = RegSet::range(r19, reg) + Pm_base;
4078    }
4079
4080  private:
4081    void save_regs() {
4082      push(_toSave, sp);
4083    }
4084
4085    void restore_regs() {
4086      pop(_toSave, sp);
4087    }
4088
4089    template <typename T>
4090    void unroll_2(Register count, T block) {
4091      Label loop, end, odd;
4092      tbnz(count, 0, odd);
4093      cbz(count, end);
4094      align(16);
4095      bind(loop);
4096      (this->*block)();
4097      bind(odd);
4098      (this->*block)();
4099      subs(count, count, 2);
4100      br(Assembler::GT, loop);
4101      bind(end);
4102    }
4103
4104    template <typename T>
4105    void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
4106      Label loop, end, odd;
4107      tbnz(count, 0, odd);
4108      cbz(count, end);
4109      align(16);
4110      bind(loop);
4111      (this->*block)(d, s, tmp);
4112      bind(odd);
4113      (this->*block)(d, s, tmp);
4114      subs(count, count, 2);
4115      br(Assembler::GT, loop);
4116      bind(end);
4117    }
4118
4119    void pre1(RegisterOrConstant i) {
4120      block_comment("pre1");
4121      // Pa = Pa_base;
4122      // Pb = Pb_base + i;
4123      // Pm = Pm_base;
4124      // Pn = Pn_base + i;
4125      // Ra = *Pa;
4126      // Rb = *Pb;
4127      // Rm = *Pm;
4128      // Rn = *Pn;
4129      ldr(Ra, Address(Pa_base));
4130      ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4131      ldr(Rm, Address(Pm_base));
4132      ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4133      lea(Pa, Address(Pa_base));
4134      lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4135      lea(Pm, Address(Pm_base));
4136      lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4137
4138      // Zero the m*n result.
4139      mov(Rhi_mn, zr);
4140      mov(Rlo_mn, zr);
4141    }
4142
4143    // The core multiply-accumulate step of a Montgomery
4144    // multiplication.  The idea is to schedule operations as a
4145    // pipeline so that instructions with long latencies (loads and
4146    // multiplies) have time to complete before their results are
4147    // used.  This most benefits in-order implementations of the
4148    // architecture but out-of-order ones also benefit.
4149    void step() {
4150      block_comment("step");
4151      // MACC(Ra, Rb, t0, t1, t2);
4152      // Ra = *++Pa;
4153      // Rb = *--Pb;
4154      umulh(Rhi_ab, Ra, Rb);
4155      mul(Rlo_ab, Ra, Rb);
4156      ldr(Ra, pre(Pa, wordSize));
4157      ldr(Rb, pre(Pb, -wordSize));
4158      acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
4159                                       // previous iteration.
4160      // MACC(Rm, Rn, t0, t1, t2);
4161      // Rm = *++Pm;
4162      // Rn = *--Pn;
4163      umulh(Rhi_mn, Rm, Rn);
4164      mul(Rlo_mn, Rm, Rn);
4165      ldr(Rm, pre(Pm, wordSize));
4166      ldr(Rn, pre(Pn, -wordSize));
4167      acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4168    }
4169
4170    void post1() {
4171      block_comment("post1");
4172
4173      // MACC(Ra, Rb, t0, t1, t2);
4174      // Ra = *++Pa;
4175      // Rb = *--Pb;
4176      umulh(Rhi_ab, Ra, Rb);
4177      mul(Rlo_ab, Ra, Rb);
4178      acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4179      acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4180
4181      // *Pm = Rm = t0 * inv;
4182      mul(Rm, t0, inv);
4183      str(Rm, Address(Pm));
4184
4185      // MACC(Rm, Rn, t0, t1, t2);
4186      // t0 = t1; t1 = t2; t2 = 0;
4187      umulh(Rhi_mn, Rm, Rn);
4188
4189#ifndef PRODUCT
4190      // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
4191      {
4192        mul(Rlo_mn, Rm, Rn);
4193        add(Rlo_mn, t0, Rlo_mn);
4194        Label ok;
4195        cbz(Rlo_mn, ok); {
4196          stop("broken Montgomery multiply");
4197        } bind(ok);
4198      }
4199#endif
4200      // We have very carefully set things up so that
4201      // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
4202      // the lower half of Rm * Rn because we know the result already:
4203      // it must be -t0.  t0 + (-t0) must generate a carry iff
4204      // t0 != 0.  So, rather than do a mul and an adds we just set
4205      // the carry flag iff t0 is nonzero.
4206      //
4207      // mul(Rlo_mn, Rm, Rn);
4208      // adds(zr, t0, Rlo_mn);
4209      subs(zr, t0, 1); // Set carry iff t0 is nonzero
4210      adcs(t0, t1, Rhi_mn);
4211      adc(t1, t2, zr);
4212      mov(t2, zr);
4213    }
4214
4215    void pre2(RegisterOrConstant i, RegisterOrConstant len) {
4216      block_comment("pre2");
4217      // Pa = Pa_base + i-len;
4218      // Pb = Pb_base + len;
4219      // Pm = Pm_base + i-len;
4220      // Pn = Pn_base + len;
4221
4222      if (i.is_register()) {
4223        sub(Rj, i.as_register(), len);
4224      } else {
4225        mov(Rj, i.as_constant());
4226        sub(Rj, Rj, len);
4227      }
4228      // Rj == i-len
4229
4230      lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
4231      lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
4232      lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
4233      lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
4234
4235      // Ra = *++Pa;
4236      // Rb = *--Pb;
4237      // Rm = *++Pm;
4238      // Rn = *--Pn;
4239      ldr(Ra, pre(Pa, wordSize));
4240      ldr(Rb, pre(Pb, -wordSize));
4241      ldr(Rm, pre(Pm, wordSize));
4242      ldr(Rn, pre(Pn, -wordSize));
4243
4244      mov(Rhi_mn, zr);
4245      mov(Rlo_mn, zr);
4246    }
4247
4248    void post2(RegisterOrConstant i, RegisterOrConstant len) {
4249      block_comment("post2");
4250      if (i.is_constant()) {
4251        mov(Rj, i.as_constant()-len.as_constant());
4252      } else {
4253        sub(Rj, i.as_register(), len);
4254      }
4255
4256      adds(t0, t0, Rlo_mn); // The pending m*n, low part
4257
4258      // As soon as we know the least significant digit of our result,
4259      // store it.
4260      // Pm_base[i-len] = t0;
4261      str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
4262
4263      // t0 = t1; t1 = t2; t2 = 0;
4264      adcs(t0, t1, Rhi_mn); // The pending m*n, high part
4265      adc(t1, t2, zr);
4266      mov(t2, zr);
4267    }
4268
4269    // A carry in t0 after Montgomery multiplication means that we
4270    // should subtract multiples of n from our result in m.  We'll
4271    // keep doing that until there is no carry.
4272    void normalize(RegisterOrConstant len) {
4273      block_comment("normalize");
4274      // while (t0)
4275      //   t0 = sub(Pm_base, Pn_base, t0, len);
4276      Label loop, post, again;
4277      Register cnt = t1, i = t2; // Re-use registers; we're done with them now
4278      cbz(t0, post); {
4279        bind(again); {
4280          mov(i, zr);
4281          mov(cnt, len);
4282          ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4283          ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4284          subs(zr, zr, zr); // set carry flag, i.e. no borrow
4285          align(16);
4286          bind(loop); {
4287            sbcs(Rm, Rm, Rn);
4288            str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4289            add(i, i, 1);
4290            ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4291            ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4292            sub(cnt, cnt, 1);
4293          } cbnz(cnt, loop);
4294          sbc(t0, t0, zr);
4295        } cbnz(t0, again);
4296      } bind(post);
4297    }
4298
4299    // Move memory at s to d, reversing words.
4300    //    Increments d to end of copied memory
4301    //    Destroys tmp1, tmp2
4302    //    Preserves len
4303    //    Leaves s pointing to the address which was in d at start
4304    void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
4305      assert(tmp1 < r19 && tmp2 < r19, "register corruption");
4306
4307      lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
4308      mov(tmp1, len);
4309      unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
4310      sub(s, d, len, ext::uxtw, LogBytesPerWord);
4311    }
4312    // where
4313    void reverse1(Register d, Register s, Register tmp) {
4314      ldr(tmp, pre(s, -wordSize));
4315      ror(tmp, tmp, 32);
4316      str(tmp, post(d, wordSize));
4317    }
4318
4319    void step_squaring() {
4320      // An extra ACC
4321      step();
4322      acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4323    }
4324
4325    void last_squaring(RegisterOrConstant i) {
4326      Label dont;
4327      // if ((i & 1) == 0) {
4328      tbnz(i.as_register(), 0, dont); {
4329        // MACC(Ra, Rb, t0, t1, t2);
4330        // Ra = *++Pa;
4331        // Rb = *--Pb;
4332        umulh(Rhi_ab, Ra, Rb);
4333        mul(Rlo_ab, Ra, Rb);
4334        acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4335      } bind(dont);
4336    }
4337
4338    void extra_step_squaring() {
4339      acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4340
4341      // MACC(Rm, Rn, t0, t1, t2);
4342      // Rm = *++Pm;
4343      // Rn = *--Pn;
4344      umulh(Rhi_mn, Rm, Rn);
4345      mul(Rlo_mn, Rm, Rn);
4346      ldr(Rm, pre(Pm, wordSize));
4347      ldr(Rn, pre(Pn, -wordSize));
4348    }
4349
4350    void post1_squaring() {
4351      acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4352
4353      // *Pm = Rm = t0 * inv;
4354      mul(Rm, t0, inv);
4355      str(Rm, Address(Pm));
4356
4357      // MACC(Rm, Rn, t0, t1, t2);
4358      // t0 = t1; t1 = t2; t2 = 0;
4359      umulh(Rhi_mn, Rm, Rn);
4360
4361#ifndef PRODUCT
4362      // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
4363      {
4364        mul(Rlo_mn, Rm, Rn);
4365        add(Rlo_mn, t0, Rlo_mn);
4366        Label ok;
4367        cbz(Rlo_mn, ok); {
4368          stop("broken Montgomery multiply");
4369        } bind(ok);
4370      }
4371#endif
4372      // We have very carefully set things up so that
4373      // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
4374      // the lower half of Rm * Rn because we know the result already:
4375      // it must be -t0.  t0 + (-t0) must generate a carry iff
4376      // t0 != 0.  So, rather than do a mul and an adds we just set
4377      // the carry flag iff t0 is nonzero.
4378      //
4379      // mul(Rlo_mn, Rm, Rn);
4380      // adds(zr, t0, Rlo_mn);
4381      subs(zr, t0, 1); // Set carry iff t0 is nonzero
4382      adcs(t0, t1, Rhi_mn);
4383      adc(t1, t2, zr);
4384      mov(t2, zr);
4385    }
4386
4387    void acc(Register Rhi, Register Rlo,
4388             Register t0, Register t1, Register t2) {
4389      adds(t0, t0, Rlo);
4390      adcs(t1, t1, Rhi);
4391      adc(t2, t2, zr);
4392    }
4393
4394  public:
4395    /**
4396     * Fast Montgomery multiplication.  The derivation of the
4397     * algorithm is in A Cryptographic Library for the Motorola
4398     * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
4399     *
4400     * Arguments:
4401     *
4402     * Inputs for multiplication:
4403     *   c_rarg0   - int array elements a
4404     *   c_rarg1   - int array elements b
4405     *   c_rarg2   - int array elements n (the modulus)
4406     *   c_rarg3   - int length
4407     *   c_rarg4   - int inv
4408     *   c_rarg5   - int array elements m (the result)
4409     *
4410     * Inputs for squaring:
4411     *   c_rarg0   - int array elements a
4412     *   c_rarg1   - int array elements n (the modulus)
4413     *   c_rarg2   - int length
4414     *   c_rarg3   - int inv
4415     *   c_rarg4   - int array elements m (the result)
4416     *
4417     */
4418    address generate_multiply() {
4419      Label argh, nothing;
4420      bind(argh);
4421      stop("MontgomeryMultiply total_allocation must be <= 8192");
4422
4423      align(CodeEntryAlignment);
4424      address entry = pc();
4425
4426      cbzw(Rlen, nothing);
4427
4428      enter();
4429
4430      // Make room.
4431      cmpw(Rlen, 512);
4432      br(Assembler::HI, argh);
4433      sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
4434      andr(sp, Ra, -2 * wordSize);
4435
4436      lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
4437
4438      {
4439        // Copy input args, reversing as we go.  We use Ra as a
4440        // temporary variable.
4441        reverse(Ra, Pa_base, Rlen, t0, t1);
4442        if (!_squaring)
4443          reverse(Ra, Pb_base, Rlen, t0, t1);
4444        reverse(Ra, Pn_base, Rlen, t0, t1);
4445      }
4446
4447      // Push all call-saved registers and also Pm_base which we'll need
4448      // at the end.
4449      save_regs();
4450
4451#ifndef PRODUCT
4452      // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
4453      {
4454        ldr(Rn, Address(Pn_base, 0));
4455        mul(Rlo_mn, Rn, inv);
4456        cmp(Rlo_mn, -1);
4457        Label ok;
4458        br(EQ, ok); {
4459          stop("broken inverse in Montgomery multiply");
4460        } bind(ok);
4461      }
4462#endif
4463
4464      mov(Pm_base, Ra);
4465
4466      mov(t0, zr);
4467      mov(t1, zr);
4468      mov(t2, zr);
4469
4470      block_comment("for (int i = 0; i < len; i++) {");
4471      mov(Ri, zr); {
4472        Label loop, end;
4473        cmpw(Ri, Rlen);
4474        br(Assembler::GE, end);
4475
4476        bind(loop);
4477        pre1(Ri);
4478
4479        block_comment("  for (j = i; j; j--) {"); {
4480          movw(Rj, Ri);
4481          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
4482        } block_comment("  } // j");
4483
4484        post1();
4485        addw(Ri, Ri, 1);
4486        cmpw(Ri, Rlen);
4487        br(Assembler::LT, loop);
4488        bind(end);
4489        block_comment("} // i");
4490      }
4491
4492      block_comment("for (int i = len; i < 2*len; i++) {");
4493      mov(Ri, Rlen); {
4494        Label loop, end;
4495        cmpw(Ri, Rlen, Assembler::LSL, 1);
4496        br(Assembler::GE, end);
4497
4498        bind(loop);
4499        pre2(Ri, Rlen);
4500
4501        block_comment("  for (j = len*2-i-1; j; j--) {"); {
4502          lslw(Rj, Rlen, 1);
4503          subw(Rj, Rj, Ri);
4504          subw(Rj, Rj, 1);
4505          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
4506        } block_comment("  } // j");
4507
4508        post2(Ri, Rlen);
4509        addw(Ri, Ri, 1);
4510        cmpw(Ri, Rlen, Assembler::LSL, 1);
4511        br(Assembler::LT, loop);
4512        bind(end);
4513      }
4514      block_comment("} // i");
4515
4516      normalize(Rlen);
4517
4518      mov(Ra, Pm_base);  // Save Pm_base in Ra
4519      restore_regs();  // Restore caller's Pm_base
4520
4521      // Copy our result into caller's Pm_base
4522      reverse(Pm_base, Ra, Rlen, t0, t1);
4523
4524      leave();
4525      bind(nothing);
4526      ret(lr);
4527
4528      return entry;
4529    }
4530    // In C, approximately:
4531
4532    // void
4533    // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
4534    //                     unsigned long Pn_base[], unsigned long Pm_base[],
4535    //                     unsigned long inv, int len) {
4536    //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4537    //   unsigned long *Pa, *Pb, *Pn, *Pm;
4538    //   unsigned long Ra, Rb, Rn, Rm;
4539
4540    //   int i;
4541
4542    //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4543
4544    //   for (i = 0; i < len; i++) {
4545    //     int j;
4546
4547    //     Pa = Pa_base;
4548    //     Pb = Pb_base + i;
4549    //     Pm = Pm_base;
4550    //     Pn = Pn_base + i;
4551
4552    //     Ra = *Pa;
4553    //     Rb = *Pb;
4554    //     Rm = *Pm;
4555    //     Rn = *Pn;
4556
4557    //     int iters = i;
4558    //     for (j = 0; iters--; j++) {
4559    //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
4560    //       MACC(Ra, Rb, t0, t1, t2);
4561    //       Ra = *++Pa;
4562    //       Rb = *--Pb;
4563    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4564    //       MACC(Rm, Rn, t0, t1, t2);
4565    //       Rm = *++Pm;
4566    //       Rn = *--Pn;
4567    //     }
4568
4569    //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
4570    //     MACC(Ra, Rb, t0, t1, t2);
4571    //     *Pm = Rm = t0 * inv;
4572    //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4573    //     MACC(Rm, Rn, t0, t1, t2);
4574
4575    //     assert(t0 == 0, "broken Montgomery multiply");
4576
4577    //     t0 = t1; t1 = t2; t2 = 0;
4578    //   }
4579
4580    //   for (i = len; i < 2*len; i++) {
4581    //     int j;
4582
4583    //     Pa = Pa_base + i-len;
4584    //     Pb = Pb_base + len;
4585    //     Pm = Pm_base + i-len;
4586    //     Pn = Pn_base + len;
4587
4588    //     Ra = *++Pa;
4589    //     Rb = *--Pb;
4590    //     Rm = *++Pm;
4591    //     Rn = *--Pn;
4592
4593    //     int iters = len*2-i-1;
4594    //     for (j = i-len+1; iters--; j++) {
4595    //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
4596    //       MACC(Ra, Rb, t0, t1, t2);
4597    //       Ra = *++Pa;
4598    //       Rb = *--Pb;
4599    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4600    //       MACC(Rm, Rn, t0, t1, t2);
4601    //       Rm = *++Pm;
4602    //       Rn = *--Pn;
4603    //     }
4604
4605    //     Pm_base[i-len] = t0;
4606    //     t0 = t1; t1 = t2; t2 = 0;
4607    //   }
4608
4609    //   while (t0)
4610    //     t0 = sub(Pm_base, Pn_base, t0, len);
4611    // }
4612
4613    /**
4614     * Fast Montgomery squaring.  This uses asymptotically 25% fewer
4615     * multiplies than Montgomery multiplication so it should be up to
4616     * 25% faster.  However, its loop control is more complex and it
4617     * may actually run slower on some machines.
4618     *
4619     * Arguments:
4620     *
4621     * Inputs:
4622     *   c_rarg0   - int array elements a
4623     *   c_rarg1   - int array elements n (the modulus)
4624     *   c_rarg2   - int length
4625     *   c_rarg3   - int inv
4626     *   c_rarg4   - int array elements m (the result)
4627     *
4628     */
4629    address generate_square() {
4630      Label argh;
4631      bind(argh);
4632      stop("MontgomeryMultiply total_allocation must be <= 8192");
4633
4634      align(CodeEntryAlignment);
4635      address entry = pc();
4636
4637      enter();
4638
4639      // Make room.
4640      cmpw(Rlen, 512);
4641      br(Assembler::HI, argh);
4642      sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
4643      andr(sp, Ra, -2 * wordSize);
4644
4645      lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
4646
4647      {
4648        // Copy input args, reversing as we go.  We use Ra as a
4649        // temporary variable.
4650        reverse(Ra, Pa_base, Rlen, t0, t1);
4651        reverse(Ra, Pn_base, Rlen, t0, t1);
4652      }
4653
4654      // Push all call-saved registers and also Pm_base which we'll need
4655      // at the end.
4656      save_regs();
4657
4658      mov(Pm_base, Ra);
4659
4660      mov(t0, zr);
4661      mov(t1, zr);
4662      mov(t2, zr);
4663
4664      block_comment("for (int i = 0; i < len; i++) {");
4665      mov(Ri, zr); {
4666        Label loop, end;
4667        bind(loop);
4668        cmp(Ri, Rlen);
4669        br(Assembler::GE, end);
4670
4671        pre1(Ri);
4672
4673        block_comment("for (j = (i+1)/2; j; j--) {"); {
4674          add(Rj, Ri, 1);
4675          lsr(Rj, Rj, 1);
4676          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4677        } block_comment("  } // j");
4678
4679        last_squaring(Ri);
4680
4681        block_comment("  for (j = i/2; j; j--) {"); {
4682          lsr(Rj, Ri, 1);
4683          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4684        } block_comment("  } // j");
4685
4686        post1_squaring();
4687        add(Ri, Ri, 1);
4688        cmp(Ri, Rlen);
4689        br(Assembler::LT, loop);
4690
4691        bind(end);
4692        block_comment("} // i");
4693      }
4694
4695      block_comment("for (int i = len; i < 2*len; i++) {");
4696      mov(Ri, Rlen); {
4697        Label loop, end;
4698        bind(loop);
4699        cmp(Ri, Rlen, Assembler::LSL, 1);
4700        br(Assembler::GE, end);
4701
4702        pre2(Ri, Rlen);
4703
4704        block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
4705          lsl(Rj, Rlen, 1);
4706          sub(Rj, Rj, Ri);
4707          sub(Rj, Rj, 1);
4708          lsr(Rj, Rj, 1);
4709          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4710        } block_comment("  } // j");
4711
4712        last_squaring(Ri);
4713
4714        block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
4715          lsl(Rj, Rlen, 1);
4716          sub(Rj, Rj, Ri);
4717          lsr(Rj, Rj, 1);
4718          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4719        } block_comment("  } // j");
4720
4721        post2(Ri, Rlen);
4722        add(Ri, Ri, 1);
4723        cmp(Ri, Rlen, Assembler::LSL, 1);
4724
4725        br(Assembler::LT, loop);
4726        bind(end);
4727        block_comment("} // i");
4728      }
4729
4730      normalize(Rlen);
4731
4732      mov(Ra, Pm_base);  // Save Pm_base in Ra
4733      restore_regs();  // Restore caller's Pm_base
4734
4735      // Copy our result into caller's Pm_base
4736      reverse(Pm_base, Ra, Rlen, t0, t1);
4737
4738      leave();
4739      ret(lr);
4740
4741      return entry;
4742    }
4743    // In C, approximately:
4744
4745    // void
4746    // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
4747    //                   unsigned long Pm_base[], unsigned long inv, int len) {
4748    //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4749    //   unsigned long *Pa, *Pb, *Pn, *Pm;
4750    //   unsigned long Ra, Rb, Rn, Rm;
4751
4752    //   int i;
4753
4754    //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4755
4756    //   for (i = 0; i < len; i++) {
4757    //     int j;
4758
4759    //     Pa = Pa_base;
4760    //     Pb = Pa_base + i;
4761    //     Pm = Pm_base;
4762    //     Pn = Pn_base + i;
4763
4764    //     Ra = *Pa;
4765    //     Rb = *Pb;
4766    //     Rm = *Pm;
4767    //     Rn = *Pn;
4768
4769    //     int iters = (i+1)/2;
4770    //     for (j = 0; iters--; j++) {
4771    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4772    //       MACC2(Ra, Rb, t0, t1, t2);
4773    //       Ra = *++Pa;
4774    //       Rb = *--Pb;
4775    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4776    //       MACC(Rm, Rn, t0, t1, t2);
4777    //       Rm = *++Pm;
4778    //       Rn = *--Pn;
4779    //     }
4780    //     if ((i & 1) == 0) {
4781    //       assert(Ra == Pa_base[j], "must be");
4782    //       MACC(Ra, Ra, t0, t1, t2);
4783    //     }
4784    //     iters = i/2;
4785    //     assert(iters == i-j, "must be");
4786    //     for (; iters--; j++) {
4787    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4788    //       MACC(Rm, Rn, t0, t1, t2);
4789    //       Rm = *++Pm;
4790    //       Rn = *--Pn;
4791    //     }
4792
4793    //     *Pm = Rm = t0 * inv;
4794    //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4795    //     MACC(Rm, Rn, t0, t1, t2);
4796
4797    //     assert(t0 == 0, "broken Montgomery multiply");
4798
4799    //     t0 = t1; t1 = t2; t2 = 0;
4800    //   }
4801
4802    //   for (i = len; i < 2*len; i++) {
4803    //     int start = i-len+1;
4804    //     int end = start + (len - start)/2;
4805    //     int j;
4806
4807    //     Pa = Pa_base + i-len;
4808    //     Pb = Pa_base + len;
4809    //     Pm = Pm_base + i-len;
4810    //     Pn = Pn_base + len;
4811
4812    //     Ra = *++Pa;
4813    //     Rb = *--Pb;
4814    //     Rm = *++Pm;
4815    //     Rn = *--Pn;
4816
4817    //     int iters = (2*len-i-1)/2;
4818    //     assert(iters == end-start, "must be");
4819    //     for (j = start; iters--; j++) {
4820    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4821    //       MACC2(Ra, Rb, t0, t1, t2);
4822    //       Ra = *++Pa;
4823    //       Rb = *--Pb;
4824    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4825    //       MACC(Rm, Rn, t0, t1, t2);
4826    //       Rm = *++Pm;
4827    //       Rn = *--Pn;
4828    //     }
4829    //     if ((i & 1) == 0) {
4830    //       assert(Ra == Pa_base[j], "must be");
4831    //       MACC(Ra, Ra, t0, t1, t2);
4832    //     }
4833    //     iters =  (2*len-i)/2;
4834    //     assert(iters == len-j, "must be");
4835    //     for (; iters--; j++) {
4836    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4837    //       MACC(Rm, Rn, t0, t1, t2);
4838    //       Rm = *++Pm;
4839    //       Rn = *--Pn;
4840    //     }
4841    //     Pm_base[i-len] = t0;
4842    //     t0 = t1; t1 = t2; t2 = 0;
4843    //   }
4844
4845    //   while (t0)
4846    //     t0 = sub(Pm_base, Pn_base, t0, len);
4847    // }
4848  };
4849
4850
4851  // Initialization
4852  void generate_initial() {
4853    // Generate initial stubs and initializes the entry points
4854
4855    // entry points that exist in all platforms Note: This is code
4856    // that could be shared among different platforms - however the
4857    // benefit seems to be smaller than the disadvantage of having a
4858    // much more complicated generator structure. See also comment in
4859    // stubRoutines.hpp.
4860
4861    StubRoutines::_forward_exception_entry = generate_forward_exception();
4862
4863    StubRoutines::_call_stub_entry =
4864      generate_call_stub(StubRoutines::_call_stub_return_address);
4865
4866    // is referenced by megamorphic call
4867    StubRoutines::_catch_exception_entry = generate_catch_exception();
4868
4869    // Build this early so it's available for the interpreter.
4870    StubRoutines::_throw_StackOverflowError_entry =
4871      generate_throw_exception("StackOverflowError throw_exception",
4872                               CAST_FROM_FN_PTR(address,
4873                                                SharedRuntime::throw_StackOverflowError));
4874    StubRoutines::_throw_delayed_StackOverflowError_entry =
4875      generate_throw_exception("delayed StackOverflowError throw_exception",
4876                               CAST_FROM_FN_PTR(address,
4877                                                SharedRuntime::throw_delayed_StackOverflowError));
4878    if (UseCRC32Intrinsics) {
4879      // set table address before stub generation which use it
4880      StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
4881      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
4882    }
4883  }
4884
4885  void generate_all() {
4886    // support for verify_oop (must happen after universe_init)
4887    StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
4888    StubRoutines::_throw_AbstractMethodError_entry =
4889      generate_throw_exception("AbstractMethodError throw_exception",
4890                               CAST_FROM_FN_PTR(address,
4891                                                SharedRuntime::
4892                                                throw_AbstractMethodError));
4893
4894    StubRoutines::_throw_IncompatibleClassChangeError_entry =
4895      generate_throw_exception("IncompatibleClassChangeError throw_exception",
4896                               CAST_FROM_FN_PTR(address,
4897                                                SharedRuntime::
4898                                                throw_IncompatibleClassChangeError));
4899
4900    StubRoutines::_throw_NullPointerException_at_call_entry =
4901      generate_throw_exception("NullPointerException at call throw_exception",
4902                               CAST_FROM_FN_PTR(address,
4903                                                SharedRuntime::
4904                                                throw_NullPointerException_at_call));
4905
4906    // arraycopy stubs used by compilers
4907    generate_arraycopy_stubs();
4908
4909    // has negatives stub for large arrays.
4910    StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
4911
4912    if (UseMultiplyToLenIntrinsic) {
4913      StubRoutines::_multiplyToLen = generate_multiplyToLen();
4914    }
4915
4916    if (UseMontgomeryMultiplyIntrinsic) {
4917      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
4918      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
4919      StubRoutines::_montgomeryMultiply = g.generate_multiply();
4920    }
4921
4922    if (UseMontgomerySquareIntrinsic) {
4923      StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
4924      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
4925      // We use generate_multiply() rather than generate_square()
4926      // because it's faster for the sizes of modulus we care about.
4927      StubRoutines::_montgomerySquare = g.generate_multiply();
4928    }
4929
4930#ifndef BUILTIN_SIM
4931    // generate GHASH intrinsics code
4932    if (UseGHASHIntrinsics) {
4933      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
4934    }
4935
4936    if (UseAESIntrinsics) {
4937      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
4938      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
4939      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
4940      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
4941    }
4942
4943    if (UseSHA1Intrinsics) {
4944      StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
4945      StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
4946    }
4947    if (UseSHA256Intrinsics) {
4948      StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
4949      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
4950    }
4951
4952    if (UseCRC32CIntrinsics) {
4953      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
4954    }
4955
4956    // generate Adler32 intrinsics code
4957    if (UseAdler32Intrinsics) {
4958      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
4959    }
4960
4961    // Safefetch stubs.
4962    generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
4963                                                       &StubRoutines::_safefetch32_fault_pc,
4964                                                       &StubRoutines::_safefetch32_continuation_pc);
4965    generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
4966                                                       &StubRoutines::_safefetchN_fault_pc,
4967                                                       &StubRoutines::_safefetchN_continuation_pc);
4968#endif
4969    StubRoutines::aarch64::set_completed();
4970  }
4971
4972 public:
4973  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
4974    if (all) {
4975      generate_all();
4976    } else {
4977      generate_initial();
4978    }
4979  }
4980}; // end class declaration
4981
4982void StubGenerator_generate(CodeBuffer* code, bool all) {
4983  StubGenerator g(code, all);
4984}
4985