1/*
2 * Copyright (c) 1999, 2016, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25#include "precompiled.hpp"
26#include "asm/macroAssembler.hpp"
27#include "asm/macroAssembler.inline.hpp"
28#include "interpreter/interpreter.hpp"
29#include "nativeInst_x86.hpp"
30#include "oops/instanceOop.hpp"
31#include "oops/method.hpp"
32#include "oops/objArrayKlass.hpp"
33#include "oops/oop.inline.hpp"
34#include "prims/methodHandles.hpp"
35#include "runtime/frame.inline.hpp"
36#include "runtime/handles.inline.hpp"
37#include "runtime/sharedRuntime.hpp"
38#include "runtime/stubCodeGenerator.hpp"
39#include "runtime/stubRoutines.hpp"
40#include "runtime/thread.inline.hpp"
41#ifdef COMPILER2
42#include "opto/runtime.hpp"
43#endif
44
45// Declaration and definition of StubGenerator (no .hpp file).
46// For a more detailed description of the stub routine structure
47// see the comment in stubRoutines.hpp
48
49#define __ _masm->
50#define a__ ((Assembler*)_masm)->
51
52#ifdef PRODUCT
53#define BLOCK_COMMENT(str) /* nothing */
54#else
55#define BLOCK_COMMENT(str) __ block_comment(str)
56#endif
57
58#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
59
60const int MXCSR_MASK  = 0xFFC0;  // Mask out any pending exceptions
61const int FPU_CNTRL_WRD_MASK = 0xFFFF;
62
63// -------------------------------------------------------------------------------------------------------------------------
64// Stub Code definitions
65
66class StubGenerator: public StubCodeGenerator {
67 private:
68
69#ifdef PRODUCT
70#define inc_counter_np(counter) ((void)0)
71#else
72  void inc_counter_np_(int& counter) {
73    __ incrementl(ExternalAddress((address)&counter));
74  }
75#define inc_counter_np(counter) \
76  BLOCK_COMMENT("inc_counter " #counter); \
77  inc_counter_np_(counter);
78#endif //PRODUCT
79
80  void inc_copy_counter_np(BasicType t) {
81#ifndef PRODUCT
82    switch (t) {
83    case T_BYTE:    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); return;
84    case T_SHORT:   inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); return;
85    case T_INT:     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); return;
86    case T_LONG:    inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); return;
87    case T_OBJECT:  inc_counter_np(SharedRuntime::_oop_array_copy_ctr); return;
88    }
89    ShouldNotReachHere();
90#endif //PRODUCT
91  }
92
93  //------------------------------------------------------------------------------------------------------------------------
94  // Call stubs are used to call Java from C
95  //
96  //    [ return_from_Java     ] <--- rsp
97  //    [ argument word n      ]
98  //      ...
99  // -N [ argument word 1      ]
100  // -7 [ Possible padding for stack alignment ]
101  // -6 [ Possible padding for stack alignment ]
102  // -5 [ Possible padding for stack alignment ]
103  // -4 [ mxcsr save           ] <--- rsp_after_call
104  // -3 [ saved rbx,            ]
105  // -2 [ saved rsi            ]
106  // -1 [ saved rdi            ]
107  //  0 [ saved rbp,            ] <--- rbp,
108  //  1 [ return address       ]
109  //  2 [ ptr. to call wrapper ]
110  //  3 [ result               ]
111  //  4 [ result_type          ]
112  //  5 [ method               ]
113  //  6 [ entry_point          ]
114  //  7 [ parameters           ]
115  //  8 [ parameter_size       ]
116  //  9 [ thread               ]
117
118
119  address generate_call_stub(address& return_address) {
120    StubCodeMark mark(this, "StubRoutines", "call_stub");
121    address start = __ pc();
122
123    // stub code parameters / addresses
124    assert(frame::entry_frame_call_wrapper_offset == 2, "adjust this code");
125    bool  sse_save = false;
126    const Address rsp_after_call(rbp, -4 * wordSize); // same as in generate_catch_exception()!
127    const int     locals_count_in_bytes  (4*wordSize);
128    const Address mxcsr_save    (rbp, -4 * wordSize);
129    const Address saved_rbx     (rbp, -3 * wordSize);
130    const Address saved_rsi     (rbp, -2 * wordSize);
131    const Address saved_rdi     (rbp, -1 * wordSize);
132    const Address result        (rbp,  3 * wordSize);
133    const Address result_type   (rbp,  4 * wordSize);
134    const Address method        (rbp,  5 * wordSize);
135    const Address entry_point   (rbp,  6 * wordSize);
136    const Address parameters    (rbp,  7 * wordSize);
137    const Address parameter_size(rbp,  8 * wordSize);
138    const Address thread        (rbp,  9 * wordSize); // same as in generate_catch_exception()!
139    sse_save =  UseSSE > 0;
140
141    // stub code
142    __ enter();
143    __ movptr(rcx, parameter_size);              // parameter counter
144    __ shlptr(rcx, Interpreter::logStackElementSize); // convert parameter count to bytes
145    __ addptr(rcx, locals_count_in_bytes);       // reserve space for register saves
146    __ subptr(rsp, rcx);
147    __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
148
149    // save rdi, rsi, & rbx, according to C calling conventions
150    __ movptr(saved_rdi, rdi);
151    __ movptr(saved_rsi, rsi);
152    __ movptr(saved_rbx, rbx);
153
154    // provide initial value for required masks
155    if (UseAVX > 2) {
156      __ movl(rbx, 0xffff);
157      __ kmovwl(k1, rbx);
158    }
159
160    // save and initialize %mxcsr
161    if (sse_save) {
162      Label skip_ldmx;
163      __ stmxcsr(mxcsr_save);
164      __ movl(rax, mxcsr_save);
165      __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
166      ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
167      __ cmp32(rax, mxcsr_std);
168      __ jcc(Assembler::equal, skip_ldmx);
169      __ ldmxcsr(mxcsr_std);
170      __ bind(skip_ldmx);
171    }
172
173    // make sure the control word is correct.
174    __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
175
176#ifdef ASSERT
177    // make sure we have no pending exceptions
178    { Label L;
179      __ movptr(rcx, thread);
180      __ cmpptr(Address(rcx, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
181      __ jcc(Assembler::equal, L);
182      __ stop("StubRoutines::call_stub: entered with pending exception");
183      __ bind(L);
184    }
185#endif
186
187    // pass parameters if any
188    BLOCK_COMMENT("pass parameters if any");
189    Label parameters_done;
190    __ movl(rcx, parameter_size);  // parameter counter
191    __ testl(rcx, rcx);
192    __ jcc(Assembler::zero, parameters_done);
193
194    // parameter passing loop
195
196    Label loop;
197    // Copy Java parameters in reverse order (receiver last)
198    // Note that the argument order is inverted in the process
199    // source is rdx[rcx: N-1..0]
200    // dest   is rsp[rbx: 0..N-1]
201
202    __ movptr(rdx, parameters);          // parameter pointer
203    __ xorptr(rbx, rbx);
204
205    __ BIND(loop);
206
207    // get parameter
208    __ movptr(rax, Address(rdx, rcx, Interpreter::stackElementScale(), -wordSize));
209    __ movptr(Address(rsp, rbx, Interpreter::stackElementScale(),
210                    Interpreter::expr_offset_in_bytes(0)), rax);          // store parameter
211    __ increment(rbx);
212    __ decrement(rcx);
213    __ jcc(Assembler::notZero, loop);
214
215    // call Java function
216    __ BIND(parameters_done);
217    __ movptr(rbx, method);           // get Method*
218    __ movptr(rax, entry_point);      // get entry_point
219    __ mov(rsi, rsp);                 // set sender sp
220    BLOCK_COMMENT("call Java function");
221    __ call(rax);
222
223    BLOCK_COMMENT("call_stub_return_address:");
224    return_address = __ pc();
225
226#ifdef COMPILER2
227    {
228      Label L_skip;
229      if (UseSSE >= 2) {
230        __ verify_FPU(0, "call_stub_return");
231      } else {
232        for (int i = 1; i < 8; i++) {
233          __ ffree(i);
234        }
235
236        // UseSSE <= 1 so double result should be left on TOS
237        __ movl(rsi, result_type);
238        __ cmpl(rsi, T_DOUBLE);
239        __ jcc(Assembler::equal, L_skip);
240        if (UseSSE == 0) {
241          // UseSSE == 0 so float result should be left on TOS
242          __ cmpl(rsi, T_FLOAT);
243          __ jcc(Assembler::equal, L_skip);
244        }
245        __ ffree(0);
246      }
247      __ BIND(L_skip);
248    }
249#endif // COMPILER2
250
251    // store result depending on type
252    // (everything that is not T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
253    __ movptr(rdi, result);
254    Label is_long, is_float, is_double, exit;
255    __ movl(rsi, result_type);
256    __ cmpl(rsi, T_LONG);
257    __ jcc(Assembler::equal, is_long);
258    __ cmpl(rsi, T_FLOAT);
259    __ jcc(Assembler::equal, is_float);
260    __ cmpl(rsi, T_DOUBLE);
261    __ jcc(Assembler::equal, is_double);
262
263    // handle T_INT case
264    __ movl(Address(rdi, 0), rax);
265    __ BIND(exit);
266
267    // check that FPU stack is empty
268    __ verify_FPU(0, "generate_call_stub");
269
270    // pop parameters
271    __ lea(rsp, rsp_after_call);
272
273    // restore %mxcsr
274    if (sse_save) {
275      __ ldmxcsr(mxcsr_save);
276    }
277
278    // restore rdi, rsi and rbx,
279    __ movptr(rbx, saved_rbx);
280    __ movptr(rsi, saved_rsi);
281    __ movptr(rdi, saved_rdi);
282    __ addptr(rsp, 4*wordSize);
283
284    // return
285    __ pop(rbp);
286    __ ret(0);
287
288    // handle return types different from T_INT
289    __ BIND(is_long);
290    __ movl(Address(rdi, 0 * wordSize), rax);
291    __ movl(Address(rdi, 1 * wordSize), rdx);
292    __ jmp(exit);
293
294    __ BIND(is_float);
295    // interpreter uses xmm0 for return values
296    if (UseSSE >= 1) {
297      __ movflt(Address(rdi, 0), xmm0);
298    } else {
299      __ fstp_s(Address(rdi, 0));
300    }
301    __ jmp(exit);
302
303    __ BIND(is_double);
304    // interpreter uses xmm0 for return values
305    if (UseSSE >= 2) {
306      __ movdbl(Address(rdi, 0), xmm0);
307    } else {
308      __ fstp_d(Address(rdi, 0));
309    }
310    __ jmp(exit);
311
312    return start;
313  }
314
315
316  //------------------------------------------------------------------------------------------------------------------------
317  // Return point for a Java call if there's an exception thrown in Java code.
318  // The exception is caught and transformed into a pending exception stored in
319  // JavaThread that can be tested from within the VM.
320  //
321  // Note: Usually the parameters are removed by the callee. In case of an exception
322  //       crossing an activation frame boundary, that is not the case if the callee
323  //       is compiled code => need to setup the rsp.
324  //
325  // rax,: exception oop
326
327  address generate_catch_exception() {
328    StubCodeMark mark(this, "StubRoutines", "catch_exception");
329    const Address rsp_after_call(rbp, -4 * wordSize); // same as in generate_call_stub()!
330    const Address thread        (rbp,  9 * wordSize); // same as in generate_call_stub()!
331    address start = __ pc();
332
333    // get thread directly
334    __ movptr(rcx, thread);
335#ifdef ASSERT
336    // verify that threads correspond
337    { Label L;
338      __ get_thread(rbx);
339      __ cmpptr(rbx, rcx);
340      __ jcc(Assembler::equal, L);
341      __ stop("StubRoutines::catch_exception: threads must correspond");
342      __ bind(L);
343    }
344#endif
345    // set pending exception
346    __ verify_oop(rax);
347    __ movptr(Address(rcx, Thread::pending_exception_offset()), rax          );
348    __ lea(Address(rcx, Thread::exception_file_offset   ()),
349           ExternalAddress((address)__FILE__));
350    __ movl(Address(rcx, Thread::exception_line_offset   ()), __LINE__ );
351    // complete return to VM
352    assert(StubRoutines::_call_stub_return_address != NULL, "_call_stub_return_address must have been generated before");
353    __ jump(RuntimeAddress(StubRoutines::_call_stub_return_address));
354
355    return start;
356  }
357
358
359  //------------------------------------------------------------------------------------------------------------------------
360  // Continuation point for runtime calls returning with a pending exception.
361  // The pending exception check happened in the runtime or native call stub.
362  // The pending exception in Thread is converted into a Java-level exception.
363  //
364  // Contract with Java-level exception handlers:
365  // rax: exception
366  // rdx: throwing pc
367  //
368  // NOTE: At entry of this stub, exception-pc must be on stack !!
369
370  address generate_forward_exception() {
371    StubCodeMark mark(this, "StubRoutines", "forward exception");
372    address start = __ pc();
373    const Register thread = rcx;
374
375    // other registers used in this stub
376    const Register exception_oop = rax;
377    const Register handler_addr  = rbx;
378    const Register exception_pc  = rdx;
379
380    // Upon entry, the sp points to the return address returning into Java
381    // (interpreted or compiled) code; i.e., the return address becomes the
382    // throwing pc.
383    //
384    // Arguments pushed before the runtime call are still on the stack but
385    // the exception handler will reset the stack pointer -> ignore them.
386    // A potential result in registers can be ignored as well.
387
388#ifdef ASSERT
389    // make sure this code is only executed if there is a pending exception
390    { Label L;
391      __ get_thread(thread);
392      __ cmpptr(Address(thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
393      __ jcc(Assembler::notEqual, L);
394      __ stop("StubRoutines::forward exception: no pending exception (1)");
395      __ bind(L);
396    }
397#endif
398
399    // compute exception handler into rbx,
400    __ get_thread(thread);
401    __ movptr(exception_pc, Address(rsp, 0));
402    BLOCK_COMMENT("call exception_handler_for_return_address");
403    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), thread, exception_pc);
404    __ mov(handler_addr, rax);
405
406    // setup rax & rdx, remove return address & clear pending exception
407    __ get_thread(thread);
408    __ pop(exception_pc);
409    __ movptr(exception_oop, Address(thread, Thread::pending_exception_offset()));
410    __ movptr(Address(thread, Thread::pending_exception_offset()), NULL_WORD);
411
412#ifdef ASSERT
413    // make sure exception is set
414    { Label L;
415      __ testptr(exception_oop, exception_oop);
416      __ jcc(Assembler::notEqual, L);
417      __ stop("StubRoutines::forward exception: no pending exception (2)");
418      __ bind(L);
419    }
420#endif
421
422    // Verify that there is really a valid exception in RAX.
423    __ verify_oop(exception_oop);
424
425    // continue at exception handler (return address removed)
426    // rax: exception
427    // rbx: exception handler
428    // rdx: throwing pc
429    __ jmp(handler_addr);
430
431    return start;
432  }
433
434
435  //----------------------------------------------------------------------------------------------------
436  // Support for jint Atomic::xchg(jint exchange_value, volatile jint* dest)
437  //
438  // xchg exists as far back as 8086, lock needed for MP only
439  // Stack layout immediately after call:
440  //
441  // 0 [ret addr ] <--- rsp
442  // 1 [  ex     ]
443  // 2 [  dest   ]
444  //
445  // Result:   *dest <- ex, return (old *dest)
446  //
447  // Note: win32 does not currently use this code
448
449  address generate_atomic_xchg() {
450    StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
451    address start = __ pc();
452
453    __ push(rdx);
454    Address exchange(rsp, 2 * wordSize);
455    Address dest_addr(rsp, 3 * wordSize);
456    __ movl(rax, exchange);
457    __ movptr(rdx, dest_addr);
458    __ xchgl(rax, Address(rdx, 0));
459    __ pop(rdx);
460    __ ret(0);
461
462    return start;
463  }
464
465  //----------------------------------------------------------------------------------------------------
466  // Support for void verify_mxcsr()
467  //
468  // This routine is used with -Xcheck:jni to verify that native
469  // JNI code does not return to Java code without restoring the
470  // MXCSR register to our expected state.
471
472
473  address generate_verify_mxcsr() {
474    StubCodeMark mark(this, "StubRoutines", "verify_mxcsr");
475    address start = __ pc();
476
477    const Address mxcsr_save(rsp, 0);
478
479    if (CheckJNICalls && UseSSE > 0 ) {
480      Label ok_ret;
481      ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
482      __ push(rax);
483      __ subptr(rsp, wordSize);      // allocate a temp location
484      __ stmxcsr(mxcsr_save);
485      __ movl(rax, mxcsr_save);
486      __ andl(rax, MXCSR_MASK);
487      __ cmp32(rax, mxcsr_std);
488      __ jcc(Assembler::equal, ok_ret);
489
490      __ warn("MXCSR changed by native JNI code.");
491
492      __ ldmxcsr(mxcsr_std);
493
494      __ bind(ok_ret);
495      __ addptr(rsp, wordSize);
496      __ pop(rax);
497    }
498
499    __ ret(0);
500
501    return start;
502  }
503
504
505  //---------------------------------------------------------------------------
506  // Support for void verify_fpu_cntrl_wrd()
507  //
508  // This routine is used with -Xcheck:jni to verify that native
509  // JNI code does not return to Java code without restoring the
510  // FP control word to our expected state.
511
512  address generate_verify_fpu_cntrl_wrd() {
513    StubCodeMark mark(this, "StubRoutines", "verify_spcw");
514    address start = __ pc();
515
516    const Address fpu_cntrl_wrd_save(rsp, 0);
517
518    if (CheckJNICalls) {
519      Label ok_ret;
520      __ push(rax);
521      __ subptr(rsp, wordSize);      // allocate a temp location
522      __ fnstcw(fpu_cntrl_wrd_save);
523      __ movl(rax, fpu_cntrl_wrd_save);
524      __ andl(rax, FPU_CNTRL_WRD_MASK);
525      ExternalAddress fpu_std(StubRoutines::addr_fpu_cntrl_wrd_std());
526      __ cmp32(rax, fpu_std);
527      __ jcc(Assembler::equal, ok_ret);
528
529      __ warn("Floating point control word changed by native JNI code.");
530
531      __ fldcw(fpu_std);
532
533      __ bind(ok_ret);
534      __ addptr(rsp, wordSize);
535      __ pop(rax);
536    }
537
538    __ ret(0);
539
540    return start;
541  }
542
543  //---------------------------------------------------------------------------
544  // Wrapper for slow-case handling of double-to-integer conversion
545  // d2i or f2i fast case failed either because it is nan or because
546  // of under/overflow.
547  // Input:  FPU TOS: float value
548  // Output: rax, (rdx): integer (long) result
549
550  address generate_d2i_wrapper(BasicType t, address fcn) {
551    StubCodeMark mark(this, "StubRoutines", "d2i_wrapper");
552    address start = __ pc();
553
554  // Capture info about frame layout
555  enum layout { FPUState_off         = 0,
556                rbp_off              = FPUStateSizeInWords,
557                rdi_off,
558                rsi_off,
559                rcx_off,
560                rbx_off,
561                saved_argument_off,
562                saved_argument_off2, // 2nd half of double
563                framesize
564  };
565
566  assert(FPUStateSizeInWords == 27, "update stack layout");
567
568    // Save outgoing argument to stack across push_FPU_state()
569    __ subptr(rsp, wordSize * 2);
570    __ fstp_d(Address(rsp, 0));
571
572    // Save CPU & FPU state
573    __ push(rbx);
574    __ push(rcx);
575    __ push(rsi);
576    __ push(rdi);
577    __ push(rbp);
578    __ push_FPU_state();
579
580    // push_FPU_state() resets the FP top of stack
581    // Load original double into FP top of stack
582    __ fld_d(Address(rsp, saved_argument_off * wordSize));
583    // Store double into stack as outgoing argument
584    __ subptr(rsp, wordSize*2);
585    __ fst_d(Address(rsp, 0));
586
587    // Prepare FPU for doing math in C-land
588    __ empty_FPU_stack();
589    // Call the C code to massage the double.  Result in EAX
590    if (t == T_INT)
591      { BLOCK_COMMENT("SharedRuntime::d2i"); }
592    else if (t == T_LONG)
593      { BLOCK_COMMENT("SharedRuntime::d2l"); }
594    __ call_VM_leaf( fcn, 2 );
595
596    // Restore CPU & FPU state
597    __ pop_FPU_state();
598    __ pop(rbp);
599    __ pop(rdi);
600    __ pop(rsi);
601    __ pop(rcx);
602    __ pop(rbx);
603    __ addptr(rsp, wordSize * 2);
604
605    __ ret(0);
606
607    return start;
608  }
609
610
611  //----------------------------------------------------------------------------------------------------
612  // Non-destructive plausibility checks for oops
613
614  address generate_verify_oop() {
615    StubCodeMark mark(this, "StubRoutines", "verify_oop");
616    address start = __ pc();
617
618    // Incoming arguments on stack after saving rax,:
619    //
620    // [tos    ]: saved rdx
621    // [tos + 1]: saved EFLAGS
622    // [tos + 2]: return address
623    // [tos + 3]: char* error message
624    // [tos + 4]: oop   object to verify
625    // [tos + 5]: saved rax, - saved by caller and bashed
626
627    Label exit, error;
628    __ pushf();
629    __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
630    __ push(rdx);                                // save rdx
631    // make sure object is 'reasonable'
632    __ movptr(rax, Address(rsp, 4 * wordSize));    // get object
633    __ testptr(rax, rax);
634    __ jcc(Assembler::zero, exit);               // if obj is NULL it is ok
635
636    // Check if the oop is in the right area of memory
637    const int oop_mask = Universe::verify_oop_mask();
638    const int oop_bits = Universe::verify_oop_bits();
639    __ mov(rdx, rax);
640    __ andptr(rdx, oop_mask);
641    __ cmpptr(rdx, oop_bits);
642    __ jcc(Assembler::notZero, error);
643
644    // make sure klass is 'reasonable', which is not zero.
645    __ movptr(rax, Address(rax, oopDesc::klass_offset_in_bytes())); // get klass
646    __ testptr(rax, rax);
647    __ jcc(Assembler::zero, error);              // if klass is NULL it is broken
648
649    // return if everything seems ok
650    __ bind(exit);
651    __ movptr(rax, Address(rsp, 5 * wordSize));  // get saved rax, back
652    __ pop(rdx);                                 // restore rdx
653    __ popf();                                   // restore EFLAGS
654    __ ret(3 * wordSize);                        // pop arguments
655
656    // handle errors
657    __ bind(error);
658    __ movptr(rax, Address(rsp, 5 * wordSize));  // get saved rax, back
659    __ pop(rdx);                                 // get saved rdx back
660    __ popf();                                   // get saved EFLAGS off stack -- will be ignored
661    __ pusha();                                  // push registers (eip = return address & msg are already pushed)
662    BLOCK_COMMENT("call MacroAssembler::debug");
663    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
664    __ popa();
665    __ ret(3 * wordSize);                        // pop arguments
666    return start;
667  }
668
669  //
670  //  Generate pre-barrier for array stores
671  //
672  //  Input:
673  //     start   -  starting address
674  //     count   -  element count
675  void  gen_write_ref_array_pre_barrier(Register start, Register count, bool uninitialized_target) {
676    assert_different_registers(start, count);
677    BarrierSet* bs = Universe::heap()->barrier_set();
678    switch (bs->kind()) {
679      case BarrierSet::G1SATBCTLogging:
680        // With G1, don't generate the call if we statically know that the target in uninitialized
681        if (!uninitialized_target) {
682           __ pusha();                      // push registers
683           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre),
684                           start, count);
685           __ popa();
686         }
687        break;
688      case BarrierSet::CardTableForRS:
689      case BarrierSet::CardTableExtension:
690      case BarrierSet::ModRef:
691        break;
692      default      :
693        ShouldNotReachHere();
694
695    }
696  }
697
698
699  //
700  // Generate a post-barrier for an array store
701  //
702  //     start    -  starting address
703  //     count    -  element count
704  //
705  //  The two input registers are overwritten.
706  //
707  void  gen_write_ref_array_post_barrier(Register start, Register count) {
708    BarrierSet* bs = Universe::heap()->barrier_set();
709    assert_different_registers(start, count);
710    switch (bs->kind()) {
711      case BarrierSet::G1SATBCTLogging:
712        {
713          __ pusha();                      // push registers
714          __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post),
715                          start, count);
716          __ popa();
717        }
718        break;
719
720      case BarrierSet::CardTableForRS:
721      case BarrierSet::CardTableExtension:
722        {
723          CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs);
724          assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
725
726          Label L_loop;
727          const Register end = count;  // elements count; end == start+count-1
728          assert_different_registers(start, end);
729
730          __ lea(end,  Address(start, count, Address::times_ptr, -wordSize));
731          __ shrptr(start, CardTableModRefBS::card_shift);
732          __ shrptr(end,   CardTableModRefBS::card_shift);
733          __ subptr(end, start); // end --> count
734        __ BIND(L_loop);
735          intptr_t disp = (intptr_t) ct->byte_map_base;
736          Address cardtable(start, count, Address::times_1, disp);
737          __ movb(cardtable, 0);
738          __ decrement(count);
739          __ jcc(Assembler::greaterEqual, L_loop);
740        }
741        break;
742      case BarrierSet::ModRef:
743        break;
744      default      :
745        ShouldNotReachHere();
746
747    }
748  }
749
750
751  // Copy 64 bytes chunks
752  //
753  // Inputs:
754  //   from        - source array address
755  //   to_from     - destination array address - from
756  //   qword_count - 8-bytes element count, negative
757  //
758  void xmm_copy_forward(Register from, Register to_from, Register qword_count) {
759    assert( UseSSE >= 2, "supported cpu only" );
760    Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
761    if (UseAVX > 2) {
762      __ push(rbx);
763      __ movl(rbx, 0xffff);
764      __ kmovwl(k1, rbx);
765      __ pop(rbx);
766    }
767    // Copy 64-byte chunks
768    __ jmpb(L_copy_64_bytes);
769    __ align(OptoLoopAlignment);
770  __ BIND(L_copy_64_bytes_loop);
771
772    if (UseUnalignedLoadStores) {
773      if (UseAVX > 2) {
774        __ evmovdqul(xmm0, Address(from, 0), Assembler::AVX_512bit);
775        __ evmovdqul(Address(from, to_from, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
776      } else if (UseAVX == 2) {
777        __ vmovdqu(xmm0, Address(from,  0));
778        __ vmovdqu(Address(from, to_from, Address::times_1,  0), xmm0);
779        __ vmovdqu(xmm1, Address(from, 32));
780        __ vmovdqu(Address(from, to_from, Address::times_1, 32), xmm1);
781      } else {
782        __ movdqu(xmm0, Address(from, 0));
783        __ movdqu(Address(from, to_from, Address::times_1, 0), xmm0);
784        __ movdqu(xmm1, Address(from, 16));
785        __ movdqu(Address(from, to_from, Address::times_1, 16), xmm1);
786        __ movdqu(xmm2, Address(from, 32));
787        __ movdqu(Address(from, to_from, Address::times_1, 32), xmm2);
788        __ movdqu(xmm3, Address(from, 48));
789        __ movdqu(Address(from, to_from, Address::times_1, 48), xmm3);
790      }
791    } else {
792      __ movq(xmm0, Address(from, 0));
793      __ movq(Address(from, to_from, Address::times_1, 0), xmm0);
794      __ movq(xmm1, Address(from, 8));
795      __ movq(Address(from, to_from, Address::times_1, 8), xmm1);
796      __ movq(xmm2, Address(from, 16));
797      __ movq(Address(from, to_from, Address::times_1, 16), xmm2);
798      __ movq(xmm3, Address(from, 24));
799      __ movq(Address(from, to_from, Address::times_1, 24), xmm3);
800      __ movq(xmm4, Address(from, 32));
801      __ movq(Address(from, to_from, Address::times_1, 32), xmm4);
802      __ movq(xmm5, Address(from, 40));
803      __ movq(Address(from, to_from, Address::times_1, 40), xmm5);
804      __ movq(xmm6, Address(from, 48));
805      __ movq(Address(from, to_from, Address::times_1, 48), xmm6);
806      __ movq(xmm7, Address(from, 56));
807      __ movq(Address(from, to_from, Address::times_1, 56), xmm7);
808    }
809
810    __ addl(from, 64);
811  __ BIND(L_copy_64_bytes);
812    __ subl(qword_count, 8);
813    __ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop);
814
815    if (UseUnalignedLoadStores && (UseAVX == 2)) {
816      // clean upper bits of YMM registers
817      __ vpxor(xmm0, xmm0);
818      __ vpxor(xmm1, xmm1);
819    }
820    __ addl(qword_count, 8);
821    __ jccb(Assembler::zero, L_exit);
822    //
823    // length is too short, just copy qwords
824    //
825  __ BIND(L_copy_8_bytes);
826    __ movq(xmm0, Address(from, 0));
827    __ movq(Address(from, to_from, Address::times_1), xmm0);
828    __ addl(from, 8);
829    __ decrement(qword_count);
830    __ jcc(Assembler::greater, L_copy_8_bytes);
831  __ BIND(L_exit);
832  }
833
834  // Copy 64 bytes chunks
835  //
836  // Inputs:
837  //   from        - source array address
838  //   to_from     - destination array address - from
839  //   qword_count - 8-bytes element count, negative
840  //
841  void mmx_copy_forward(Register from, Register to_from, Register qword_count) {
842    assert( VM_Version::supports_mmx(), "supported cpu only" );
843    Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
844    // Copy 64-byte chunks
845    __ jmpb(L_copy_64_bytes);
846    __ align(OptoLoopAlignment);
847  __ BIND(L_copy_64_bytes_loop);
848    __ movq(mmx0, Address(from, 0));
849    __ movq(mmx1, Address(from, 8));
850    __ movq(mmx2, Address(from, 16));
851    __ movq(Address(from, to_from, Address::times_1, 0), mmx0);
852    __ movq(mmx3, Address(from, 24));
853    __ movq(Address(from, to_from, Address::times_1, 8), mmx1);
854    __ movq(mmx4, Address(from, 32));
855    __ movq(Address(from, to_from, Address::times_1, 16), mmx2);
856    __ movq(mmx5, Address(from, 40));
857    __ movq(Address(from, to_from, Address::times_1, 24), mmx3);
858    __ movq(mmx6, Address(from, 48));
859    __ movq(Address(from, to_from, Address::times_1, 32), mmx4);
860    __ movq(mmx7, Address(from, 56));
861    __ movq(Address(from, to_from, Address::times_1, 40), mmx5);
862    __ movq(Address(from, to_from, Address::times_1, 48), mmx6);
863    __ movq(Address(from, to_from, Address::times_1, 56), mmx7);
864    __ addptr(from, 64);
865  __ BIND(L_copy_64_bytes);
866    __ subl(qword_count, 8);
867    __ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop);
868    __ addl(qword_count, 8);
869    __ jccb(Assembler::zero, L_exit);
870    //
871    // length is too short, just copy qwords
872    //
873  __ BIND(L_copy_8_bytes);
874    __ movq(mmx0, Address(from, 0));
875    __ movq(Address(from, to_from, Address::times_1), mmx0);
876    __ addptr(from, 8);
877    __ decrement(qword_count);
878    __ jcc(Assembler::greater, L_copy_8_bytes);
879  __ BIND(L_exit);
880    __ emms();
881  }
882
883  address generate_disjoint_copy(BasicType t, bool aligned,
884                                 Address::ScaleFactor sf,
885                                 address* entry, const char *name,
886                                 bool dest_uninitialized = false) {
887    __ align(CodeEntryAlignment);
888    StubCodeMark mark(this, "StubRoutines", name);
889    address start = __ pc();
890
891    Label L_0_count, L_exit, L_skip_align1, L_skip_align2, L_copy_byte;
892    Label L_copy_2_bytes, L_copy_4_bytes, L_copy_64_bytes;
893
894    int shift = Address::times_ptr - sf;
895
896    const Register from     = rsi;  // source array address
897    const Register to       = rdi;  // destination array address
898    const Register count    = rcx;  // elements count
899    const Register to_from  = to;   // (to - from)
900    const Register saved_to = rdx;  // saved destination array address
901
902    __ enter(); // required for proper stackwalking of RuntimeStub frame
903    __ push(rsi);
904    __ push(rdi);
905    __ movptr(from , Address(rsp, 12+ 4));
906    __ movptr(to   , Address(rsp, 12+ 8));
907    __ movl(count, Address(rsp, 12+ 12));
908
909    if (entry != NULL) {
910      *entry = __ pc(); // Entry point from conjoint arraycopy stub.
911      BLOCK_COMMENT("Entry:");
912    }
913
914    if (t == T_OBJECT) {
915      __ testl(count, count);
916      __ jcc(Assembler::zero, L_0_count);
917      gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
918      __ mov(saved_to, to);          // save 'to'
919    }
920
921    __ subptr(to, from); // to --> to_from
922    __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
923    __ jcc(Assembler::below, L_copy_4_bytes); // use unsigned cmp
924    if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
925      // align source address at 4 bytes address boundary
926      if (t == T_BYTE) {
927        // One byte misalignment happens only for byte arrays
928        __ testl(from, 1);
929        __ jccb(Assembler::zero, L_skip_align1);
930        __ movb(rax, Address(from, 0));
931        __ movb(Address(from, to_from, Address::times_1, 0), rax);
932        __ increment(from);
933        __ decrement(count);
934      __ BIND(L_skip_align1);
935      }
936      // Two bytes misalignment happens only for byte and short (char) arrays
937      __ testl(from, 2);
938      __ jccb(Assembler::zero, L_skip_align2);
939      __ movw(rax, Address(from, 0));
940      __ movw(Address(from, to_from, Address::times_1, 0), rax);
941      __ addptr(from, 2);
942      __ subl(count, 1<<(shift-1));
943    __ BIND(L_skip_align2);
944    }
945    if (!VM_Version::supports_mmx()) {
946      __ mov(rax, count);      // save 'count'
947      __ shrl(count, shift); // bytes count
948      __ addptr(to_from, from);// restore 'to'
949      __ rep_mov();
950      __ subptr(to_from, from);// restore 'to_from'
951      __ mov(count, rax);      // restore 'count'
952      __ jmpb(L_copy_2_bytes); // all dwords were copied
953    } else {
954      if (!UseUnalignedLoadStores) {
955        // align to 8 bytes, we know we are 4 byte aligned to start
956        __ testptr(from, 4);
957        __ jccb(Assembler::zero, L_copy_64_bytes);
958        __ movl(rax, Address(from, 0));
959        __ movl(Address(from, to_from, Address::times_1, 0), rax);
960        __ addptr(from, 4);
961        __ subl(count, 1<<shift);
962      }
963    __ BIND(L_copy_64_bytes);
964      __ mov(rax, count);
965      __ shrl(rax, shift+1);  // 8 bytes chunk count
966      //
967      // Copy 8-byte chunks through MMX registers, 8 per iteration of the loop
968      //
969      if (UseXMMForArrayCopy) {
970        xmm_copy_forward(from, to_from, rax);
971      } else {
972        mmx_copy_forward(from, to_from, rax);
973      }
974    }
975    // copy tailing dword
976  __ BIND(L_copy_4_bytes);
977    __ testl(count, 1<<shift);
978    __ jccb(Assembler::zero, L_copy_2_bytes);
979    __ movl(rax, Address(from, 0));
980    __ movl(Address(from, to_from, Address::times_1, 0), rax);
981    if (t == T_BYTE || t == T_SHORT) {
982      __ addptr(from, 4);
983    __ BIND(L_copy_2_bytes);
984      // copy tailing word
985      __ testl(count, 1<<(shift-1));
986      __ jccb(Assembler::zero, L_copy_byte);
987      __ movw(rax, Address(from, 0));
988      __ movw(Address(from, to_from, Address::times_1, 0), rax);
989      if (t == T_BYTE) {
990        __ addptr(from, 2);
991      __ BIND(L_copy_byte);
992        // copy tailing byte
993        __ testl(count, 1);
994        __ jccb(Assembler::zero, L_exit);
995        __ movb(rax, Address(from, 0));
996        __ movb(Address(from, to_from, Address::times_1, 0), rax);
997      __ BIND(L_exit);
998      } else {
999      __ BIND(L_copy_byte);
1000      }
1001    } else {
1002    __ BIND(L_copy_2_bytes);
1003    }
1004
1005    if (t == T_OBJECT) {
1006      __ movl(count, Address(rsp, 12+12)); // reread 'count'
1007      __ mov(to, saved_to); // restore 'to'
1008      gen_write_ref_array_post_barrier(to, count);
1009    __ BIND(L_0_count);
1010    }
1011    inc_copy_counter_np(t);
1012    __ pop(rdi);
1013    __ pop(rsi);
1014    __ leave(); // required for proper stackwalking of RuntimeStub frame
1015    __ xorptr(rax, rax); // return 0
1016    __ ret(0);
1017    return start;
1018  }
1019
1020
1021  address generate_fill(BasicType t, bool aligned, const char *name) {
1022    __ align(CodeEntryAlignment);
1023    StubCodeMark mark(this, "StubRoutines", name);
1024    address start = __ pc();
1025
1026    BLOCK_COMMENT("Entry:");
1027
1028    const Register to       = rdi;  // source array address
1029    const Register value    = rdx;  // value
1030    const Register count    = rsi;  // elements count
1031
1032    __ enter(); // required for proper stackwalking of RuntimeStub frame
1033    __ push(rsi);
1034    __ push(rdi);
1035    __ movptr(to   , Address(rsp, 12+ 4));
1036    __ movl(value, Address(rsp, 12+ 8));
1037    __ movl(count, Address(rsp, 12+ 12));
1038
1039    __ generate_fill(t, aligned, to, value, count, rax, xmm0);
1040
1041    __ pop(rdi);
1042    __ pop(rsi);
1043    __ leave(); // required for proper stackwalking of RuntimeStub frame
1044    __ ret(0);
1045    return start;
1046  }
1047
1048  address generate_conjoint_copy(BasicType t, bool aligned,
1049                                 Address::ScaleFactor sf,
1050                                 address nooverlap_target,
1051                                 address* entry, const char *name,
1052                                 bool dest_uninitialized = false) {
1053    __ align(CodeEntryAlignment);
1054    StubCodeMark mark(this, "StubRoutines", name);
1055    address start = __ pc();
1056
1057    Label L_0_count, L_exit, L_skip_align1, L_skip_align2, L_copy_byte;
1058    Label L_copy_2_bytes, L_copy_4_bytes, L_copy_8_bytes, L_copy_8_bytes_loop;
1059
1060    int shift = Address::times_ptr - sf;
1061
1062    const Register src   = rax;  // source array address
1063    const Register dst   = rdx;  // destination array address
1064    const Register from  = rsi;  // source array address
1065    const Register to    = rdi;  // destination array address
1066    const Register count = rcx;  // elements count
1067    const Register end   = rax;  // array end address
1068
1069    __ enter(); // required for proper stackwalking of RuntimeStub frame
1070    __ push(rsi);
1071    __ push(rdi);
1072    __ movptr(src  , Address(rsp, 12+ 4));   // from
1073    __ movptr(dst  , Address(rsp, 12+ 8));   // to
1074    __ movl2ptr(count, Address(rsp, 12+12)); // count
1075
1076    if (entry != NULL) {
1077      *entry = __ pc(); // Entry point from generic arraycopy stub.
1078      BLOCK_COMMENT("Entry:");
1079    }
1080
1081    // nooverlap_target expects arguments in rsi and rdi.
1082    __ mov(from, src);
1083    __ mov(to  , dst);
1084
1085    // arrays overlap test: dispatch to disjoint stub if necessary.
1086    RuntimeAddress nooverlap(nooverlap_target);
1087    __ cmpptr(dst, src);
1088    __ lea(end, Address(src, count, sf, 0)); // src + count * elem_size
1089    __ jump_cc(Assembler::belowEqual, nooverlap);
1090    __ cmpptr(dst, end);
1091    __ jump_cc(Assembler::aboveEqual, nooverlap);
1092
1093    if (t == T_OBJECT) {
1094      __ testl(count, count);
1095      __ jcc(Assembler::zero, L_0_count);
1096      gen_write_ref_array_pre_barrier(dst, count, dest_uninitialized);
1097    }
1098
1099    // copy from high to low
1100    __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
1101    __ jcc(Assembler::below, L_copy_4_bytes); // use unsigned cmp
1102    if (t == T_BYTE || t == T_SHORT) {
1103      // Align the end of destination array at 4 bytes address boundary
1104      __ lea(end, Address(dst, count, sf, 0));
1105      if (t == T_BYTE) {
1106        // One byte misalignment happens only for byte arrays
1107        __ testl(end, 1);
1108        __ jccb(Assembler::zero, L_skip_align1);
1109        __ decrement(count);
1110        __ movb(rdx, Address(from, count, sf, 0));
1111        __ movb(Address(to, count, sf, 0), rdx);
1112      __ BIND(L_skip_align1);
1113      }
1114      // Two bytes misalignment happens only for byte and short (char) arrays
1115      __ testl(end, 2);
1116      __ jccb(Assembler::zero, L_skip_align2);
1117      __ subptr(count, 1<<(shift-1));
1118      __ movw(rdx, Address(from, count, sf, 0));
1119      __ movw(Address(to, count, sf, 0), rdx);
1120    __ BIND(L_skip_align2);
1121      __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
1122      __ jcc(Assembler::below, L_copy_4_bytes);
1123    }
1124
1125    if (!VM_Version::supports_mmx()) {
1126      __ std();
1127      __ mov(rax, count); // Save 'count'
1128      __ mov(rdx, to);    // Save 'to'
1129      __ lea(rsi, Address(from, count, sf, -4));
1130      __ lea(rdi, Address(to  , count, sf, -4));
1131      __ shrptr(count, shift); // bytes count
1132      __ rep_mov();
1133      __ cld();
1134      __ mov(count, rax); // restore 'count'
1135      __ andl(count, (1<<shift)-1);      // mask the number of rest elements
1136      __ movptr(from, Address(rsp, 12+4)); // reread 'from'
1137      __ mov(to, rdx);   // restore 'to'
1138      __ jmpb(L_copy_2_bytes); // all dword were copied
1139   } else {
1140      // Align to 8 bytes the end of array. It is aligned to 4 bytes already.
1141      __ testptr(end, 4);
1142      __ jccb(Assembler::zero, L_copy_8_bytes);
1143      __ subl(count, 1<<shift);
1144      __ movl(rdx, Address(from, count, sf, 0));
1145      __ movl(Address(to, count, sf, 0), rdx);
1146      __ jmpb(L_copy_8_bytes);
1147
1148      __ align(OptoLoopAlignment);
1149      // Move 8 bytes
1150    __ BIND(L_copy_8_bytes_loop);
1151      if (UseXMMForArrayCopy) {
1152        __ movq(xmm0, Address(from, count, sf, 0));
1153        __ movq(Address(to, count, sf, 0), xmm0);
1154      } else {
1155        __ movq(mmx0, Address(from, count, sf, 0));
1156        __ movq(Address(to, count, sf, 0), mmx0);
1157      }
1158    __ BIND(L_copy_8_bytes);
1159      __ subl(count, 2<<shift);
1160      __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
1161      __ addl(count, 2<<shift);
1162      if (!UseXMMForArrayCopy) {
1163        __ emms();
1164      }
1165    }
1166  __ BIND(L_copy_4_bytes);
1167    // copy prefix qword
1168    __ testl(count, 1<<shift);
1169    __ jccb(Assembler::zero, L_copy_2_bytes);
1170    __ movl(rdx, Address(from, count, sf, -4));
1171    __ movl(Address(to, count, sf, -4), rdx);
1172
1173    if (t == T_BYTE || t == T_SHORT) {
1174        __ subl(count, (1<<shift));
1175      __ BIND(L_copy_2_bytes);
1176        // copy prefix dword
1177        __ testl(count, 1<<(shift-1));
1178        __ jccb(Assembler::zero, L_copy_byte);
1179        __ movw(rdx, Address(from, count, sf, -2));
1180        __ movw(Address(to, count, sf, -2), rdx);
1181        if (t == T_BYTE) {
1182          __ subl(count, 1<<(shift-1));
1183        __ BIND(L_copy_byte);
1184          // copy prefix byte
1185          __ testl(count, 1);
1186          __ jccb(Assembler::zero, L_exit);
1187          __ movb(rdx, Address(from, 0));
1188          __ movb(Address(to, 0), rdx);
1189        __ BIND(L_exit);
1190        } else {
1191        __ BIND(L_copy_byte);
1192        }
1193    } else {
1194    __ BIND(L_copy_2_bytes);
1195    }
1196    if (t == T_OBJECT) {
1197      __ movl2ptr(count, Address(rsp, 12+12)); // reread count
1198      gen_write_ref_array_post_barrier(to, count);
1199    __ BIND(L_0_count);
1200    }
1201    inc_copy_counter_np(t);
1202    __ pop(rdi);
1203    __ pop(rsi);
1204    __ leave(); // required for proper stackwalking of RuntimeStub frame
1205    __ xorptr(rax, rax); // return 0
1206    __ ret(0);
1207    return start;
1208  }
1209
1210
1211  address generate_disjoint_long_copy(address* entry, const char *name) {
1212    __ align(CodeEntryAlignment);
1213    StubCodeMark mark(this, "StubRoutines", name);
1214    address start = __ pc();
1215
1216    Label L_copy_8_bytes, L_copy_8_bytes_loop;
1217    const Register from       = rax;  // source array address
1218    const Register to         = rdx;  // destination array address
1219    const Register count      = rcx;  // elements count
1220    const Register to_from    = rdx;  // (to - from)
1221
1222    __ enter(); // required for proper stackwalking of RuntimeStub frame
1223    __ movptr(from , Address(rsp, 8+0));       // from
1224    __ movptr(to   , Address(rsp, 8+4));       // to
1225    __ movl2ptr(count, Address(rsp, 8+8));     // count
1226
1227    *entry = __ pc(); // Entry point from conjoint arraycopy stub.
1228    BLOCK_COMMENT("Entry:");
1229
1230    __ subptr(to, from); // to --> to_from
1231    if (VM_Version::supports_mmx()) {
1232      if (UseXMMForArrayCopy) {
1233        xmm_copy_forward(from, to_from, count);
1234      } else {
1235        mmx_copy_forward(from, to_from, count);
1236      }
1237    } else {
1238      __ jmpb(L_copy_8_bytes);
1239      __ align(OptoLoopAlignment);
1240    __ BIND(L_copy_8_bytes_loop);
1241      __ fild_d(Address(from, 0));
1242      __ fistp_d(Address(from, to_from, Address::times_1));
1243      __ addptr(from, 8);
1244    __ BIND(L_copy_8_bytes);
1245      __ decrement(count);
1246      __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
1247    }
1248    inc_copy_counter_np(T_LONG);
1249    __ leave(); // required for proper stackwalking of RuntimeStub frame
1250    __ xorptr(rax, rax); // return 0
1251    __ ret(0);
1252    return start;
1253  }
1254
1255  address generate_conjoint_long_copy(address nooverlap_target,
1256                                      address* entry, const char *name) {
1257    __ align(CodeEntryAlignment);
1258    StubCodeMark mark(this, "StubRoutines", name);
1259    address start = __ pc();
1260
1261    Label L_copy_8_bytes, L_copy_8_bytes_loop;
1262    const Register from       = rax;  // source array address
1263    const Register to         = rdx;  // destination array address
1264    const Register count      = rcx;  // elements count
1265    const Register end_from   = rax;  // source array end address
1266
1267    __ enter(); // required for proper stackwalking of RuntimeStub frame
1268    __ movptr(from , Address(rsp, 8+0));       // from
1269    __ movptr(to   , Address(rsp, 8+4));       // to
1270    __ movl2ptr(count, Address(rsp, 8+8));     // count
1271
1272    *entry = __ pc(); // Entry point from generic arraycopy stub.
1273    BLOCK_COMMENT("Entry:");
1274
1275    // arrays overlap test
1276    __ cmpptr(to, from);
1277    RuntimeAddress nooverlap(nooverlap_target);
1278    __ jump_cc(Assembler::belowEqual, nooverlap);
1279    __ lea(end_from, Address(from, count, Address::times_8, 0));
1280    __ cmpptr(to, end_from);
1281    __ movptr(from, Address(rsp, 8));  // from
1282    __ jump_cc(Assembler::aboveEqual, nooverlap);
1283
1284    __ jmpb(L_copy_8_bytes);
1285
1286    __ align(OptoLoopAlignment);
1287  __ BIND(L_copy_8_bytes_loop);
1288    if (VM_Version::supports_mmx()) {
1289      if (UseXMMForArrayCopy) {
1290        __ movq(xmm0, Address(from, count, Address::times_8));
1291        __ movq(Address(to, count, Address::times_8), xmm0);
1292      } else {
1293        __ movq(mmx0, Address(from, count, Address::times_8));
1294        __ movq(Address(to, count, Address::times_8), mmx0);
1295      }
1296    } else {
1297      __ fild_d(Address(from, count, Address::times_8));
1298      __ fistp_d(Address(to, count, Address::times_8));
1299    }
1300  __ BIND(L_copy_8_bytes);
1301    __ decrement(count);
1302    __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
1303
1304    if (VM_Version::supports_mmx() && !UseXMMForArrayCopy) {
1305      __ emms();
1306    }
1307    inc_copy_counter_np(T_LONG);
1308    __ leave(); // required for proper stackwalking of RuntimeStub frame
1309    __ xorptr(rax, rax); // return 0
1310    __ ret(0);
1311    return start;
1312  }
1313
1314
1315  // Helper for generating a dynamic type check.
1316  // The sub_klass must be one of {rbx, rdx, rsi}.
1317  // The temp is killed.
1318  void generate_type_check(Register sub_klass,
1319                           Address& super_check_offset_addr,
1320                           Address& super_klass_addr,
1321                           Register temp,
1322                           Label* L_success, Label* L_failure) {
1323    BLOCK_COMMENT("type_check:");
1324
1325    Label L_fallthrough;
1326#define LOCAL_JCC(assembler_con, label_ptr)                             \
1327    if (label_ptr != NULL)  __ jcc(assembler_con, *(label_ptr));        \
1328    else                    __ jcc(assembler_con, L_fallthrough) /*omit semi*/
1329
1330    // The following is a strange variation of the fast path which requires
1331    // one less register, because needed values are on the argument stack.
1332    // __ check_klass_subtype_fast_path(sub_klass, *super_klass*, temp,
1333    //                                  L_success, L_failure, NULL);
1334    assert_different_registers(sub_klass, temp);
1335
1336    int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1337
1338    // if the pointers are equal, we are done (e.g., String[] elements)
1339    __ cmpptr(sub_klass, super_klass_addr);
1340    LOCAL_JCC(Assembler::equal, L_success);
1341
1342    // check the supertype display:
1343    __ movl2ptr(temp, super_check_offset_addr);
1344    Address super_check_addr(sub_klass, temp, Address::times_1, 0);
1345    __ movptr(temp, super_check_addr); // load displayed supertype
1346    __ cmpptr(temp, super_klass_addr); // test the super type
1347    LOCAL_JCC(Assembler::equal, L_success);
1348
1349    // if it was a primary super, we can just fail immediately
1350    __ cmpl(super_check_offset_addr, sc_offset);
1351    LOCAL_JCC(Assembler::notEqual, L_failure);
1352
1353    // The repne_scan instruction uses fixed registers, which will get spilled.
1354    // We happen to know this works best when super_klass is in rax.
1355    Register super_klass = temp;
1356    __ movptr(super_klass, super_klass_addr);
1357    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg,
1358                                     L_success, L_failure);
1359
1360    __ bind(L_fallthrough);
1361
1362    if (L_success == NULL) { BLOCK_COMMENT("L_success:"); }
1363    if (L_failure == NULL) { BLOCK_COMMENT("L_failure:"); }
1364
1365#undef LOCAL_JCC
1366  }
1367
1368  //
1369  //  Generate checkcasting array copy stub
1370  //
1371  //  Input:
1372  //    4(rsp)   - source array address
1373  //    8(rsp)   - destination array address
1374  //   12(rsp)   - element count, can be zero
1375  //   16(rsp)   - size_t ckoff (super_check_offset)
1376  //   20(rsp)   - oop ckval (super_klass)
1377  //
1378  //  Output:
1379  //    rax, ==  0  -  success
1380  //    rax, == -1^K - failure, where K is partial transfer count
1381  //
1382  address generate_checkcast_copy(const char *name, address* entry, bool dest_uninitialized = false) {
1383    __ align(CodeEntryAlignment);
1384    StubCodeMark mark(this, "StubRoutines", name);
1385    address start = __ pc();
1386
1387    Label L_load_element, L_store_element, L_do_card_marks, L_done;
1388
1389    // register use:
1390    //  rax, rdx, rcx -- loop control (end_from, end_to, count)
1391    //  rdi, rsi      -- element access (oop, klass)
1392    //  rbx,           -- temp
1393    const Register from       = rax;    // source array address
1394    const Register to         = rdx;    // destination array address
1395    const Register length     = rcx;    // elements count
1396    const Register elem       = rdi;    // each oop copied
1397    const Register elem_klass = rsi;    // each elem._klass (sub_klass)
1398    const Register temp       = rbx;    // lone remaining temp
1399
1400    __ enter(); // required for proper stackwalking of RuntimeStub frame
1401
1402    __ push(rsi);
1403    __ push(rdi);
1404    __ push(rbx);
1405
1406    Address   from_arg(rsp, 16+ 4);     // from
1407    Address     to_arg(rsp, 16+ 8);     // to
1408    Address length_arg(rsp, 16+12);     // elements count
1409    Address  ckoff_arg(rsp, 16+16);     // super_check_offset
1410    Address  ckval_arg(rsp, 16+20);     // super_klass
1411
1412    // Load up:
1413    __ movptr(from,     from_arg);
1414    __ movptr(to,         to_arg);
1415    __ movl2ptr(length, length_arg);
1416
1417    if (entry != NULL) {
1418      *entry = __ pc(); // Entry point from generic arraycopy stub.
1419      BLOCK_COMMENT("Entry:");
1420    }
1421
1422    //---------------------------------------------------------------
1423    // Assembler stub will be used for this call to arraycopy
1424    // if the two arrays are subtypes of Object[] but the
1425    // destination array type is not equal to or a supertype
1426    // of the source type.  Each element must be separately
1427    // checked.
1428
1429    // Loop-invariant addresses.  They are exclusive end pointers.
1430    Address end_from_addr(from, length, Address::times_ptr, 0);
1431    Address   end_to_addr(to,   length, Address::times_ptr, 0);
1432
1433    Register end_from = from;           // re-use
1434    Register end_to   = to;             // re-use
1435    Register count    = length;         // re-use
1436
1437    // Loop-variant addresses.  They assume post-incremented count < 0.
1438    Address from_element_addr(end_from, count, Address::times_ptr, 0);
1439    Address   to_element_addr(end_to,   count, Address::times_ptr, 0);
1440    Address elem_klass_addr(elem, oopDesc::klass_offset_in_bytes());
1441
1442    // Copy from low to high addresses, indexed from the end of each array.
1443    gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
1444    __ lea(end_from, end_from_addr);
1445    __ lea(end_to,   end_to_addr);
1446    assert(length == count, "");        // else fix next line:
1447    __ negptr(count);                   // negate and test the length
1448    __ jccb(Assembler::notZero, L_load_element);
1449
1450    // Empty array:  Nothing to do.
1451    __ xorptr(rax, rax);                  // return 0 on (trivial) success
1452    __ jmp(L_done);
1453
1454    // ======== begin loop ========
1455    // (Loop is rotated; its entry is L_load_element.)
1456    // Loop control:
1457    //   for (count = -count; count != 0; count++)
1458    // Base pointers src, dst are biased by 8*count,to last element.
1459    __ align(OptoLoopAlignment);
1460
1461    __ BIND(L_store_element);
1462    __ movptr(to_element_addr, elem);     // store the oop
1463    __ increment(count);                // increment the count toward zero
1464    __ jccb(Assembler::zero, L_do_card_marks);
1465
1466    // ======== loop entry is here ========
1467    __ BIND(L_load_element);
1468    __ movptr(elem, from_element_addr);   // load the oop
1469    __ testptr(elem, elem);
1470    __ jccb(Assembler::zero, L_store_element);
1471
1472    // (Could do a trick here:  Remember last successful non-null
1473    // element stored and make a quick oop equality check on it.)
1474
1475    __ movptr(elem_klass, elem_klass_addr); // query the object klass
1476    generate_type_check(elem_klass, ckoff_arg, ckval_arg, temp,
1477                        &L_store_element, NULL);
1478    // (On fall-through, we have failed the element type check.)
1479    // ======== end loop ========
1480
1481    // It was a real error; we must depend on the caller to finish the job.
1482    // Register "count" = -1 * number of *remaining* oops, length_arg = *total* oops.
1483    // Emit GC store barriers for the oops we have copied (length_arg + count),
1484    // and report their number to the caller.
1485    assert_different_registers(to, count, rax);
1486    Label L_post_barrier;
1487    __ addl(count, length_arg);         // transfers = (length - remaining)
1488    __ movl2ptr(rax, count);            // save the value
1489    __ notptr(rax);                     // report (-1^K) to caller (does not affect flags)
1490    __ jccb(Assembler::notZero, L_post_barrier);
1491    __ jmp(L_done); // K == 0, nothing was copied, skip post barrier
1492
1493    // Come here on success only.
1494    __ BIND(L_do_card_marks);
1495    __ xorptr(rax, rax);                // return 0 on success
1496    __ movl2ptr(count, length_arg);
1497
1498    __ BIND(L_post_barrier);
1499    __ movptr(to, to_arg);              // reload
1500    gen_write_ref_array_post_barrier(to, count);
1501
1502    // Common exit point (success or failure).
1503    __ BIND(L_done);
1504    __ pop(rbx);
1505    __ pop(rdi);
1506    __ pop(rsi);
1507    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1508    __ leave(); // required for proper stackwalking of RuntimeStub frame
1509    __ ret(0);
1510
1511    return start;
1512  }
1513
1514  //
1515  //  Generate 'unsafe' array copy stub
1516  //  Though just as safe as the other stubs, it takes an unscaled
1517  //  size_t argument instead of an element count.
1518  //
1519  //  Input:
1520  //    4(rsp)   - source array address
1521  //    8(rsp)   - destination array address
1522  //   12(rsp)   - byte count, can be zero
1523  //
1524  //  Output:
1525  //    rax, ==  0  -  success
1526  //    rax, == -1  -  need to call System.arraycopy
1527  //
1528  // Examines the alignment of the operands and dispatches
1529  // to a long, int, short, or byte copy loop.
1530  //
1531  address generate_unsafe_copy(const char *name,
1532                               address byte_copy_entry,
1533                               address short_copy_entry,
1534                               address int_copy_entry,
1535                               address long_copy_entry) {
1536
1537    Label L_long_aligned, L_int_aligned, L_short_aligned;
1538
1539    __ align(CodeEntryAlignment);
1540    StubCodeMark mark(this, "StubRoutines", name);
1541    address start = __ pc();
1542
1543    const Register from       = rax;  // source array address
1544    const Register to         = rdx;  // destination array address
1545    const Register count      = rcx;  // elements count
1546
1547    __ enter(); // required for proper stackwalking of RuntimeStub frame
1548    __ push(rsi);
1549    __ push(rdi);
1550    Address  from_arg(rsp, 12+ 4);      // from
1551    Address    to_arg(rsp, 12+ 8);      // to
1552    Address count_arg(rsp, 12+12);      // byte count
1553
1554    // Load up:
1555    __ movptr(from ,  from_arg);
1556    __ movptr(to   ,    to_arg);
1557    __ movl2ptr(count, count_arg);
1558
1559    // bump this on entry, not on exit:
1560    inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1561
1562    const Register bits = rsi;
1563    __ mov(bits, from);
1564    __ orptr(bits, to);
1565    __ orptr(bits, count);
1566
1567    __ testl(bits, BytesPerLong-1);
1568    __ jccb(Assembler::zero, L_long_aligned);
1569
1570    __ testl(bits, BytesPerInt-1);
1571    __ jccb(Assembler::zero, L_int_aligned);
1572
1573    __ testl(bits, BytesPerShort-1);
1574    __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
1575
1576    __ BIND(L_short_aligned);
1577    __ shrptr(count, LogBytesPerShort); // size => short_count
1578    __ movl(count_arg, count);          // update 'count'
1579    __ jump(RuntimeAddress(short_copy_entry));
1580
1581    __ BIND(L_int_aligned);
1582    __ shrptr(count, LogBytesPerInt); // size => int_count
1583    __ movl(count_arg, count);          // update 'count'
1584    __ jump(RuntimeAddress(int_copy_entry));
1585
1586    __ BIND(L_long_aligned);
1587    __ shrptr(count, LogBytesPerLong); // size => qword_count
1588    __ movl(count_arg, count);          // update 'count'
1589    __ pop(rdi); // Do pops here since jlong_arraycopy stub does not do it.
1590    __ pop(rsi);
1591    __ jump(RuntimeAddress(long_copy_entry));
1592
1593    return start;
1594  }
1595
1596
1597  // Perform range checks on the proposed arraycopy.
1598  // Smashes src_pos and dst_pos.  (Uses them up for temps.)
1599  void arraycopy_range_checks(Register src,
1600                              Register src_pos,
1601                              Register dst,
1602                              Register dst_pos,
1603                              Address& length,
1604                              Label& L_failed) {
1605    BLOCK_COMMENT("arraycopy_range_checks:");
1606    const Register src_end = src_pos;   // source array end position
1607    const Register dst_end = dst_pos;   // destination array end position
1608    __ addl(src_end, length); // src_pos + length
1609    __ addl(dst_end, length); // dst_pos + length
1610
1611    //  if (src_pos + length > arrayOop(src)->length() ) FAIL;
1612    __ cmpl(src_end, Address(src, arrayOopDesc::length_offset_in_bytes()));
1613    __ jcc(Assembler::above, L_failed);
1614
1615    //  if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
1616    __ cmpl(dst_end, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1617    __ jcc(Assembler::above, L_failed);
1618
1619    BLOCK_COMMENT("arraycopy_range_checks done");
1620  }
1621
1622
1623  //
1624  //  Generate generic array copy stubs
1625  //
1626  //  Input:
1627  //     4(rsp)    -  src oop
1628  //     8(rsp)    -  src_pos
1629  //    12(rsp)    -  dst oop
1630  //    16(rsp)    -  dst_pos
1631  //    20(rsp)    -  element count
1632  //
1633  //  Output:
1634  //    rax, ==  0  -  success
1635  //    rax, == -1^K - failure, where K is partial transfer count
1636  //
1637  address generate_generic_copy(const char *name,
1638                                address entry_jbyte_arraycopy,
1639                                address entry_jshort_arraycopy,
1640                                address entry_jint_arraycopy,
1641                                address entry_oop_arraycopy,
1642                                address entry_jlong_arraycopy,
1643                                address entry_checkcast_arraycopy) {
1644    Label L_failed, L_failed_0, L_objArray;
1645
1646    { int modulus = CodeEntryAlignment;
1647      int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
1648      int advance = target - (__ offset() % modulus);
1649      if (advance < 0)  advance += modulus;
1650      if (advance > 0)  __ nop(advance);
1651    }
1652    StubCodeMark mark(this, "StubRoutines", name);
1653
1654    // Short-hop target to L_failed.  Makes for denser prologue code.
1655    __ BIND(L_failed_0);
1656    __ jmp(L_failed);
1657    assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");
1658
1659    __ align(CodeEntryAlignment);
1660    address start = __ pc();
1661
1662    __ enter(); // required for proper stackwalking of RuntimeStub frame
1663    __ push(rsi);
1664    __ push(rdi);
1665
1666    // bump this on entry, not on exit:
1667    inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1668
1669    // Input values
1670    Address SRC     (rsp, 12+ 4);
1671    Address SRC_POS (rsp, 12+ 8);
1672    Address DST     (rsp, 12+12);
1673    Address DST_POS (rsp, 12+16);
1674    Address LENGTH  (rsp, 12+20);
1675
1676    //-----------------------------------------------------------------------
1677    // Assembler stub will be used for this call to arraycopy
1678    // if the following conditions are met:
1679    //
1680    // (1) src and dst must not be null.
1681    // (2) src_pos must not be negative.
1682    // (3) dst_pos must not be negative.
1683    // (4) length  must not be negative.
1684    // (5) src klass and dst klass should be the same and not NULL.
1685    // (6) src and dst should be arrays.
1686    // (7) src_pos + length must not exceed length of src.
1687    // (8) dst_pos + length must not exceed length of dst.
1688    //
1689
1690    const Register src     = rax;       // source array oop
1691    const Register src_pos = rsi;
1692    const Register dst     = rdx;       // destination array oop
1693    const Register dst_pos = rdi;
1694    const Register length  = rcx;       // transfer count
1695
1696    //  if (src == NULL) return -1;
1697    __ movptr(src, SRC);      // src oop
1698    __ testptr(src, src);
1699    __ jccb(Assembler::zero, L_failed_0);
1700
1701    //  if (src_pos < 0) return -1;
1702    __ movl2ptr(src_pos, SRC_POS);  // src_pos
1703    __ testl(src_pos, src_pos);
1704    __ jccb(Assembler::negative, L_failed_0);
1705
1706    //  if (dst == NULL) return -1;
1707    __ movptr(dst, DST);      // dst oop
1708    __ testptr(dst, dst);
1709    __ jccb(Assembler::zero, L_failed_0);
1710
1711    //  if (dst_pos < 0) return -1;
1712    __ movl2ptr(dst_pos, DST_POS);  // dst_pos
1713    __ testl(dst_pos, dst_pos);
1714    __ jccb(Assembler::negative, L_failed_0);
1715
1716    //  if (length < 0) return -1;
1717    __ movl2ptr(length, LENGTH);   // length
1718    __ testl(length, length);
1719    __ jccb(Assembler::negative, L_failed_0);
1720
1721    //  if (src->klass() == NULL) return -1;
1722    Address src_klass_addr(src, oopDesc::klass_offset_in_bytes());
1723    Address dst_klass_addr(dst, oopDesc::klass_offset_in_bytes());
1724    const Register rcx_src_klass = rcx;    // array klass
1725    __ movptr(rcx_src_klass, Address(src, oopDesc::klass_offset_in_bytes()));
1726
1727#ifdef ASSERT
1728    //  assert(src->klass() != NULL);
1729    BLOCK_COMMENT("assert klasses not null");
1730    { Label L1, L2;
1731      __ testptr(rcx_src_klass, rcx_src_klass);
1732      __ jccb(Assembler::notZero, L2);   // it is broken if klass is NULL
1733      __ bind(L1);
1734      __ stop("broken null klass");
1735      __ bind(L2);
1736      __ cmpptr(dst_klass_addr, (int32_t)NULL_WORD);
1737      __ jccb(Assembler::equal, L1);      // this would be broken also
1738      BLOCK_COMMENT("assert done");
1739    }
1740#endif //ASSERT
1741
1742    // Load layout helper (32-bits)
1743    //
1744    //  |array_tag|     | header_size | element_type |     |log2_element_size|
1745    // 32        30    24            16              8     2                 0
1746    //
1747    //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
1748    //
1749
1750    int lh_offset = in_bytes(Klass::layout_helper_offset());
1751    Address src_klass_lh_addr(rcx_src_klass, lh_offset);
1752
1753    // Handle objArrays completely differently...
1754    jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
1755    __ cmpl(src_klass_lh_addr, objArray_lh);
1756    __ jcc(Assembler::equal, L_objArray);
1757
1758    //  if (src->klass() != dst->klass()) return -1;
1759    __ cmpptr(rcx_src_klass, dst_klass_addr);
1760    __ jccb(Assembler::notEqual, L_failed_0);
1761
1762    const Register rcx_lh = rcx;  // layout helper
1763    assert(rcx_lh == rcx_src_klass, "known alias");
1764    __ movl(rcx_lh, src_klass_lh_addr);
1765
1766    //  if (!src->is_Array()) return -1;
1767    __ cmpl(rcx_lh, Klass::_lh_neutral_value);
1768    __ jcc(Assembler::greaterEqual, L_failed_0); // signed cmp
1769
1770    // At this point, it is known to be a typeArray (array_tag 0x3).
1771#ifdef ASSERT
1772    { Label L;
1773      __ cmpl(rcx_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
1774      __ jcc(Assembler::greaterEqual, L); // signed cmp
1775      __ stop("must be a primitive array");
1776      __ bind(L);
1777    }
1778#endif
1779
1780    assert_different_registers(src, src_pos, dst, dst_pos, rcx_lh);
1781    arraycopy_range_checks(src, src_pos, dst, dst_pos, LENGTH, L_failed);
1782
1783    // TypeArrayKlass
1784    //
1785    // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
1786    // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
1787    //
1788    const Register rsi_offset = rsi; // array offset
1789    const Register src_array  = src; // src array offset
1790    const Register dst_array  = dst; // dst array offset
1791    const Register rdi_elsize = rdi; // log2 element size
1792
1793    __ mov(rsi_offset, rcx_lh);
1794    __ shrptr(rsi_offset, Klass::_lh_header_size_shift);
1795    __ andptr(rsi_offset, Klass::_lh_header_size_mask);   // array_offset
1796    __ addptr(src_array, rsi_offset);  // src array offset
1797    __ addptr(dst_array, rsi_offset);  // dst array offset
1798    __ andptr(rcx_lh, Klass::_lh_log2_element_size_mask); // log2 elsize
1799
1800    // next registers should be set before the jump to corresponding stub
1801    const Register from       = src; // source array address
1802    const Register to         = dst; // destination array address
1803    const Register count      = rcx; // elements count
1804    // some of them should be duplicated on stack
1805#define FROM   Address(rsp, 12+ 4)
1806#define TO     Address(rsp, 12+ 8)   // Not used now
1807#define COUNT  Address(rsp, 12+12)   // Only for oop arraycopy
1808
1809    BLOCK_COMMENT("scale indexes to element size");
1810    __ movl2ptr(rsi, SRC_POS);  // src_pos
1811    __ shlptr(rsi);             // src_pos << rcx (log2 elsize)
1812    assert(src_array == from, "");
1813    __ addptr(from, rsi);       // from = src_array + SRC_POS << log2 elsize
1814    __ movl2ptr(rdi, DST_POS);  // dst_pos
1815    __ shlptr(rdi);             // dst_pos << rcx (log2 elsize)
1816    assert(dst_array == to, "");
1817    __ addptr(to,  rdi);        // to   = dst_array + DST_POS << log2 elsize
1818    __ movptr(FROM, from);      // src_addr
1819    __ mov(rdi_elsize, rcx_lh); // log2 elsize
1820    __ movl2ptr(count, LENGTH); // elements count
1821
1822    BLOCK_COMMENT("choose copy loop based on element size");
1823    __ cmpl(rdi_elsize, 0);
1824
1825    __ jump_cc(Assembler::equal, RuntimeAddress(entry_jbyte_arraycopy));
1826    __ cmpl(rdi_elsize, LogBytesPerShort);
1827    __ jump_cc(Assembler::equal, RuntimeAddress(entry_jshort_arraycopy));
1828    __ cmpl(rdi_elsize, LogBytesPerInt);
1829    __ jump_cc(Assembler::equal, RuntimeAddress(entry_jint_arraycopy));
1830#ifdef ASSERT
1831    __ cmpl(rdi_elsize, LogBytesPerLong);
1832    __ jccb(Assembler::notEqual, L_failed);
1833#endif
1834    __ pop(rdi); // Do pops here since jlong_arraycopy stub does not do it.
1835    __ pop(rsi);
1836    __ jump(RuntimeAddress(entry_jlong_arraycopy));
1837
1838  __ BIND(L_failed);
1839    __ xorptr(rax, rax);
1840    __ notptr(rax); // return -1
1841    __ pop(rdi);
1842    __ pop(rsi);
1843    __ leave(); // required for proper stackwalking of RuntimeStub frame
1844    __ ret(0);
1845
1846    // ObjArrayKlass
1847  __ BIND(L_objArray);
1848    // live at this point:  rcx_src_klass, src[_pos], dst[_pos]
1849
1850    Label L_plain_copy, L_checkcast_copy;
1851    //  test array classes for subtyping
1852    __ cmpptr(rcx_src_klass, dst_klass_addr); // usual case is exact equality
1853    __ jccb(Assembler::notEqual, L_checkcast_copy);
1854
1855    // Identically typed arrays can be copied without element-wise checks.
1856    assert_different_registers(src, src_pos, dst, dst_pos, rcx_src_klass);
1857    arraycopy_range_checks(src, src_pos, dst, dst_pos, LENGTH, L_failed);
1858
1859  __ BIND(L_plain_copy);
1860    __ movl2ptr(count, LENGTH); // elements count
1861    __ movl2ptr(src_pos, SRC_POS);  // reload src_pos
1862    __ lea(from, Address(src, src_pos, Address::times_ptr,
1863                 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
1864    __ movl2ptr(dst_pos, DST_POS);  // reload dst_pos
1865    __ lea(to,   Address(dst, dst_pos, Address::times_ptr,
1866                 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
1867    __ movptr(FROM,  from);   // src_addr
1868    __ movptr(TO,    to);     // dst_addr
1869    __ movl(COUNT, count);  // count
1870    __ jump(RuntimeAddress(entry_oop_arraycopy));
1871
1872  __ BIND(L_checkcast_copy);
1873    // live at this point:  rcx_src_klass, dst[_pos], src[_pos]
1874    {
1875      // Handy offsets:
1876      int  ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
1877      int sco_offset = in_bytes(Klass::super_check_offset_offset());
1878
1879      Register rsi_dst_klass = rsi;
1880      Register rdi_temp      = rdi;
1881      assert(rsi_dst_klass == src_pos, "expected alias w/ src_pos");
1882      assert(rdi_temp      == dst_pos, "expected alias w/ dst_pos");
1883      Address dst_klass_lh_addr(rsi_dst_klass, lh_offset);
1884
1885      // Before looking at dst.length, make sure dst is also an objArray.
1886      __ movptr(rsi_dst_klass, dst_klass_addr);
1887      __ cmpl(dst_klass_lh_addr, objArray_lh);
1888      __ jccb(Assembler::notEqual, L_failed);
1889
1890      // It is safe to examine both src.length and dst.length.
1891      __ movl2ptr(src_pos, SRC_POS);        // reload rsi
1892      arraycopy_range_checks(src, src_pos, dst, dst_pos, LENGTH, L_failed);
1893      // (Now src_pos and dst_pos are killed, but not src and dst.)
1894
1895      // We'll need this temp (don't forget to pop it after the type check).
1896      __ push(rbx);
1897      Register rbx_src_klass = rbx;
1898
1899      __ mov(rbx_src_klass, rcx_src_klass); // spill away from rcx
1900      __ movptr(rsi_dst_klass, dst_klass_addr);
1901      Address super_check_offset_addr(rsi_dst_klass, sco_offset);
1902      Label L_fail_array_check;
1903      generate_type_check(rbx_src_klass,
1904                          super_check_offset_addr, dst_klass_addr,
1905                          rdi_temp, NULL, &L_fail_array_check);
1906      // (On fall-through, we have passed the array type check.)
1907      __ pop(rbx);
1908      __ jmp(L_plain_copy);
1909
1910      __ BIND(L_fail_array_check);
1911      // Reshuffle arguments so we can call checkcast_arraycopy:
1912
1913      // match initial saves for checkcast_arraycopy
1914      // push(rsi);    // already done; see above
1915      // push(rdi);    // already done; see above
1916      // push(rbx);    // already done; see above
1917
1918      // Marshal outgoing arguments now, freeing registers.
1919      Address   from_arg(rsp, 16+ 4);   // from
1920      Address     to_arg(rsp, 16+ 8);   // to
1921      Address length_arg(rsp, 16+12);   // elements count
1922      Address  ckoff_arg(rsp, 16+16);   // super_check_offset
1923      Address  ckval_arg(rsp, 16+20);   // super_klass
1924
1925      Address SRC_POS_arg(rsp, 16+ 8);
1926      Address DST_POS_arg(rsp, 16+16);
1927      Address  LENGTH_arg(rsp, 16+20);
1928      // push rbx, changed the incoming offsets (why not just use rbp,??)
1929      // assert(SRC_POS_arg.disp() == SRC_POS.disp() + 4, "");
1930
1931      __ movptr(rbx, Address(rsi_dst_klass, ek_offset));
1932      __ movl2ptr(length, LENGTH_arg);    // reload elements count
1933      __ movl2ptr(src_pos, SRC_POS_arg);  // reload src_pos
1934      __ movl2ptr(dst_pos, DST_POS_arg);  // reload dst_pos
1935
1936      __ movptr(ckval_arg, rbx);          // destination element type
1937      __ movl(rbx, Address(rbx, sco_offset));
1938      __ movl(ckoff_arg, rbx);          // corresponding class check offset
1939
1940      __ movl(length_arg, length);      // outgoing length argument
1941
1942      __ lea(from, Address(src, src_pos, Address::times_ptr,
1943                            arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
1944      __ movptr(from_arg, from);
1945
1946      __ lea(to, Address(dst, dst_pos, Address::times_ptr,
1947                          arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
1948      __ movptr(to_arg, to);
1949      __ jump(RuntimeAddress(entry_checkcast_arraycopy));
1950    }
1951
1952    return start;
1953  }
1954
1955  void generate_arraycopy_stubs() {
1956    address entry;
1957    address entry_jbyte_arraycopy;
1958    address entry_jshort_arraycopy;
1959    address entry_jint_arraycopy;
1960    address entry_oop_arraycopy;
1961    address entry_jlong_arraycopy;
1962    address entry_checkcast_arraycopy;
1963
1964    StubRoutines::_arrayof_jbyte_disjoint_arraycopy =
1965        generate_disjoint_copy(T_BYTE,  true, Address::times_1, &entry,
1966                               "arrayof_jbyte_disjoint_arraycopy");
1967    StubRoutines::_arrayof_jbyte_arraycopy =
1968        generate_conjoint_copy(T_BYTE,  true, Address::times_1,  entry,
1969                               NULL, "arrayof_jbyte_arraycopy");
1970    StubRoutines::_jbyte_disjoint_arraycopy =
1971        generate_disjoint_copy(T_BYTE, false, Address::times_1, &entry,
1972                               "jbyte_disjoint_arraycopy");
1973    StubRoutines::_jbyte_arraycopy =
1974        generate_conjoint_copy(T_BYTE, false, Address::times_1,  entry,
1975                               &entry_jbyte_arraycopy, "jbyte_arraycopy");
1976
1977    StubRoutines::_arrayof_jshort_disjoint_arraycopy =
1978        generate_disjoint_copy(T_SHORT,  true, Address::times_2, &entry,
1979                               "arrayof_jshort_disjoint_arraycopy");
1980    StubRoutines::_arrayof_jshort_arraycopy =
1981        generate_conjoint_copy(T_SHORT,  true, Address::times_2,  entry,
1982                               NULL, "arrayof_jshort_arraycopy");
1983    StubRoutines::_jshort_disjoint_arraycopy =
1984        generate_disjoint_copy(T_SHORT, false, Address::times_2, &entry,
1985                               "jshort_disjoint_arraycopy");
1986    StubRoutines::_jshort_arraycopy =
1987        generate_conjoint_copy(T_SHORT, false, Address::times_2,  entry,
1988                               &entry_jshort_arraycopy, "jshort_arraycopy");
1989
1990    // Next arrays are always aligned on 4 bytes at least.
1991    StubRoutines::_jint_disjoint_arraycopy =
1992        generate_disjoint_copy(T_INT, true, Address::times_4, &entry,
1993                               "jint_disjoint_arraycopy");
1994    StubRoutines::_jint_arraycopy =
1995        generate_conjoint_copy(T_INT, true, Address::times_4,  entry,
1996                               &entry_jint_arraycopy, "jint_arraycopy");
1997
1998    StubRoutines::_oop_disjoint_arraycopy =
1999        generate_disjoint_copy(T_OBJECT, true, Address::times_ptr, &entry,
2000                               "oop_disjoint_arraycopy");
2001    StubRoutines::_oop_arraycopy =
2002        generate_conjoint_copy(T_OBJECT, true, Address::times_ptr,  entry,
2003                               &entry_oop_arraycopy, "oop_arraycopy");
2004
2005    StubRoutines::_oop_disjoint_arraycopy_uninit =
2006        generate_disjoint_copy(T_OBJECT, true, Address::times_ptr, &entry,
2007                               "oop_disjoint_arraycopy_uninit",
2008                               /*dest_uninitialized*/true);
2009    StubRoutines::_oop_arraycopy_uninit =
2010        generate_conjoint_copy(T_OBJECT, true, Address::times_ptr,  entry,
2011                               NULL, "oop_arraycopy_uninit",
2012                               /*dest_uninitialized*/true);
2013
2014    StubRoutines::_jlong_disjoint_arraycopy =
2015        generate_disjoint_long_copy(&entry, "jlong_disjoint_arraycopy");
2016    StubRoutines::_jlong_arraycopy =
2017        generate_conjoint_long_copy(entry, &entry_jlong_arraycopy,
2018                                    "jlong_arraycopy");
2019
2020    StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2021    StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2022    StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2023    StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2024    StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2025    StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2026
2027    StubRoutines::_arrayof_jint_disjoint_arraycopy       = StubRoutines::_jint_disjoint_arraycopy;
2028    StubRoutines::_arrayof_oop_disjoint_arraycopy        = StubRoutines::_oop_disjoint_arraycopy;
2029    StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = StubRoutines::_oop_disjoint_arraycopy_uninit;
2030    StubRoutines::_arrayof_jlong_disjoint_arraycopy      = StubRoutines::_jlong_disjoint_arraycopy;
2031
2032    StubRoutines::_arrayof_jint_arraycopy       = StubRoutines::_jint_arraycopy;
2033    StubRoutines::_arrayof_oop_arraycopy        = StubRoutines::_oop_arraycopy;
2034    StubRoutines::_arrayof_oop_arraycopy_uninit = StubRoutines::_oop_arraycopy_uninit;
2035    StubRoutines::_arrayof_jlong_arraycopy      = StubRoutines::_jlong_arraycopy;
2036
2037    StubRoutines::_checkcast_arraycopy =
2038        generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2039    StubRoutines::_checkcast_arraycopy_uninit =
2040        generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, /*dest_uninitialized*/true);
2041
2042    StubRoutines::_unsafe_arraycopy =
2043        generate_unsafe_copy("unsafe_arraycopy",
2044                               entry_jbyte_arraycopy,
2045                               entry_jshort_arraycopy,
2046                               entry_jint_arraycopy,
2047                               entry_jlong_arraycopy);
2048
2049    StubRoutines::_generic_arraycopy =
2050        generate_generic_copy("generic_arraycopy",
2051                               entry_jbyte_arraycopy,
2052                               entry_jshort_arraycopy,
2053                               entry_jint_arraycopy,
2054                               entry_oop_arraycopy,
2055                               entry_jlong_arraycopy,
2056                               entry_checkcast_arraycopy);
2057  }
2058
2059  // AES intrinsic stubs
2060  enum {AESBlockSize = 16};
2061
2062  address generate_key_shuffle_mask() {
2063    __ align(16);
2064    StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
2065    address start = __ pc();
2066    __ emit_data(0x00010203, relocInfo::none, 0 );
2067    __ emit_data(0x04050607, relocInfo::none, 0 );
2068    __ emit_data(0x08090a0b, relocInfo::none, 0 );
2069    __ emit_data(0x0c0d0e0f, relocInfo::none, 0 );
2070    return start;
2071  }
2072
2073  address generate_counter_shuffle_mask() {
2074    __ align(16);
2075    StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask");
2076    address start = __ pc();
2077    __ emit_data(0x0c0d0e0f, relocInfo::none, 0);
2078    __ emit_data(0x08090a0b, relocInfo::none, 0);
2079    __ emit_data(0x04050607, relocInfo::none, 0);
2080    __ emit_data(0x00010203, relocInfo::none, 0);
2081    return start;
2082  }
2083
2084  // Utility routine for loading a 128-bit key word in little endian format
2085  // can optionally specify that the shuffle mask is already in an xmmregister
2086  void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
2087    __ movdqu(xmmdst, Address(key, offset));
2088    if (xmm_shuf_mask != NULL) {
2089      __ pshufb(xmmdst, xmm_shuf_mask);
2090    } else {
2091      __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2092    }
2093  }
2094
2095  // aesenc using specified key+offset
2096  // can optionally specify that the shuffle mask is already in an xmmregister
2097  void aes_enc_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
2098    load_key(xmmtmp, key, offset, xmm_shuf_mask);
2099    __ aesenc(xmmdst, xmmtmp);
2100  }
2101
2102  // aesdec using specified key+offset
2103  // can optionally specify that the shuffle mask is already in an xmmregister
2104  void aes_dec_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
2105    load_key(xmmtmp, key, offset, xmm_shuf_mask);
2106    __ aesdec(xmmdst, xmmtmp);
2107  }
2108
2109  // Utility routine for increase 128bit counter (iv in CTR mode)
2110  //  XMM_128bit,  D3, D2, D1, D0
2111  void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) {
2112    __ pextrd(reg, xmmdst, 0x0);
2113    __ addl(reg, inc_delta);
2114    __ pinsrd(xmmdst, reg, 0x0);
2115    __ jcc(Assembler::carryClear, next_block); // jump if no carry
2116
2117    __ pextrd(reg, xmmdst, 0x01); // Carry-> D1
2118    __ addl(reg, 0x01);
2119    __ pinsrd(xmmdst, reg, 0x01);
2120    __ jcc(Assembler::carryClear, next_block); // jump if no carry
2121
2122    __ pextrd(reg, xmmdst, 0x02); // Carry-> D2
2123    __ addl(reg, 0x01);
2124    __ pinsrd(xmmdst, reg, 0x02);
2125    __ jcc(Assembler::carryClear, next_block); // jump if no carry
2126
2127    __ pextrd(reg, xmmdst, 0x03); // Carry -> D3
2128    __ addl(reg, 0x01);
2129    __ pinsrd(xmmdst, reg, 0x03);
2130
2131    __ BIND(next_block);          // next instruction
2132  }
2133
2134
2135  // Arguments:
2136  //
2137  // Inputs:
2138  //   c_rarg0   - source byte array address
2139  //   c_rarg1   - destination byte array address
2140  //   c_rarg2   - K (key) in little endian int array
2141  //
2142  address generate_aescrypt_encryptBlock() {
2143    assert(UseAES, "need AES instructions and misaligned SSE support");
2144    __ align(CodeEntryAlignment);
2145    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2146    Label L_doLast;
2147    address start = __ pc();
2148
2149    const Register from        = rdx;      // source array address
2150    const Register to          = rdx;      // destination array address
2151    const Register key         = rcx;      // key array address
2152    const Register keylen      = rax;
2153    const Address  from_param(rbp, 8+0);
2154    const Address  to_param  (rbp, 8+4);
2155    const Address  key_param (rbp, 8+8);
2156
2157    const XMMRegister xmm_result = xmm0;
2158    const XMMRegister xmm_key_shuf_mask = xmm1;
2159    const XMMRegister xmm_temp1  = xmm2;
2160    const XMMRegister xmm_temp2  = xmm3;
2161    const XMMRegister xmm_temp3  = xmm4;
2162    const XMMRegister xmm_temp4  = xmm5;
2163
2164    __ enter();   // required for proper stackwalking of RuntimeStub frame
2165
2166    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
2167    // context for the registers used, where all instructions below are using 128-bit mode
2168    // On EVEX without VL and BW, these instructions will all be AVX.
2169    if (VM_Version::supports_avx512vlbw()) {
2170      __ movl(rdx, 0xffff);
2171      __ kmovdl(k1, rdx);
2172    }
2173
2174    __ movptr(from, from_param);
2175    __ movptr(key, key_param);
2176
2177    // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
2178    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2179
2180    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2181    __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input
2182    __ movptr(to, to_param);
2183
2184    // For encryption, the java expanded key ordering is just what we need
2185
2186    load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
2187    __ pxor(xmm_result, xmm_temp1);
2188
2189    load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
2190    load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
2191    load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
2192    load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
2193
2194    __ aesenc(xmm_result, xmm_temp1);
2195    __ aesenc(xmm_result, xmm_temp2);
2196    __ aesenc(xmm_result, xmm_temp3);
2197    __ aesenc(xmm_result, xmm_temp4);
2198
2199    load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
2200    load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
2201    load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
2202    load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
2203
2204    __ aesenc(xmm_result, xmm_temp1);
2205    __ aesenc(xmm_result, xmm_temp2);
2206    __ aesenc(xmm_result, xmm_temp3);
2207    __ aesenc(xmm_result, xmm_temp4);
2208
2209    load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
2210    load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
2211
2212    __ cmpl(keylen, 44);
2213    __ jccb(Assembler::equal, L_doLast);
2214
2215    __ aesenc(xmm_result, xmm_temp1);
2216    __ aesenc(xmm_result, xmm_temp2);
2217
2218    load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
2219    load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
2220
2221    __ cmpl(keylen, 52);
2222    __ jccb(Assembler::equal, L_doLast);
2223
2224    __ aesenc(xmm_result, xmm_temp1);
2225    __ aesenc(xmm_result, xmm_temp2);
2226
2227    load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
2228    load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
2229
2230    __ BIND(L_doLast);
2231    __ aesenc(xmm_result, xmm_temp1);
2232    __ aesenclast(xmm_result, xmm_temp2);
2233    __ movdqu(Address(to, 0), xmm_result);        // store the result
2234    __ xorptr(rax, rax); // return 0
2235    __ leave(); // required for proper stackwalking of RuntimeStub frame
2236    __ ret(0);
2237
2238    return start;
2239  }
2240
2241
2242  // Arguments:
2243  //
2244  // Inputs:
2245  //   c_rarg0   - source byte array address
2246  //   c_rarg1   - destination byte array address
2247  //   c_rarg2   - K (key) in little endian int array
2248  //
2249  address generate_aescrypt_decryptBlock() {
2250    assert(UseAES, "need AES instructions and misaligned SSE support");
2251    __ align(CodeEntryAlignment);
2252    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2253    Label L_doLast;
2254    address start = __ pc();
2255
2256    const Register from        = rdx;      // source array address
2257    const Register to          = rdx;      // destination array address
2258    const Register key         = rcx;      // key array address
2259    const Register keylen      = rax;
2260    const Address  from_param(rbp, 8+0);
2261    const Address  to_param  (rbp, 8+4);
2262    const Address  key_param (rbp, 8+8);
2263
2264    const XMMRegister xmm_result = xmm0;
2265    const XMMRegister xmm_key_shuf_mask = xmm1;
2266    const XMMRegister xmm_temp1  = xmm2;
2267    const XMMRegister xmm_temp2  = xmm3;
2268    const XMMRegister xmm_temp3  = xmm4;
2269    const XMMRegister xmm_temp4  = xmm5;
2270
2271    __ enter(); // required for proper stackwalking of RuntimeStub frame
2272
2273    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
2274    // context for the registers used, where all instructions below are using 128-bit mode
2275    // On EVEX without VL and BW, these instructions will all be AVX.
2276    if (VM_Version::supports_avx512vlbw()) {
2277      __ movl(rdx, 0xffff);
2278      __ kmovdl(k1, rdx);
2279    }
2280
2281    __ movptr(from, from_param);
2282    __ movptr(key, key_param);
2283
2284    // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
2285    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2286
2287    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2288    __ movdqu(xmm_result, Address(from, 0));
2289    __ movptr(to, to_param);
2290
2291    // for decryption java expanded key ordering is rotated one position from what we want
2292    // so we start from 0x10 here and hit 0x00 last
2293    // we don't know if the key is aligned, hence not using load-execute form
2294    load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
2295    load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
2296    load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
2297    load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
2298
2299    __ pxor  (xmm_result, xmm_temp1);
2300    __ aesdec(xmm_result, xmm_temp2);
2301    __ aesdec(xmm_result, xmm_temp3);
2302    __ aesdec(xmm_result, xmm_temp4);
2303
2304    load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
2305    load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
2306    load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
2307    load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
2308
2309    __ aesdec(xmm_result, xmm_temp1);
2310    __ aesdec(xmm_result, xmm_temp2);
2311    __ aesdec(xmm_result, xmm_temp3);
2312    __ aesdec(xmm_result, xmm_temp4);
2313
2314    load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
2315    load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
2316    load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);
2317
2318    __ cmpl(keylen, 44);
2319    __ jccb(Assembler::equal, L_doLast);
2320
2321    __ aesdec(xmm_result, xmm_temp1);
2322    __ aesdec(xmm_result, xmm_temp2);
2323
2324    load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
2325    load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
2326
2327    __ cmpl(keylen, 52);
2328    __ jccb(Assembler::equal, L_doLast);
2329
2330    __ aesdec(xmm_result, xmm_temp1);
2331    __ aesdec(xmm_result, xmm_temp2);
2332
2333    load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
2334    load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
2335
2336    __ BIND(L_doLast);
2337    __ aesdec(xmm_result, xmm_temp1);
2338    __ aesdec(xmm_result, xmm_temp2);
2339
2340    // for decryption the aesdeclast operation is always on key+0x00
2341    __ aesdeclast(xmm_result, xmm_temp3);
2342    __ movdqu(Address(to, 0), xmm_result);  // store the result
2343    __ xorptr(rax, rax); // return 0
2344    __ leave(); // required for proper stackwalking of RuntimeStub frame
2345    __ ret(0);
2346
2347    return start;
2348  }
2349
2350  void handleSOERegisters(bool saving) {
2351    const int saveFrameSizeInBytes = 4 * wordSize;
2352    const Address saved_rbx     (rbp, -3 * wordSize);
2353    const Address saved_rsi     (rbp, -2 * wordSize);
2354    const Address saved_rdi     (rbp, -1 * wordSize);
2355
2356    if (saving) {
2357      __ subptr(rsp, saveFrameSizeInBytes);
2358      __ movptr(saved_rsi, rsi);
2359      __ movptr(saved_rdi, rdi);
2360      __ movptr(saved_rbx, rbx);
2361    } else {
2362      // restoring
2363      __ movptr(rsi, saved_rsi);
2364      __ movptr(rdi, saved_rdi);
2365      __ movptr(rbx, saved_rbx);
2366    }
2367  }
2368
2369  // Arguments:
2370  //
2371  // Inputs:
2372  //   c_rarg0   - source byte array address
2373  //   c_rarg1   - destination byte array address
2374  //   c_rarg2   - K (key) in little endian int array
2375  //   c_rarg3   - r vector byte array address
2376  //   c_rarg4   - input length
2377  //
2378  // Output:
2379  //   rax       - input length
2380  //
2381  address generate_cipherBlockChaining_encryptAESCrypt() {
2382    assert(UseAES, "need AES instructions and misaligned SSE support");
2383    __ align(CodeEntryAlignment);
2384    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2385    address start = __ pc();
2386
2387    Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
2388    const Register from        = rsi;      // source array address
2389    const Register to          = rdx;      // destination array address
2390    const Register key         = rcx;      // key array address
2391    const Register rvec        = rdi;      // r byte array initialized from initvector array address
2392                                           // and left with the results of the last encryption block
2393    const Register len_reg     = rbx;      // src len (must be multiple of blocksize 16)
2394    const Register pos         = rax;
2395
2396    // xmm register assignments for the loops below
2397    const XMMRegister xmm_result = xmm0;
2398    const XMMRegister xmm_temp   = xmm1;
2399    // first 6 keys preloaded into xmm2-xmm7
2400    const int XMM_REG_NUM_KEY_FIRST = 2;
2401    const int XMM_REG_NUM_KEY_LAST  = 7;
2402    const XMMRegister xmm_key0   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
2403
2404    __ enter(); // required for proper stackwalking of RuntimeStub frame
2405    handleSOERegisters(true /*saving*/);
2406
2407    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
2408    // context for the registers used, where all instructions below are using 128-bit mode
2409    // On EVEX without VL and BW, these instructions will all be AVX.
2410    if (VM_Version::supports_avx512vlbw()) {
2411      __ movl(rdx, 0xffff);
2412      __ kmovdl(k1, rdx);
2413    }
2414
2415    // load registers from incoming parameters
2416    const Address  from_param(rbp, 8+0);
2417    const Address  to_param  (rbp, 8+4);
2418    const Address  key_param (rbp, 8+8);
2419    const Address  rvec_param (rbp, 8+12);
2420    const Address  len_param  (rbp, 8+16);
2421    __ movptr(from , from_param);
2422    __ movptr(to   , to_param);
2423    __ movptr(key  , key_param);
2424    __ movptr(rvec , rvec_param);
2425    __ movptr(len_reg , len_param);
2426
2427    const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
2428    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2429    // load up xmm regs 2 thru 7 with keys 0-5
2430    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
2431      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
2432      offset += 0x10;
2433    }
2434
2435    __ movdqu(xmm_result, Address(rvec, 0x00));   // initialize xmm_result with r vec
2436
2437    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
2438    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2439    __ cmpl(rax, 44);
2440    __ jcc(Assembler::notEqual, L_key_192_256);
2441
2442    // 128 bit code follows here
2443    __ movl(pos, 0);
2444    __ align(OptoLoopAlignment);
2445    __ BIND(L_loopTop_128);
2446    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
2447    __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector
2448
2449    __ pxor  (xmm_result, xmm_key0);                                // do the aes rounds
2450    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
2451      __ aesenc(xmm_result, as_XMMRegister(rnum));
2452    }
2453    for (int key_offset = 0x60; key_offset <= 0x90; key_offset += 0x10) {
2454      aes_enc_key(xmm_result, xmm_temp, key, key_offset);
2455    }
2456    load_key(xmm_temp, key, 0xa0);
2457    __ aesenclast(xmm_result, xmm_temp);
2458
2459    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
2460    // no need to store r to memory until we exit
2461    __ addptr(pos, AESBlockSize);
2462    __ subptr(len_reg, AESBlockSize);
2463    __ jcc(Assembler::notEqual, L_loopTop_128);
2464
2465    __ BIND(L_exit);
2466    __ movdqu(Address(rvec, 0), xmm_result);     // final value of r stored in rvec of CipherBlockChaining object
2467
2468    handleSOERegisters(false /*restoring*/);
2469    __ movptr(rax, len_param); // return length
2470    __ leave();                                  // required for proper stackwalking of RuntimeStub frame
2471    __ ret(0);
2472
2473    __ BIND(L_key_192_256);
2474    // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
2475    __ cmpl(rax, 52);
2476    __ jcc(Assembler::notEqual, L_key_256);
2477
2478    // 192-bit code follows here (could be changed to use more xmm registers)
2479    __ movl(pos, 0);
2480    __ align(OptoLoopAlignment);
2481    __ BIND(L_loopTop_192);
2482    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
2483    __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector
2484
2485    __ pxor  (xmm_result, xmm_key0);                                // do the aes rounds
2486    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
2487      __ aesenc(xmm_result, as_XMMRegister(rnum));
2488    }
2489    for (int key_offset = 0x60; key_offset <= 0xb0; key_offset += 0x10) {
2490      aes_enc_key(xmm_result, xmm_temp, key, key_offset);
2491    }
2492    load_key(xmm_temp, key, 0xc0);
2493    __ aesenclast(xmm_result, xmm_temp);
2494
2495    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);   // store into the next 16 bytes of output
2496    // no need to store r to memory until we exit
2497    __ addptr(pos, AESBlockSize);
2498    __ subptr(len_reg, AESBlockSize);
2499    __ jcc(Assembler::notEqual, L_loopTop_192);
2500    __ jmp(L_exit);
2501
2502    __ BIND(L_key_256);
2503    // 256-bit code follows here (could be changed to use more xmm registers)
2504    __ movl(pos, 0);
2505    __ align(OptoLoopAlignment);
2506    __ BIND(L_loopTop_256);
2507    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
2508    __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector
2509
2510    __ pxor  (xmm_result, xmm_key0);                                // do the aes rounds
2511    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
2512      __ aesenc(xmm_result, as_XMMRegister(rnum));
2513    }
2514    for (int key_offset = 0x60; key_offset <= 0xd0; key_offset += 0x10) {
2515      aes_enc_key(xmm_result, xmm_temp, key, key_offset);
2516    }
2517    load_key(xmm_temp, key, 0xe0);
2518    __ aesenclast(xmm_result, xmm_temp);
2519
2520    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);   // store into the next 16 bytes of output
2521    // no need to store r to memory until we exit
2522    __ addptr(pos, AESBlockSize);
2523    __ subptr(len_reg, AESBlockSize);
2524    __ jcc(Assembler::notEqual, L_loopTop_256);
2525    __ jmp(L_exit);
2526
2527    return start;
2528  }
2529
2530
2531  // CBC AES Decryption.
2532  // In 32-bit stub, because of lack of registers we do not try to parallelize 4 blocks at a time.
2533  //
2534  // Arguments:
2535  //
2536  // Inputs:
2537  //   c_rarg0   - source byte array address
2538  //   c_rarg1   - destination byte array address
2539  //   c_rarg2   - K (key) in little endian int array
2540  //   c_rarg3   - r vector byte array address
2541  //   c_rarg4   - input length
2542  //
2543  // Output:
2544  //   rax       - input length
2545  //
2546
2547  address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
2548    assert(UseAES, "need AES instructions and misaligned SSE support");
2549    __ align(CodeEntryAlignment);
2550    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2551    address start = __ pc();
2552
2553    const Register from        = rsi;      // source array address
2554    const Register to          = rdx;      // destination array address
2555    const Register key         = rcx;      // key array address
2556    const Register rvec        = rdi;      // r byte array initialized from initvector array address
2557                                           // and left with the results of the last encryption block
2558    const Register len_reg     = rbx;      // src len (must be multiple of blocksize 16)
2559    const Register pos         = rax;
2560
2561    const int PARALLEL_FACTOR = 4;
2562    const int ROUNDS[3] = { 10, 12, 14 }; //aes rounds for key128, key192, key256
2563
2564    Label L_exit;
2565    Label L_singleBlock_loopTop[3]; //128, 192, 256
2566    Label L_multiBlock_loopTop[3]; //128, 192, 256
2567
2568    const XMMRegister xmm_prev_block_cipher = xmm0; // holds cipher of previous block
2569    const XMMRegister xmm_key_shuf_mask = xmm1;
2570
2571    const XMMRegister xmm_key_tmp0 = xmm2;
2572    const XMMRegister xmm_key_tmp1 = xmm3;
2573
2574    // registers holding the six results in the parallelized loop
2575    const XMMRegister xmm_result0 = xmm4;
2576    const XMMRegister xmm_result1 = xmm5;
2577    const XMMRegister xmm_result2 = xmm6;
2578    const XMMRegister xmm_result3 = xmm7;
2579
2580    __ enter(); // required for proper stackwalking of RuntimeStub frame
2581    handleSOERegisters(true /*saving*/);
2582
2583    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
2584    // context for the registers used, where all instructions below are using 128-bit mode
2585    // On EVEX without VL and BW, these instructions will all be AVX.
2586    if (VM_Version::supports_avx512vlbw()) {
2587      __ movl(rdx, 0xffff);
2588      __ kmovdl(k1, rdx);
2589    }
2590
2591    // load registers from incoming parameters
2592    const Address  from_param(rbp, 8+0);
2593    const Address  to_param  (rbp, 8+4);
2594    const Address  key_param (rbp, 8+8);
2595    const Address  rvec_param (rbp, 8+12);
2596    const Address  len_param  (rbp, 8+16);
2597
2598    __ movptr(from , from_param);
2599    __ movptr(to   , to_param);
2600    __ movptr(key  , key_param);
2601    __ movptr(rvec , rvec_param);
2602    __ movptr(len_reg , len_param);
2603
2604    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2605    __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // initialize with initial rvec
2606
2607    __ xorptr(pos, pos);
2608
2609    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
2610    // rvec is reused
2611    __ movl(rvec, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2612    __ cmpl(rvec, 52);
2613    __ jcc(Assembler::equal, L_multiBlock_loopTop[1]);
2614    __ cmpl(rvec, 60);
2615    __ jcc(Assembler::equal, L_multiBlock_loopTop[2]);
2616
2617#define DoFour(opc, src_reg)           \
2618  __ opc(xmm_result0, src_reg);         \
2619  __ opc(xmm_result1, src_reg);         \
2620  __ opc(xmm_result2, src_reg);         \
2621  __ opc(xmm_result3, src_reg);         \
2622
2623    for (int k = 0; k < 3; ++k) {
2624      __ align(OptoLoopAlignment);
2625      __ BIND(L_multiBlock_loopTop[k]);
2626      __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
2627      __ jcc(Assembler::less, L_singleBlock_loopTop[k]);
2628
2629      __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); // get next 4 blocks into xmmresult registers
2630      __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
2631      __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
2632      __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
2633
2634      // the java expanded key ordering is rotated one position from what we want
2635      // so we start from 0x10 here and hit 0x00 last
2636      load_key(xmm_key_tmp0, key, 0x10, xmm_key_shuf_mask);
2637      DoFour(pxor, xmm_key_tmp0); //xor with first key
2638      // do the aes dec rounds
2639      for (int rnum = 1; rnum <= ROUNDS[k];) {
2640        //load two keys at a time
2641        //k1->0x20, ..., k9->0xa0, k10->0x00
2642        load_key(xmm_key_tmp1, key, (rnum + 1) * 0x10, xmm_key_shuf_mask);
2643        load_key(xmm_key_tmp0, key, ((rnum + 2) % (ROUNDS[k] + 1)) * 0x10, xmm_key_shuf_mask); // hit 0x00 last!
2644        DoFour(aesdec, xmm_key_tmp1);
2645        rnum++;
2646        if (rnum != ROUNDS[k]) {
2647          DoFour(aesdec, xmm_key_tmp0);
2648        }
2649        else {
2650          DoFour(aesdeclast, xmm_key_tmp0);
2651        }
2652        rnum++;
2653      }
2654
2655      // for each result, xor with the r vector of previous cipher block
2656      __ pxor(xmm_result0, xmm_prev_block_cipher);
2657      __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0 * AESBlockSize));
2658      __ pxor(xmm_result1, xmm_prev_block_cipher);
2659      __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1 * AESBlockSize));
2660      __ pxor(xmm_result2, xmm_prev_block_cipher);
2661      __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2 * AESBlockSize));
2662      __ pxor(xmm_result3, xmm_prev_block_cipher);
2663      __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3 * AESBlockSize)); // this will carry over to next set of blocks
2664
2665            // store 4 results into the next 64 bytes of output
2666       __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
2667       __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
2668       __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
2669       __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
2670
2671       __ addptr(pos, 4 * AESBlockSize);
2672       __ subptr(len_reg, 4 * AESBlockSize);
2673       __ jmp(L_multiBlock_loopTop[k]);
2674
2675       //singleBlock starts here
2676       __ align(OptoLoopAlignment);
2677       __ BIND(L_singleBlock_loopTop[k]);
2678       __ cmpptr(len_reg, 0); // any blocks left?
2679       __ jcc(Assembler::equal, L_exit);
2680       __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
2681       __ movdqa(xmm_result1, xmm_result0);
2682
2683       load_key(xmm_key_tmp0, key, 0x10, xmm_key_shuf_mask);
2684       __ pxor(xmm_result0, xmm_key_tmp0);
2685       // do the aes dec rounds
2686       for (int rnum = 1; rnum < ROUNDS[k]; rnum++) {
2687         // the java expanded key ordering is rotated one position from what we want
2688         load_key(xmm_key_tmp0, key, (rnum + 1) * 0x10, xmm_key_shuf_mask);
2689         __ aesdec(xmm_result0, xmm_key_tmp0);
2690       }
2691       load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
2692       __ aesdeclast(xmm_result0, xmm_key_tmp0);
2693       __ pxor(xmm_result0, xmm_prev_block_cipher); // xor with the current r vector
2694       __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result0); // store into the next 16 bytes of output
2695       // no need to store r to memory until we exit
2696       __ movdqa(xmm_prev_block_cipher, xmm_result1); // set up next r vector with cipher input from this block
2697
2698       __ addptr(pos, AESBlockSize);
2699       __ subptr(len_reg, AESBlockSize);
2700       __ jmp(L_singleBlock_loopTop[k]);
2701    }//for 128/192/256
2702
2703    __ BIND(L_exit);
2704    __ movptr(rvec, rvec_param);                        // restore this since reused earlier
2705    __ movdqu(Address(rvec, 0), xmm_prev_block_cipher); // final value of r stored in rvec of CipherBlockChaining object
2706    handleSOERegisters(false /*restoring*/);
2707    __ movptr(rax, len_param);                          // return length
2708    __ leave();                                         // required for proper stackwalking of RuntimeStub frame
2709    __ ret(0);
2710
2711    return start;
2712  }
2713
2714  // CTR AES crypt.
2715  // In 32-bit stub, parallelize 4 blocks at a time
2716  // Arguments:
2717  //
2718  // Inputs:
2719  //   c_rarg0   - source byte array address
2720  //   c_rarg1   - destination byte array address
2721  //   c_rarg2   - K (key) in little endian int array
2722  //   c_rarg3   - counter vector byte array address
2723  //   c_rarg4   - input length
2724  //
2725  // Output:
2726  //   rax       - input length
2727  //
2728  address generate_counterMode_AESCrypt_Parallel() {
2729    assert(UseAES, "need AES instructions and misaligned SSE support");
2730    __ align(CodeEntryAlignment);
2731    StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
2732    address start = __ pc();
2733    const Register from        = rsi;      // source array address
2734    const Register to          = rdx;      // destination array address
2735    const Register key         = rcx;      // key array address
2736    const Register counter     = rdi;      // counter byte array initialized from initvector array address
2737                                           // and updated with the incremented counter in the end
2738    const Register len_reg     = rbx;
2739    const Register pos         = rax;
2740
2741    __ enter(); // required for proper stackwalking of RuntimeStub frame
2742    handleSOERegisters(true /*saving*/); // save rbx, rsi, rdi
2743
2744    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
2745    // context for the registers used, where all instructions below are using 128-bit mode
2746    // On EVEX without VL and BW, these instructions will all be AVX.
2747    if (VM_Version::supports_avx512vlbw()) {
2748      __ movl(rdx, 0xffff);
2749      __ kmovdl(k1, rdx);
2750    }
2751
2752    // load registers from incoming parameters
2753    const Address  from_param(rbp, 8+0);
2754    const Address  to_param  (rbp, 8+4);
2755    const Address  key_param (rbp, 8+8);
2756    const Address  rvec_param (rbp, 8+12);
2757    const Address  len_param  (rbp, 8+16);
2758    const Address  saved_counter_param(rbp, 8 + 20);
2759    const Address  used_addr_param(rbp, 8 + 24);
2760
2761    __ movptr(from , from_param);
2762    __ movptr(to   , to_param);
2763    __ movptr(len_reg , len_param);
2764
2765    // Use the partially used encrpyted counter from last invocation
2766    Label L_exit_preLoop, L_preLoop_start;
2767
2768    // Use the registers 'counter' and 'key' here in this preloop
2769    // to hold of last 2 params 'used' and 'saved_encCounter_start'
2770    Register used = counter;
2771    Register saved_encCounter_start = key;
2772    Register used_addr = saved_encCounter_start;
2773
2774    __ movptr(used_addr, used_addr_param);
2775    __ movptr(used, Address(used_addr, 0));
2776    __ movptr(saved_encCounter_start, saved_counter_param);
2777
2778    __ BIND(L_preLoop_start);
2779    __ cmpptr(used, 16);
2780    __ jcc(Assembler::aboveEqual, L_exit_preLoop);
2781    __ cmpptr(len_reg, 0);
2782    __ jcc(Assembler::lessEqual, L_exit_preLoop);
2783    __ movb(rax, Address(saved_encCounter_start, used));
2784    __ xorb(rax, Address(from, 0));
2785    __ movb(Address(to, 0), rax);
2786    __ addptr(from, 1);
2787    __ addptr(to, 1);
2788    __ addptr(used, 1);
2789    __ subptr(len_reg, 1);
2790
2791    __ jmp(L_preLoop_start);
2792
2793    __ BIND(L_exit_preLoop);
2794    __ movptr(used_addr, used_addr_param);
2795    __ movptr(used_addr, used_addr_param);
2796    __ movl(Address(used_addr, 0), used);
2797
2798    // load the parameters 'key' and 'counter'
2799    __ movptr(key, key_param);
2800    __ movptr(counter, rvec_param);
2801
2802    // xmm register assignments for the loops below
2803    const XMMRegister xmm_curr_counter      = xmm0;
2804    const XMMRegister xmm_counter_shuf_mask = xmm1;  // need to be reloaded
2805    const XMMRegister xmm_key_shuf_mask     = xmm2;  // need to be reloaded
2806    const XMMRegister xmm_key               = xmm3;
2807    const XMMRegister xmm_result0           = xmm4;
2808    const XMMRegister xmm_result1           = xmm5;
2809    const XMMRegister xmm_result2           = xmm6;
2810    const XMMRegister xmm_result3           = xmm7;
2811    const XMMRegister xmm_from0             = xmm1;   //reuse XMM register
2812    const XMMRegister xmm_from1             = xmm2;
2813    const XMMRegister xmm_from2             = xmm3;
2814    const XMMRegister xmm_from3             = xmm4;
2815
2816    //for key_128, key_192, key_256
2817    const int rounds[3] = {10, 12, 14};
2818    Label L_singleBlockLoopTop[3];
2819    Label L_multiBlock_loopTop[3];
2820    Label L_key192_top, L_key256_top;
2821    Label L_incCounter[3][4]; // 3: different key length,  4: 4 blocks at a time
2822    Label L_incCounter_single[3]; //for single block, key128, key192, key256
2823    Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
2824    Label L_processTail_extr[3], L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];
2825
2826    Label L_exit;
2827    const int PARALLEL_FACTOR = 4;  //because of the limited register number
2828
2829    // initialize counter with initial counter
2830    __ movdqu(xmm_curr_counter, Address(counter, 0x00));
2831    __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
2832    __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled for increase
2833
2834    // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
2835    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2836    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2837    __ cmpl(rax, 52);
2838    __ jcc(Assembler::equal, L_key192_top);
2839    __ cmpl(rax, 60);
2840    __ jcc(Assembler::equal, L_key256_top);
2841
2842    //key128 begins here
2843    __ movptr(pos, 0); // init pos before L_multiBlock_loopTop
2844
2845#define CTR_DoFour(opc, src_reg)               \
2846    __ opc(xmm_result0, src_reg);              \
2847    __ opc(xmm_result1, src_reg);              \
2848    __ opc(xmm_result2, src_reg);              \
2849    __ opc(xmm_result3, src_reg);
2850
2851    // k == 0 :  generate code for key_128
2852    // k == 1 :  generate code for key_192
2853    // k == 2 :  generate code for key_256
2854    for (int k = 0; k < 3; ++k) {
2855      //multi blocks starts here
2856      __ align(OptoLoopAlignment);
2857      __ BIND(L_multiBlock_loopTop[k]);
2858      __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left
2859      __ jcc(Assembler::less, L_singleBlockLoopTop[k]);
2860
2861      __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2862      __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
2863
2864      //load, then increase counters
2865      CTR_DoFour(movdqa, xmm_curr_counter);
2866      __ push(rbx);
2867      inc_counter(rbx, xmm_result1, 0x01, L_incCounter[k][0]);
2868      inc_counter(rbx, xmm_result2, 0x02, L_incCounter[k][1]);
2869      inc_counter(rbx, xmm_result3, 0x03, L_incCounter[k][2]);
2870      inc_counter(rbx, xmm_curr_counter, 0x04, L_incCounter[k][3]);
2871      __ pop (rbx);
2872
2873      load_key(xmm_key, key, 0x00, xmm_key_shuf_mask); // load Round 0 key. interleaving for better performance
2874
2875      CTR_DoFour(pshufb, xmm_counter_shuf_mask); // after increased, shuffled counters back for PXOR
2876      CTR_DoFour(pxor, xmm_key);   //PXOR with Round 0 key
2877
2878      for (int i = 1; i < rounds[k]; ++i) {
2879        load_key(xmm_key, key, (0x10 * i), xmm_key_shuf_mask);
2880        CTR_DoFour(aesenc, xmm_key);
2881      }
2882      load_key(xmm_key, key, (0x10 * rounds[k]), xmm_key_shuf_mask);
2883      CTR_DoFour(aesenclast, xmm_key);
2884
2885      // get next PARALLEL_FACTOR blocks into xmm_from registers
2886      __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
2887      __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
2888      __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
2889
2890      // PXOR with input text
2891      __ pxor(xmm_result0, xmm_from0); //result0 is xmm4
2892      __ pxor(xmm_result1, xmm_from1);
2893      __ pxor(xmm_result2, xmm_from2);
2894
2895      // store PARALLEL_FACTOR results into the next 64 bytes of output
2896      __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
2897      __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
2898      __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
2899
2900      // do it here after xmm_result0 is saved, because xmm_from3 reuse the same register of xmm_result0.
2901      __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
2902      __ pxor(xmm_result3, xmm_from3);
2903      __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
2904
2905      __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // increase the length of crypt text
2906      __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length
2907      __ jmp(L_multiBlock_loopTop[k]);
2908
2909      // singleBlock starts here
2910      __ align(OptoLoopAlignment);
2911      __ BIND(L_singleBlockLoopTop[k]);
2912      __ cmpptr(len_reg, 0);
2913      __ jcc(Assembler::equal, L_exit);
2914      __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2915      __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
2916      __ movdqa(xmm_result0, xmm_curr_counter);
2917      load_key(xmm_key, key, 0x00, xmm_key_shuf_mask);
2918      __ push(rbx);//rbx is used for increasing counter
2919      inc_counter(rbx, xmm_curr_counter, 0x01, L_incCounter_single[k]);
2920      __ pop (rbx);
2921      __ pshufb(xmm_result0, xmm_counter_shuf_mask);
2922      __ pxor(xmm_result0, xmm_key);
2923      for (int i = 1; i < rounds[k]; i++) {
2924        load_key(xmm_key, key, (0x10 * i), xmm_key_shuf_mask);
2925        __ aesenc(xmm_result0, xmm_key);
2926      }
2927      load_key(xmm_key, key, (0x10 * rounds[k]), xmm_key_shuf_mask);
2928      __ aesenclast(xmm_result0, xmm_key);
2929      __ cmpptr(len_reg, AESBlockSize);
2930      __ jcc(Assembler::less, L_processTail_insr[k]);
2931        __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
2932        __ pxor(xmm_result0, xmm_from0);
2933        __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
2934        __ addptr(pos, AESBlockSize);
2935        __ subptr(len_reg, AESBlockSize);
2936        __ jmp(L_singleBlockLoopTop[k]);
2937
2938      __ BIND(L_processTail_insr[k]);                                               // Process the tail part of the input array
2939        __ addptr(pos, len_reg);                                                    // 1. Insert bytes from src array into xmm_from0 register
2940        __ testptr(len_reg, 8);
2941        __ jcc(Assembler::zero, L_processTail_4_insr[k]);
2942          __ subptr(pos,8);
2943          __ pinsrd(xmm_from0, Address(from, pos), 0);
2944          __ pinsrd(xmm_from0, Address(from, pos, Address::times_1, 4), 1);
2945        __ BIND(L_processTail_4_insr[k]);
2946        __ testptr(len_reg, 4);
2947        __ jcc(Assembler::zero, L_processTail_2_insr[k]);
2948          __ subptr(pos,4);
2949          __ pslldq(xmm_from0, 4);
2950          __ pinsrd(xmm_from0, Address(from, pos), 0);
2951        __ BIND(L_processTail_2_insr[k]);
2952        __ testptr(len_reg, 2);
2953        __ jcc(Assembler::zero, L_processTail_1_insr[k]);
2954          __ subptr(pos, 2);
2955          __ pslldq(xmm_from0, 2);
2956          __ pinsrw(xmm_from0, Address(from, pos), 0);
2957        __ BIND(L_processTail_1_insr[k]);
2958        __ testptr(len_reg, 1);
2959        __ jcc(Assembler::zero, L_processTail_exit_insr[k]);
2960          __ subptr(pos, 1);
2961          __ pslldq(xmm_from0, 1);
2962          __ pinsrb(xmm_from0, Address(from, pos), 0);
2963        __ BIND(L_processTail_exit_insr[k]);
2964
2965        __ movptr(saved_encCounter_start, saved_counter_param);
2966        __ movdqu(Address(saved_encCounter_start, 0), xmm_result0);               // 2. Perform pxor of the encrypted counter and plaintext Bytes.
2967        __ pxor(xmm_result0, xmm_from0);                                          //    Also the encrypted counter is saved for next invocation.
2968
2969        __ testptr(len_reg, 8);
2970        __ jcc(Assembler::zero, L_processTail_4_extr[k]);                        // 3. Extract bytes from xmm_result0 into the dest. array
2971          __ pextrd(Address(to, pos), xmm_result0, 0);
2972          __ pextrd(Address(to, pos, Address::times_1, 4), xmm_result0, 1);
2973          __ psrldq(xmm_result0, 8);
2974          __ addptr(pos, 8);
2975        __ BIND(L_processTail_4_extr[k]);
2976        __ testptr(len_reg, 4);
2977        __ jcc(Assembler::zero, L_processTail_2_extr[k]);
2978          __ pextrd(Address(to, pos), xmm_result0, 0);
2979          __ psrldq(xmm_result0, 4);
2980          __ addptr(pos, 4);
2981        __ BIND(L_processTail_2_extr[k]);
2982        __ testptr(len_reg, 2);
2983        __ jcc(Assembler::zero, L_processTail_1_extr[k]);
2984          __ pextrb(Address(to, pos), xmm_result0, 0);
2985          __ pextrb(Address(to, pos, Address::times_1, 1), xmm_result0, 1);
2986          __ psrldq(xmm_result0, 2);
2987          __ addptr(pos, 2);
2988        __ BIND(L_processTail_1_extr[k]);
2989        __ testptr(len_reg, 1);
2990        __ jcc(Assembler::zero, L_processTail_exit_extr[k]);
2991          __ pextrb(Address(to, pos), xmm_result0, 0);
2992
2993        __ BIND(L_processTail_exit_extr[k]);
2994        __ movptr(used_addr, used_addr_param);
2995        __ movl(Address(used_addr, 0), len_reg);
2996        __ jmp(L_exit);
2997    }
2998
2999    __ BIND(L_exit);
3000    __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
3001    __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back.
3002    __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back
3003    handleSOERegisters(false /*restoring*/);
3004    __ movptr(rax, len_param); // return length
3005    __ leave();                // required for proper stackwalking of RuntimeStub frame
3006    __ ret(0);
3007
3008    __ BIND (L_key192_top);
3009    __ movptr(pos, 0); // init pos before L_multiBlock_loopTop
3010    __ jmp(L_multiBlock_loopTop[1]); //key192
3011
3012    __ BIND (L_key256_top);
3013    __ movptr(pos, 0); // init pos before L_multiBlock_loopTop
3014    __ jmp(L_multiBlock_loopTop[2]); //key192
3015
3016    return start;
3017  }
3018
3019  address generate_upper_word_mask() {
3020    __ align(64);
3021    StubCodeMark mark(this, "StubRoutines", "upper_word_mask");
3022    address start = __ pc();
3023    __ emit_data(0x00000000, relocInfo::none, 0);
3024    __ emit_data(0x00000000, relocInfo::none, 0);
3025    __ emit_data(0x00000000, relocInfo::none, 0);
3026    __ emit_data(0xFFFFFFFF, relocInfo::none, 0);
3027    return start;
3028  }
3029
3030  address generate_shuffle_byte_flip_mask() {
3031    __ align(64);
3032    StubCodeMark mark(this, "StubRoutines", "shuffle_byte_flip_mask");
3033    address start = __ pc();
3034    __ emit_data(0x0c0d0e0f, relocInfo::none, 0);
3035    __ emit_data(0x08090a0b, relocInfo::none, 0);
3036    __ emit_data(0x04050607, relocInfo::none, 0);
3037    __ emit_data(0x00010203, relocInfo::none, 0);
3038    return start;
3039  }
3040
3041  // ofs and limit are use for multi-block byte array.
3042  // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
3043  address generate_sha1_implCompress(bool multi_block, const char *name) {
3044    __ align(CodeEntryAlignment);
3045    StubCodeMark mark(this, "StubRoutines", name);
3046    address start = __ pc();
3047
3048    Register buf   = rax;
3049    Register state = rdx;
3050    Register ofs   = rcx;
3051    Register limit = rdi;
3052
3053    const Address  buf_param(rbp, 8 + 0);
3054    const Address  state_param(rbp, 8 + 4);
3055    const Address  ofs_param(rbp, 8 + 8);
3056    const Address  limit_param(rbp, 8 + 12);
3057
3058    const XMMRegister abcd = xmm0;
3059    const XMMRegister e0 = xmm1;
3060    const XMMRegister e1 = xmm2;
3061    const XMMRegister msg0 = xmm3;
3062
3063    const XMMRegister msg1 = xmm4;
3064    const XMMRegister msg2 = xmm5;
3065    const XMMRegister msg3 = xmm6;
3066    const XMMRegister shuf_mask = xmm7;
3067
3068    __ enter();
3069    __ subptr(rsp, 8 * wordSize);
3070    if (multi_block) {
3071      __ push(limit);
3072    }
3073    __ movptr(buf, buf_param);
3074    __ movptr(state, state_param);
3075    if (multi_block) {
3076      __ movptr(ofs, ofs_param);
3077      __ movptr(limit, limit_param);
3078    }
3079
3080    __ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask,
3081      buf, state, ofs, limit, rsp, multi_block);
3082
3083    if (multi_block) {
3084      __ pop(limit);
3085    }
3086    __ addptr(rsp, 8 * wordSize);
3087    __ leave();
3088    __ ret(0);
3089    return start;
3090  }
3091
3092  address generate_pshuffle_byte_flip_mask() {
3093    __ align(64);
3094    StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask");
3095    address start = __ pc();
3096    __ emit_data(0x00010203, relocInfo::none, 0);
3097    __ emit_data(0x04050607, relocInfo::none, 0);
3098    __ emit_data(0x08090a0b, relocInfo::none, 0);
3099    __ emit_data(0x0c0d0e0f, relocInfo::none, 0);
3100    return start;
3101  }
3102
3103  // ofs and limit are use for multi-block byte array.
3104  // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
3105 address generate_sha256_implCompress(bool multi_block, const char *name) {
3106    __ align(CodeEntryAlignment);
3107    StubCodeMark mark(this, "StubRoutines", name);
3108    address start = __ pc();
3109
3110    Register buf = rbx;
3111    Register state = rsi;
3112    Register ofs = rdx;
3113    Register limit = rcx;
3114
3115    const Address  buf_param(rbp, 8 + 0);
3116    const Address  state_param(rbp, 8 + 4);
3117    const Address  ofs_param(rbp, 8 + 8);
3118    const Address  limit_param(rbp, 8 + 12);
3119
3120    const XMMRegister msg = xmm0;
3121    const XMMRegister state0 = xmm1;
3122    const XMMRegister state1 = xmm2;
3123    const XMMRegister msgtmp0 = xmm3;
3124
3125    const XMMRegister msgtmp1 = xmm4;
3126    const XMMRegister msgtmp2 = xmm5;
3127    const XMMRegister msgtmp3 = xmm6;
3128    const XMMRegister msgtmp4 = xmm7;
3129
3130    __ enter();
3131    __ subptr(rsp, 8 * wordSize);
3132    handleSOERegisters(true /*saving*/);
3133    __ movptr(buf, buf_param);
3134    __ movptr(state, state_param);
3135    if (multi_block) {
3136     __ movptr(ofs, ofs_param);
3137     __ movptr(limit, limit_param);
3138    }
3139
3140    __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
3141      buf, state, ofs, limit, rsp, multi_block);
3142
3143    handleSOERegisters(false);
3144    __ addptr(rsp, 8 * wordSize);
3145    __ leave();
3146    __ ret(0);
3147    return start;
3148  }
3149
3150  // byte swap x86 long
3151  address generate_ghash_long_swap_mask() {
3152    __ align(CodeEntryAlignment);
3153    StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
3154    address start = __ pc();
3155    __ emit_data(0x0b0a0908, relocInfo::none, 0);
3156    __ emit_data(0x0f0e0d0c, relocInfo::none, 0);
3157    __ emit_data(0x03020100, relocInfo::none, 0);
3158    __ emit_data(0x07060504, relocInfo::none, 0);
3159
3160  return start;
3161  }
3162
3163  // byte swap x86 byte array
3164  address generate_ghash_byte_swap_mask() {
3165    __ align(CodeEntryAlignment);
3166    StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
3167    address start = __ pc();
3168    __ emit_data(0x0c0d0e0f, relocInfo::none, 0);
3169    __ emit_data(0x08090a0b, relocInfo::none, 0);
3170    __ emit_data(0x04050607, relocInfo::none, 0);
3171    __ emit_data(0x00010203, relocInfo::none, 0);
3172  return start;
3173  }
3174
3175  /* Single and multi-block ghash operations */
3176  address generate_ghash_processBlocks() {
3177    assert(UseGHASHIntrinsics, "need GHASH intrinsics and CLMUL support");
3178    __ align(CodeEntryAlignment);
3179    Label L_ghash_loop, L_exit;
3180    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
3181    address start = __ pc();
3182
3183    const Register state        = rdi;
3184    const Register subkeyH      = rsi;
3185    const Register data         = rdx;
3186    const Register blocks       = rcx;
3187
3188    const Address  state_param(rbp, 8+0);
3189    const Address  subkeyH_param(rbp, 8+4);
3190    const Address  data_param(rbp, 8+8);
3191    const Address  blocks_param(rbp, 8+12);
3192
3193    const XMMRegister xmm_temp0 = xmm0;
3194    const XMMRegister xmm_temp1 = xmm1;
3195    const XMMRegister xmm_temp2 = xmm2;
3196    const XMMRegister xmm_temp3 = xmm3;
3197    const XMMRegister xmm_temp4 = xmm4;
3198    const XMMRegister xmm_temp5 = xmm5;
3199    const XMMRegister xmm_temp6 = xmm6;
3200    const XMMRegister xmm_temp7 = xmm7;
3201
3202    __ enter();
3203    handleSOERegisters(true);  // Save registers
3204
3205    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
3206    // context for the registers used, where all instructions below are using 128-bit mode
3207    // On EVEX without VL and BW, these instructions will all be AVX.
3208    if (VM_Version::supports_avx512vlbw()) {
3209      __ movl(rdx, 0xffff);
3210      __ kmovdl(k1, rdx);
3211    }
3212
3213    __ movptr(state, state_param);
3214    __ movptr(subkeyH, subkeyH_param);
3215    __ movptr(data, data_param);
3216    __ movptr(blocks, blocks_param);
3217
3218    __ movdqu(xmm_temp0, Address(state, 0));
3219    __ pshufb(xmm_temp0, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
3220
3221    __ movdqu(xmm_temp1, Address(subkeyH, 0));
3222    __ pshufb(xmm_temp1, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
3223
3224    __ BIND(L_ghash_loop);
3225    __ movdqu(xmm_temp2, Address(data, 0));
3226    __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
3227
3228    __ pxor(xmm_temp0, xmm_temp2);
3229
3230    //
3231    // Multiply with the hash key
3232    //
3233    __ movdqu(xmm_temp3, xmm_temp0);
3234    __ pclmulqdq(xmm_temp3, xmm_temp1, 0);      // xmm3 holds a0*b0
3235    __ movdqu(xmm_temp4, xmm_temp0);
3236    __ pclmulqdq(xmm_temp4, xmm_temp1, 16);     // xmm4 holds a0*b1
3237
3238    __ movdqu(xmm_temp5, xmm_temp0);
3239    __ pclmulqdq(xmm_temp5, xmm_temp1, 1);      // xmm5 holds a1*b0
3240    __ movdqu(xmm_temp6, xmm_temp0);
3241    __ pclmulqdq(xmm_temp6, xmm_temp1, 17);     // xmm6 holds a1*b1
3242
3243    __ pxor(xmm_temp4, xmm_temp5);      // xmm4 holds a0*b1 + a1*b0
3244
3245    __ movdqu(xmm_temp5, xmm_temp4);    // move the contents of xmm4 to xmm5
3246    __ psrldq(xmm_temp4, 8);    // shift by xmm4 64 bits to the right
3247    __ pslldq(xmm_temp5, 8);    // shift by xmm5 64 bits to the left
3248    __ pxor(xmm_temp3, xmm_temp5);
3249    __ pxor(xmm_temp6, xmm_temp4);      // Register pair <xmm6:xmm3> holds the result
3250                                        // of the carry-less multiplication of
3251                                        // xmm0 by xmm1.
3252
3253    // We shift the result of the multiplication by one bit position
3254    // to the left to cope for the fact that the bits are reversed.
3255    __ movdqu(xmm_temp7, xmm_temp3);
3256    __ movdqu(xmm_temp4, xmm_temp6);
3257    __ pslld (xmm_temp3, 1);
3258    __ pslld(xmm_temp6, 1);
3259    __ psrld(xmm_temp7, 31);
3260    __ psrld(xmm_temp4, 31);
3261    __ movdqu(xmm_temp5, xmm_temp7);
3262    __ pslldq(xmm_temp4, 4);
3263    __ pslldq(xmm_temp7, 4);
3264    __ psrldq(xmm_temp5, 12);
3265    __ por(xmm_temp3, xmm_temp7);
3266    __ por(xmm_temp6, xmm_temp4);
3267    __ por(xmm_temp6, xmm_temp5);
3268
3269    //
3270    // First phase of the reduction
3271    //
3272    // Move xmm3 into xmm4, xmm5, xmm7 in order to perform the shifts
3273    // independently.
3274    __ movdqu(xmm_temp7, xmm_temp3);
3275    __ movdqu(xmm_temp4, xmm_temp3);
3276    __ movdqu(xmm_temp5, xmm_temp3);
3277    __ pslld(xmm_temp7, 31);    // packed right shift shifting << 31
3278    __ pslld(xmm_temp4, 30);    // packed right shift shifting << 30
3279    __ pslld(xmm_temp5, 25);    // packed right shift shifting << 25
3280    __ pxor(xmm_temp7, xmm_temp4);      // xor the shifted versions
3281    __ pxor(xmm_temp7, xmm_temp5);
3282    __ movdqu(xmm_temp4, xmm_temp7);
3283    __ pslldq(xmm_temp7, 12);
3284    __ psrldq(xmm_temp4, 4);
3285    __ pxor(xmm_temp3, xmm_temp7);      // first phase of the reduction complete
3286
3287    //
3288    // Second phase of the reduction
3289    //
3290    // Make 3 copies of xmm3 in xmm2, xmm5, xmm7 for doing these
3291    // shift operations.
3292    __ movdqu(xmm_temp2, xmm_temp3);
3293    __ movdqu(xmm_temp7, xmm_temp3);
3294    __ movdqu(xmm_temp5, xmm_temp3);
3295    __ psrld(xmm_temp2, 1);     // packed left shifting >> 1
3296    __ psrld(xmm_temp7, 2);     // packed left shifting >> 2
3297    __ psrld(xmm_temp5, 7);     // packed left shifting >> 7
3298    __ pxor(xmm_temp2, xmm_temp7);      // xor the shifted versions
3299    __ pxor(xmm_temp2, xmm_temp5);
3300    __ pxor(xmm_temp2, xmm_temp4);
3301    __ pxor(xmm_temp3, xmm_temp2);
3302    __ pxor(xmm_temp6, xmm_temp3);      // the result is in xmm6
3303
3304    __ decrement(blocks);
3305    __ jcc(Assembler::zero, L_exit);
3306    __ movdqu(xmm_temp0, xmm_temp6);
3307    __ addptr(data, 16);
3308    __ jmp(L_ghash_loop);
3309
3310    __ BIND(L_exit);
3311       // Byte swap 16-byte result
3312    __ pshufb(xmm_temp6, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
3313    __ movdqu(Address(state, 0), xmm_temp6);   // store the result
3314
3315    handleSOERegisters(false);  // restore registers
3316    __ leave();
3317    __ ret(0);
3318    return start;
3319  }
3320
3321  /**
3322   *  Arguments:
3323   *
3324   * Inputs:
3325   *   rsp(4)   - int crc
3326   *   rsp(8)   - byte* buf
3327   *   rsp(12)  - int length
3328   *
3329   * Ouput:
3330   *       rax   - int crc result
3331   */
3332  address generate_updateBytesCRC32() {
3333    assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");
3334
3335    __ align(CodeEntryAlignment);
3336    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3337
3338    address start = __ pc();
3339
3340    const Register crc   = rdx;  // crc
3341    const Register buf   = rsi;  // source java byte array address
3342    const Register len   = rcx;  // length
3343    const Register table = rdi;  // crc_table address (reuse register)
3344    const Register tmp   = rbx;
3345    assert_different_registers(crc, buf, len, table, tmp, rax);
3346
3347    BLOCK_COMMENT("Entry:");
3348    __ enter(); // required for proper stackwalking of RuntimeStub frame
3349    __ push(rsi);
3350    __ push(rdi);
3351    __ push(rbx);
3352
3353    Address crc_arg(rbp, 8 + 0);
3354    Address buf_arg(rbp, 8 + 4);
3355    Address len_arg(rbp, 8 + 8);
3356
3357    // Load up:
3358    __ movl(crc,   crc_arg);
3359    __ movptr(buf, buf_arg);
3360    __ movl(len,   len_arg);
3361
3362    __ kernel_crc32(crc, buf, len, table, tmp);
3363
3364    __ movl(rax, crc);
3365    __ pop(rbx);
3366    __ pop(rdi);
3367    __ pop(rsi);
3368    __ leave(); // required for proper stackwalking of RuntimeStub frame
3369    __ ret(0);
3370
3371    return start;
3372  }
3373
3374  /**
3375  *  Arguments:
3376  *
3377  * Inputs:
3378  *   rsp(4)   - int crc
3379  *   rsp(8)   - byte* buf
3380  *   rsp(12)  - int length
3381  *   rsp(16)  - table_start - optional (present only when doing a library_calll,
3382  *              not used by x86 algorithm)
3383  *
3384  * Ouput:
3385  *       rax  - int crc result
3386  */
3387  address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {
3388    assert(UseCRC32CIntrinsics, "need SSE4_2");
3389    __ align(CodeEntryAlignment);
3390    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3391    address start = __ pc();
3392    const Register crc = rax;  // crc
3393    const Register buf = rcx;  // source java byte array address
3394    const Register len = rdx;  // length
3395    const Register d = rbx;
3396    const Register g = rsi;
3397    const Register h = rdi;
3398    const Register empty = 0; // will never be used, in order not
3399                              // to change a signature for crc32c_IPL_Alg2_Alt2
3400                              // between 64/32 I'm just keeping it here
3401    assert_different_registers(crc, buf, len, d, g, h);
3402
3403    BLOCK_COMMENT("Entry:");
3404    __ enter(); // required for proper stackwalking of RuntimeStub frame
3405    Address crc_arg(rsp, 4 + 4 + 0); // ESP+4 +
3406                                     // we need to add additional 4 because __ enter
3407                                     // have just pushed ebp on a stack
3408    Address buf_arg(rsp, 4 + 4 + 4);
3409    Address len_arg(rsp, 4 + 4 + 8);
3410      // Load up:
3411      __ movl(crc, crc_arg);
3412      __ movl(buf, buf_arg);
3413      __ movl(len, len_arg);
3414      __ push(d);
3415      __ push(g);
3416      __ push(h);
3417      __ crc32c_ipl_alg2_alt2(crc, buf, len,
3418                              d, g, h,
3419                              empty, empty, empty,
3420                              xmm0, xmm1, xmm2,
3421                              is_pclmulqdq_supported);
3422      __ pop(h);
3423      __ pop(g);
3424      __ pop(d);
3425    __ leave(); // required for proper stackwalking of RuntimeStub frame
3426    __ ret(0);
3427
3428    return start;
3429  }
3430
3431 address generate_libmExp() {
3432    address start = __ pc();
3433
3434    const XMMRegister x0  = xmm0;
3435    const XMMRegister x1  = xmm1;
3436    const XMMRegister x2  = xmm2;
3437    const XMMRegister x3  = xmm3;
3438
3439    const XMMRegister x4  = xmm4;
3440    const XMMRegister x5  = xmm5;
3441    const XMMRegister x6  = xmm6;
3442    const XMMRegister x7  = xmm7;
3443
3444    const Register tmp   = rbx;
3445
3446    BLOCK_COMMENT("Entry:");
3447    __ enter(); // required for proper stackwalking of RuntimeStub frame
3448    __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
3449    __ leave(); // required for proper stackwalking of RuntimeStub frame
3450    __ ret(0);
3451
3452    return start;
3453
3454  }
3455
3456 address generate_libmLog() {
3457   address start = __ pc();
3458
3459   const XMMRegister x0 = xmm0;
3460   const XMMRegister x1 = xmm1;
3461   const XMMRegister x2 = xmm2;
3462   const XMMRegister x3 = xmm3;
3463
3464   const XMMRegister x4 = xmm4;
3465   const XMMRegister x5 = xmm5;
3466   const XMMRegister x6 = xmm6;
3467   const XMMRegister x7 = xmm7;
3468
3469   const Register tmp = rbx;
3470
3471   BLOCK_COMMENT("Entry:");
3472   __ enter(); // required for proper stackwalking of RuntimeStub frame
3473   __ fast_log(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
3474   __ leave(); // required for proper stackwalking of RuntimeStub frame
3475   __ ret(0);
3476
3477   return start;
3478
3479 }
3480
3481 address generate_libmLog10() {
3482   address start = __ pc();
3483
3484   const XMMRegister x0 = xmm0;
3485   const XMMRegister x1 = xmm1;
3486   const XMMRegister x2 = xmm2;
3487   const XMMRegister x3 = xmm3;
3488
3489   const XMMRegister x4 = xmm4;
3490   const XMMRegister x5 = xmm5;
3491   const XMMRegister x6 = xmm6;
3492   const XMMRegister x7 = xmm7;
3493
3494   const Register tmp = rbx;
3495
3496   BLOCK_COMMENT("Entry:");
3497   __ enter(); // required for proper stackwalking of RuntimeStub frame
3498   __ fast_log10(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
3499   __ leave(); // required for proper stackwalking of RuntimeStub frame
3500   __ ret(0);
3501
3502   return start;
3503
3504 }
3505
3506 address generate_libmPow() {
3507   address start = __ pc();
3508
3509   const XMMRegister x0 = xmm0;
3510   const XMMRegister x1 = xmm1;
3511   const XMMRegister x2 = xmm2;
3512   const XMMRegister x3 = xmm3;
3513
3514   const XMMRegister x4 = xmm4;
3515   const XMMRegister x5 = xmm5;
3516   const XMMRegister x6 = xmm6;
3517   const XMMRegister x7 = xmm7;
3518
3519   const Register tmp = rbx;
3520
3521   BLOCK_COMMENT("Entry:");
3522   __ enter(); // required for proper stackwalking of RuntimeStub frame
3523   __ fast_pow(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
3524   __ leave(); // required for proper stackwalking of RuntimeStub frame
3525   __ ret(0);
3526
3527   return start;
3528
3529 }
3530
3531 address generate_libm_reduce_pi04l() {
3532   address start = __ pc();
3533
3534   BLOCK_COMMENT("Entry:");
3535   __ libm_reduce_pi04l(rax, rcx, rdx, rbx, rsi, rdi, rbp, rsp);
3536
3537   return start;
3538
3539 }
3540
3541 address generate_libm_sin_cos_huge() {
3542   address start = __ pc();
3543
3544   const XMMRegister x0 = xmm0;
3545   const XMMRegister x1 = xmm1;
3546
3547   BLOCK_COMMENT("Entry:");
3548   __ libm_sincos_huge(x0, x1, rax, rcx, rdx, rbx, rsi, rdi, rbp, rsp);
3549
3550   return start;
3551
3552 }
3553
3554 address generate_libmSin() {
3555   address start = __ pc();
3556
3557   const XMMRegister x0 = xmm0;
3558   const XMMRegister x1 = xmm1;
3559   const XMMRegister x2 = xmm2;
3560   const XMMRegister x3 = xmm3;
3561
3562   const XMMRegister x4 = xmm4;
3563   const XMMRegister x5 = xmm5;
3564   const XMMRegister x6 = xmm6;
3565   const XMMRegister x7 = xmm7;
3566
3567   BLOCK_COMMENT("Entry:");
3568   __ enter(); // required for proper stackwalking of RuntimeStub frame
3569   __ fast_sin(x0, x1, x2, x3, x4, x5, x6, x7, rax, rbx, rdx);
3570   __ leave(); // required for proper stackwalking of RuntimeStub frame
3571   __ ret(0);
3572
3573   return start;
3574
3575 }
3576
3577 address generate_libmCos() {
3578   address start = __ pc();
3579
3580   const XMMRegister x0 = xmm0;
3581   const XMMRegister x1 = xmm1;
3582   const XMMRegister x2 = xmm2;
3583   const XMMRegister x3 = xmm3;
3584
3585   const XMMRegister x4 = xmm4;
3586   const XMMRegister x5 = xmm5;
3587   const XMMRegister x6 = xmm6;
3588   const XMMRegister x7 = xmm7;
3589
3590   const Register tmp = rbx;
3591
3592   BLOCK_COMMENT("Entry:");
3593   __ enter(); // required for proper stackwalking of RuntimeStub frame
3594   __ fast_cos(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
3595   __ leave(); // required for proper stackwalking of RuntimeStub frame
3596   __ ret(0);
3597
3598   return start;
3599
3600 }
3601
3602 address generate_libm_tan_cot_huge() {
3603   address start = __ pc();
3604
3605   const XMMRegister x0 = xmm0;
3606   const XMMRegister x1 = xmm1;
3607
3608   BLOCK_COMMENT("Entry:");
3609   __ libm_tancot_huge(x0, x1, rax, rcx, rdx, rbx, rsi, rdi, rbp, rsp);
3610
3611   return start;
3612
3613 }
3614
3615 address generate_libmTan() {
3616   address start = __ pc();
3617
3618   const XMMRegister x0 = xmm0;
3619   const XMMRegister x1 = xmm1;
3620   const XMMRegister x2 = xmm2;
3621   const XMMRegister x3 = xmm3;
3622
3623   const XMMRegister x4 = xmm4;
3624   const XMMRegister x5 = xmm5;
3625   const XMMRegister x6 = xmm6;
3626   const XMMRegister x7 = xmm7;
3627
3628   const Register tmp = rbx;
3629
3630   BLOCK_COMMENT("Entry:");
3631   __ enter(); // required for proper stackwalking of RuntimeStub frame
3632   __ fast_tan(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
3633   __ leave(); // required for proper stackwalking of RuntimeStub frame
3634   __ ret(0);
3635
3636   return start;
3637
3638 }
3639
3640  // Safefetch stubs.
3641  void generate_safefetch(const char* name, int size, address* entry,
3642                          address* fault_pc, address* continuation_pc) {
3643    // safefetch signatures:
3644    //   int      SafeFetch32(int*      adr, int      errValue);
3645    //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3646
3647    StubCodeMark mark(this, "StubRoutines", name);
3648
3649    // Entry point, pc or function descriptor.
3650    *entry = __ pc();
3651
3652    __ movl(rax, Address(rsp, 0x8));
3653    __ movl(rcx, Address(rsp, 0x4));
3654    // Load *adr into eax, may fault.
3655    *fault_pc = __ pc();
3656    switch (size) {
3657      case 4:
3658        // int32_t
3659        __ movl(rax, Address(rcx, 0));
3660        break;
3661      case 8:
3662        // int64_t
3663        Unimplemented();
3664        break;
3665      default:
3666        ShouldNotReachHere();
3667    }
3668
3669    // Return errValue or *adr.
3670    *continuation_pc = __ pc();
3671    __ ret(0);
3672  }
3673
3674 public:
3675  // Information about frame layout at time of blocking runtime call.
3676  // Note that we only have to preserve callee-saved registers since
3677  // the compilers are responsible for supplying a continuation point
3678  // if they expect all registers to be preserved.
3679  enum layout {
3680    thread_off,    // last_java_sp
3681    arg1_off,
3682    arg2_off,
3683    rbp_off,       // callee saved register
3684    ret_pc,
3685    framesize
3686  };
3687
3688 private:
3689
3690#undef  __
3691#define __ masm->
3692
3693  //------------------------------------------------------------------------------------------------------------------------
3694  // Continuation point for throwing of implicit exceptions that are not handled in
3695  // the current activation. Fabricates an exception oop and initiates normal
3696  // exception dispatching in this frame.
3697  //
3698  // Previously the compiler (c2) allowed for callee save registers on Java calls.
3699  // This is no longer true after adapter frames were removed but could possibly
3700  // be brought back in the future if the interpreter code was reworked and it
3701  // was deemed worthwhile. The comment below was left to describe what must
3702  // happen here if callee saves were resurrected. As it stands now this stub
3703  // could actually be a vanilla BufferBlob and have now oopMap at all.
3704  // Since it doesn't make much difference we've chosen to leave it the
3705  // way it was in the callee save days and keep the comment.
3706
3707  // If we need to preserve callee-saved values we need a callee-saved oop map and
3708  // therefore have to make these stubs into RuntimeStubs rather than BufferBlobs.
3709  // If the compiler needs all registers to be preserved between the fault
3710  // point and the exception handler then it must assume responsibility for that in
3711  // AbstractCompiler::continuation_for_implicit_null_exception or
3712  // continuation_for_implicit_division_by_zero_exception. All other implicit
3713  // exceptions (e.g., NullPointerException or AbstractMethodError on entry) are
3714  // either at call sites or otherwise assume that stack unwinding will be initiated,
3715  // so caller saved registers were assumed volatile in the compiler.
3716  address generate_throw_exception(const char* name, address runtime_entry,
3717                                   Register arg1 = noreg, Register arg2 = noreg) {
3718
3719    int insts_size = 256;
3720    int locs_size  = 32;
3721
3722    CodeBuffer code(name, insts_size, locs_size);
3723    OopMapSet* oop_maps  = new OopMapSet();
3724    MacroAssembler* masm = new MacroAssembler(&code);
3725
3726    address start = __ pc();
3727
3728    // This is an inlined and slightly modified version of call_VM
3729    // which has the ability to fetch the return PC out of
3730    // thread-local storage and also sets up last_Java_sp slightly
3731    // differently than the real call_VM
3732    Register java_thread = rbx;
3733    __ get_thread(java_thread);
3734
3735    __ enter(); // required for proper stackwalking of RuntimeStub frame
3736
3737    // pc and rbp, already pushed
3738    __ subptr(rsp, (framesize-2) * wordSize); // prolog
3739
3740    // Frame is now completed as far as size and linkage.
3741
3742    int frame_complete = __ pc() - start;
3743
3744    // push java thread (becomes first argument of C function)
3745    __ movptr(Address(rsp, thread_off * wordSize), java_thread);
3746    if (arg1 != noreg) {
3747      __ movptr(Address(rsp, arg1_off * wordSize), arg1);
3748    }
3749    if (arg2 != noreg) {
3750      assert(arg1 != noreg, "missing reg arg");
3751      __ movptr(Address(rsp, arg2_off * wordSize), arg2);
3752    }
3753
3754    // Set up last_Java_sp and last_Java_fp
3755    __ set_last_Java_frame(java_thread, rsp, rbp, NULL);
3756
3757    // Call runtime
3758    BLOCK_COMMENT("call runtime_entry");
3759    __ call(RuntimeAddress(runtime_entry));
3760    // Generate oop map
3761    OopMap* map =  new OopMap(framesize, 0);
3762    oop_maps->add_gc_map(__ pc() - start, map);
3763
3764    // restore the thread (cannot use the pushed argument since arguments
3765    // may be overwritten by C code generated by an optimizing compiler);
3766    // however can use the register value directly if it is callee saved.
3767    __ get_thread(java_thread);
3768
3769    __ reset_last_Java_frame(java_thread, true);
3770
3771    __ leave(); // required for proper stackwalking of RuntimeStub frame
3772
3773    // check for pending exceptions
3774#ifdef ASSERT
3775    Label L;
3776    __ cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3777    __ jcc(Assembler::notEqual, L);
3778    __ should_not_reach_here();
3779    __ bind(L);
3780#endif /* ASSERT */
3781    __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3782
3783
3784    RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, framesize, oop_maps, false);
3785    return stub->entry_point();
3786  }
3787
3788
3789  void create_control_words() {
3790    // Round to nearest, 53-bit mode, exceptions masked
3791    StubRoutines::_fpu_cntrl_wrd_std   = 0x027F;
3792    // Round to zero, 53-bit mode, exception mased
3793    StubRoutines::_fpu_cntrl_wrd_trunc = 0x0D7F;
3794    // Round to nearest, 24-bit mode, exceptions masked
3795    StubRoutines::_fpu_cntrl_wrd_24    = 0x007F;
3796    // Round to nearest, 64-bit mode, exceptions masked
3797    StubRoutines::_fpu_cntrl_wrd_64    = 0x037F;
3798    // Round to nearest, 64-bit mode, exceptions masked
3799    StubRoutines::_mxcsr_std           = 0x1F80;
3800    // Note: the following two constants are 80-bit values
3801    //       layout is critical for correct loading by FPU.
3802    // Bias for strict fp multiply/divide
3803    StubRoutines::_fpu_subnormal_bias1[0]= 0x00000000; // 2^(-15360) == 0x03ff 8000 0000 0000 0000
3804    StubRoutines::_fpu_subnormal_bias1[1]= 0x80000000;
3805    StubRoutines::_fpu_subnormal_bias1[2]= 0x03ff;
3806    // Un-Bias for strict fp multiply/divide
3807    StubRoutines::_fpu_subnormal_bias2[0]= 0x00000000; // 2^(+15360) == 0x7bff 8000 0000 0000 0000
3808    StubRoutines::_fpu_subnormal_bias2[1]= 0x80000000;
3809    StubRoutines::_fpu_subnormal_bias2[2]= 0x7bff;
3810  }
3811
3812  //---------------------------------------------------------------------------
3813  // Initialization
3814
3815  void generate_initial() {
3816    // Generates all stubs and initializes the entry points
3817
3818    //------------------------------------------------------------------------------------------------------------------------
3819    // entry points that exist in all platforms
3820    // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller than
3821    //       the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp.
3822    StubRoutines::_forward_exception_entry      = generate_forward_exception();
3823
3824    StubRoutines::_call_stub_entry              =
3825      generate_call_stub(StubRoutines::_call_stub_return_address);
3826    // is referenced by megamorphic call
3827    StubRoutines::_catch_exception_entry        = generate_catch_exception();
3828
3829    // These are currently used by Solaris/Intel
3830    StubRoutines::_atomic_xchg_entry            = generate_atomic_xchg();
3831
3832    // platform dependent
3833    create_control_words();
3834
3835    StubRoutines::x86::_verify_mxcsr_entry                 = generate_verify_mxcsr();
3836    StubRoutines::x86::_verify_fpu_cntrl_wrd_entry         = generate_verify_fpu_cntrl_wrd();
3837    StubRoutines::_d2i_wrapper                              = generate_d2i_wrapper(T_INT,
3838                                                                                   CAST_FROM_FN_PTR(address, SharedRuntime::d2i));
3839    StubRoutines::_d2l_wrapper                              = generate_d2i_wrapper(T_LONG,
3840                                                                                   CAST_FROM_FN_PTR(address, SharedRuntime::d2l));
3841
3842    // Build this early so it's available for the interpreter
3843    StubRoutines::_throw_StackOverflowError_entry          = generate_throw_exception("StackOverflowError throw_exception",
3844                                                                                      CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
3845    StubRoutines::_throw_delayed_StackOverflowError_entry  = generate_throw_exception("delayed StackOverflowError throw_exception",
3846                                                                                      CAST_FROM_FN_PTR(address, SharedRuntime::throw_delayed_StackOverflowError));
3847
3848    if (UseCRC32Intrinsics) {
3849      // set table address before stub generation which use it
3850      StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
3851      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
3852    }
3853
3854    if (UseCRC32CIntrinsics) {
3855      bool supports_clmul = VM_Version::supports_clmul();
3856      StubRoutines::x86::generate_CRC32C_table(supports_clmul);
3857      StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
3858      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
3859    }
3860    if (VM_Version::supports_sse2() && UseLibmIntrinsic && InlineIntrinsics) {
3861      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
3862          vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos) ||
3863          vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
3864        StubRoutines::x86::_L_2il0floatpacket_0_adr = (address)StubRoutines::x86::_L_2il0floatpacket_0;
3865        StubRoutines::x86::_Pi4Inv_adr = (address)StubRoutines::x86::_Pi4Inv;
3866        StubRoutines::x86::_Pi4x3_adr = (address)StubRoutines::x86::_Pi4x3;
3867        StubRoutines::x86::_Pi4x4_adr = (address)StubRoutines::x86::_Pi4x4;
3868        StubRoutines::x86::_ones_adr = (address)StubRoutines::x86::_ones;
3869      }
3870      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dexp)) {
3871        StubRoutines::_dexp = generate_libmExp();
3872      }
3873      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
3874        StubRoutines::_dlog = generate_libmLog();
3875      }
3876      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog10)) {
3877        StubRoutines::_dlog10 = generate_libmLog10();
3878      }
3879      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dpow)) {
3880        StubRoutines::_dpow = generate_libmPow();
3881      }
3882      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
3883        vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos) ||
3884        vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
3885        StubRoutines::_dlibm_reduce_pi04l = generate_libm_reduce_pi04l();
3886      }
3887      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
3888        vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
3889        StubRoutines::_dlibm_sin_cos_huge = generate_libm_sin_cos_huge();
3890      }
3891      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
3892        StubRoutines::_dsin = generate_libmSin();
3893      }
3894      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
3895        StubRoutines::_dcos = generate_libmCos();
3896      }
3897      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
3898        StubRoutines::_dlibm_tan_cot_huge = generate_libm_tan_cot_huge();
3899        StubRoutines::_dtan = generate_libmTan();
3900      }
3901    }
3902  }
3903
3904  void generate_all() {
3905    // Generates all stubs and initializes the entry points
3906
3907    // These entry points require SharedInfo::stack0 to be set up in non-core builds
3908    // and need to be relocatable, so they each fabricate a RuntimeStub internally.
3909    StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError));
3910    StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError));
3911    StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call));
3912
3913    //------------------------------------------------------------------------------------------------------------------------
3914    // entry points that are platform specific
3915
3916    // support for verify_oop (must happen after universe_init)
3917    StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
3918
3919    // arraycopy stubs used by compilers
3920    generate_arraycopy_stubs();
3921
3922    // don't bother generating these AES intrinsic stubs unless global flag is set
3923    if (UseAESIntrinsics) {
3924      StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // might be needed by the others
3925
3926      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
3927      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
3928      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
3929      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
3930    }
3931
3932    if (UseAESCTRIntrinsics) {
3933      StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();
3934      StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
3935    }
3936
3937    if (UseSHA1Intrinsics) {
3938      StubRoutines::x86::_upper_word_mask_addr = generate_upper_word_mask();
3939      StubRoutines::x86::_shuffle_byte_flip_mask_addr = generate_shuffle_byte_flip_mask();
3940      StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
3941      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
3942    }
3943    if (UseSHA256Intrinsics) {
3944      StubRoutines::x86::_k256_adr = (address)StubRoutines::x86::_k256;
3945      StubRoutines::x86::_pshuffle_byte_flip_mask_addr = generate_pshuffle_byte_flip_mask();
3946      StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
3947      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
3948    }
3949
3950    // Generate GHASH intrinsics code
3951    if (UseGHASHIntrinsics) {
3952      StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
3953      StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
3954      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
3955    }
3956
3957    // Safefetch stubs.
3958    generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
3959                                                   &StubRoutines::_safefetch32_fault_pc,
3960                                                   &StubRoutines::_safefetch32_continuation_pc);
3961    StubRoutines::_safefetchN_entry           = StubRoutines::_safefetch32_entry;
3962    StubRoutines::_safefetchN_fault_pc        = StubRoutines::_safefetch32_fault_pc;
3963    StubRoutines::_safefetchN_continuation_pc = StubRoutines::_safefetch32_continuation_pc;
3964  }
3965
3966
3967 public:
3968  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
3969    if (all) {
3970      generate_all();
3971    } else {
3972      generate_initial();
3973    }
3974  }
3975}; // end class declaration
3976
3977
3978void StubGenerator_generate(CodeBuffer* code, bool all) {
3979  StubGenerator g(code, all);
3980}
3981