1/*
2 * Copyright (c) 1999, 2016, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25#include "precompiled.hpp"
26#include "asm/macroAssembler.hpp"
27#include "asm/macroAssembler.inline.hpp"
28#include "interpreter/interpreter.hpp"
29#include "nativeInst_x86.hpp"
30#include "oops/instanceOop.hpp"
31#include "oops/method.hpp"
32#include "oops/objArrayKlass.hpp"
33#include "oops/oop.inline.hpp"
34#include "prims/methodHandles.hpp"
35#include "runtime/frame.inline.hpp"
36#include "runtime/handles.inline.hpp"
37#include "runtime/sharedRuntime.hpp"
38#include "runtime/stubCodeGenerator.hpp"
39#include "runtime/stubRoutines.hpp"
40#include "runtime/thread.inline.hpp"
41#ifdef COMPILER2
42#include "opto/runtime.hpp"
43#endif
44
45// Declaration and definition of StubGenerator (no .hpp file).
46// For a more detailed description of the stub routine structure
47// see the comment in stubRoutines.hpp
48
49#define __ _masm->
50#define a__ ((Assembler*)_masm)->
51
52#ifdef PRODUCT
53#define BLOCK_COMMENT(str) /* nothing */
54#else
55#define BLOCK_COMMENT(str) __ block_comment(str)
56#endif
57
58#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
59
60const int MXCSR_MASK  = 0xFFC0;  // Mask out any pending exceptions
61const int FPU_CNTRL_WRD_MASK = 0xFFFF;
62
63// -------------------------------------------------------------------------------------------------------------------------
64// Stub Code definitions
65
66class StubGenerator: public StubCodeGenerator {
67 private:
68
69#ifdef PRODUCT
70#define inc_counter_np(counter) ((void)0)
71#else
72  void inc_counter_np_(int& counter) {
73    __ incrementl(ExternalAddress((address)&counter));
74  }
75#define inc_counter_np(counter) \
76  BLOCK_COMMENT("inc_counter " #counter); \
77  inc_counter_np_(counter);
78#endif //PRODUCT
79
80  void inc_copy_counter_np(BasicType t) {
81#ifndef PRODUCT
82    switch (t) {
83    case T_BYTE:    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); return;
84    case T_SHORT:   inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); return;
85    case T_INT:     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); return;
86    case T_LONG:    inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); return;
87    case T_OBJECT:  inc_counter_np(SharedRuntime::_oop_array_copy_ctr); return;
88    }
89    ShouldNotReachHere();
90#endif //PRODUCT
91  }
92
93  //------------------------------------------------------------------------------------------------------------------------
94  // Call stubs are used to call Java from C
95  //
96  //    [ return_from_Java     ] <--- rsp
97  //    [ argument word n      ]
98  //      ...
99  // -N [ argument word 1      ]
100  // -7 [ Possible padding for stack alignment ]
101  // -6 [ Possible padding for stack alignment ]
102  // -5 [ Possible padding for stack alignment ]
103  // -4 [ mxcsr save           ] <--- rsp_after_call
104  // -3 [ saved rbx,            ]
105  // -2 [ saved rsi            ]
106  // -1 [ saved rdi            ]
107  //  0 [ saved rbp,            ] <--- rbp,
108  //  1 [ return address       ]
109  //  2 [ ptr. to call wrapper ]
110  //  3 [ result               ]
111  //  4 [ result_type          ]
112  //  5 [ method               ]
113  //  6 [ entry_point          ]
114  //  7 [ parameters           ]
115  //  8 [ parameter_size       ]
116  //  9 [ thread               ]
117
118
119  address generate_call_stub(address& return_address) {
120    StubCodeMark mark(this, "StubRoutines", "call_stub");
121    address start = __ pc();
122
123    // stub code parameters / addresses
124    assert(frame::entry_frame_call_wrapper_offset == 2, "adjust this code");
125    bool  sse_save = false;
126    const Address rsp_after_call(rbp, -4 * wordSize); // same as in generate_catch_exception()!
127    const int     locals_count_in_bytes  (4*wordSize);
128    const Address mxcsr_save    (rbp, -4 * wordSize);
129    const Address saved_rbx     (rbp, -3 * wordSize);
130    const Address saved_rsi     (rbp, -2 * wordSize);
131    const Address saved_rdi     (rbp, -1 * wordSize);
132    const Address result        (rbp,  3 * wordSize);
133    const Address result_type   (rbp,  4 * wordSize);
134    const Address method        (rbp,  5 * wordSize);
135    const Address entry_point   (rbp,  6 * wordSize);
136    const Address parameters    (rbp,  7 * wordSize);
137    const Address parameter_size(rbp,  8 * wordSize);
138    const Address thread        (rbp,  9 * wordSize); // same as in generate_catch_exception()!
139    sse_save =  UseSSE > 0;
140
141    // stub code
142    __ enter();
143    __ movptr(rcx, parameter_size);              // parameter counter
144    __ shlptr(rcx, Interpreter::logStackElementSize); // convert parameter count to bytes
145    __ addptr(rcx, locals_count_in_bytes);       // reserve space for register saves
146    __ subptr(rsp, rcx);
147    __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
148
149    // save rdi, rsi, & rbx, according to C calling conventions
150    __ movptr(saved_rdi, rdi);
151    __ movptr(saved_rsi, rsi);
152    __ movptr(saved_rbx, rbx);
153
154    // provide initial value for required masks
155    if (UseAVX > 2) {
156      __ movl(rbx, 0xffff);
157      __ kmovwl(k1, rbx);
158    }
159
160    // save and initialize %mxcsr
161    if (sse_save) {
162      Label skip_ldmx;
163      __ stmxcsr(mxcsr_save);
164      __ movl(rax, mxcsr_save);
165      __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
166      ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
167      __ cmp32(rax, mxcsr_std);
168      __ jcc(Assembler::equal, skip_ldmx);
169      __ ldmxcsr(mxcsr_std);
170      __ bind(skip_ldmx);
171    }
172
173    // make sure the control word is correct.
174    __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
175
176#ifdef ASSERT
177    // make sure we have no pending exceptions
178    { Label L;
179      __ movptr(rcx, thread);
180      __ cmpptr(Address(rcx, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
181      __ jcc(Assembler::equal, L);
182      __ stop("StubRoutines::call_stub: entered with pending exception");
183      __ bind(L);
184    }
185#endif
186
187    // pass parameters if any
188    BLOCK_COMMENT("pass parameters if any");
189    Label parameters_done;
190    __ movl(rcx, parameter_size);  // parameter counter
191    __ testl(rcx, rcx);
192    __ jcc(Assembler::zero, parameters_done);
193
194    // parameter passing loop
195
196    Label loop;
197    // Copy Java parameters in reverse order (receiver last)
198    // Note that the argument order is inverted in the process
199    // source is rdx[rcx: N-1..0]
200    // dest   is rsp[rbx: 0..N-1]
201
202    __ movptr(rdx, parameters);          // parameter pointer
203    __ xorptr(rbx, rbx);
204
205    __ BIND(loop);
206
207    // get parameter
208    __ movptr(rax, Address(rdx, rcx, Interpreter::stackElementScale(), -wordSize));
209    __ movptr(Address(rsp, rbx, Interpreter::stackElementScale(),
210                    Interpreter::expr_offset_in_bytes(0)), rax);          // store parameter
211    __ increment(rbx);
212    __ decrement(rcx);
213    __ jcc(Assembler::notZero, loop);
214
215    // call Java function
216    __ BIND(parameters_done);
217    __ movptr(rbx, method);           // get Method*
218    __ movptr(rax, entry_point);      // get entry_point
219    __ mov(rsi, rsp);                 // set sender sp
220    BLOCK_COMMENT("call Java function");
221    __ call(rax);
222
223    BLOCK_COMMENT("call_stub_return_address:");
224    return_address = __ pc();
225
226#ifdef COMPILER2
227    {
228      Label L_skip;
229      if (UseSSE >= 2) {
230        __ verify_FPU(0, "call_stub_return");
231      } else {
232        for (int i = 1; i < 8; i++) {
233          __ ffree(i);
234        }
235
236        // UseSSE <= 1 so double result should be left on TOS
237        __ movl(rsi, result_type);
238        __ cmpl(rsi, T_DOUBLE);
239        __ jcc(Assembler::equal, L_skip);
240        if (UseSSE == 0) {
241          // UseSSE == 0 so float result should be left on TOS
242          __ cmpl(rsi, T_FLOAT);
243          __ jcc(Assembler::equal, L_skip);
244        }
245        __ ffree(0);
246      }
247      __ BIND(L_skip);
248    }
249#endif // COMPILER2
250
251    // store result depending on type
252    // (everything that is not T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
253    __ movptr(rdi, result);
254    Label is_long, is_float, is_double, exit;
255    __ movl(rsi, result_type);
256    __ cmpl(rsi, T_LONG);
257    __ jcc(Assembler::equal, is_long);
258    __ cmpl(rsi, T_FLOAT);
259    __ jcc(Assembler::equal, is_float);
260    __ cmpl(rsi, T_DOUBLE);
261    __ jcc(Assembler::equal, is_double);
262
263    // handle T_INT case
264    __ movl(Address(rdi, 0), rax);
265    __ BIND(exit);
266
267    // check that FPU stack is empty
268    __ verify_FPU(0, "generate_call_stub");
269
270    // pop parameters
271    __ lea(rsp, rsp_after_call);
272
273    // restore %mxcsr
274    if (sse_save) {
275      __ ldmxcsr(mxcsr_save);
276    }
277
278    // restore rdi, rsi and rbx,
279    __ movptr(rbx, saved_rbx);
280    __ movptr(rsi, saved_rsi);
281    __ movptr(rdi, saved_rdi);
282    __ addptr(rsp, 4*wordSize);
283
284    // return
285    __ pop(rbp);
286    __ ret(0);
287
288    // handle return types different from T_INT
289    __ BIND(is_long);
290    __ movl(Address(rdi, 0 * wordSize), rax);
291    __ movl(Address(rdi, 1 * wordSize), rdx);
292    __ jmp(exit);
293
294    __ BIND(is_float);
295    // interpreter uses xmm0 for return values
296    if (UseSSE >= 1) {
297      __ movflt(Address(rdi, 0), xmm0);
298    } else {
299      __ fstp_s(Address(rdi, 0));
300    }
301    __ jmp(exit);
302
303    __ BIND(is_double);
304    // interpreter uses xmm0 for return values
305    if (UseSSE >= 2) {
306      __ movdbl(Address(rdi, 0), xmm0);
307    } else {
308      __ fstp_d(Address(rdi, 0));
309    }
310    __ jmp(exit);
311
312    return start;
313  }
314
315
316  //------------------------------------------------------------------------------------------------------------------------
317  // Return point for a Java call if there's an exception thrown in Java code.
318  // The exception is caught and transformed into a pending exception stored in
319  // JavaThread that can be tested from within the VM.
320  //
321  // Note: Usually the parameters are removed by the callee. In case of an exception
322  //       crossing an activation frame boundary, that is not the case if the callee
323  //       is compiled code => need to setup the rsp.
324  //
325  // rax,: exception oop
326
327  address generate_catch_exception() {
328    StubCodeMark mark(this, "StubRoutines", "catch_exception");
329    const Address rsp_after_call(rbp, -4 * wordSize); // same as in generate_call_stub()!
330    const Address thread        (rbp,  9 * wordSize); // same as in generate_call_stub()!
331    address start = __ pc();
332
333    // get thread directly
334    __ movptr(rcx, thread);
335#ifdef ASSERT
336    // verify that threads correspond
337    { Label L;
338      __ get_thread(rbx);
339      __ cmpptr(rbx, rcx);
340      __ jcc(Assembler::equal, L);
341      __ stop("StubRoutines::catch_exception: threads must correspond");
342      __ bind(L);
343    }
344#endif
345    // set pending exception
346    __ verify_oop(rax);
347    __ movptr(Address(rcx, Thread::pending_exception_offset()), rax          );
348    __ lea(Address(rcx, Thread::exception_file_offset   ()),
349           ExternalAddress((address)__FILE__));
350    __ movl(Address(rcx, Thread::exception_line_offset   ()), __LINE__ );
351    // complete return to VM
352    assert(StubRoutines::_call_stub_return_address != NULL, "_call_stub_return_address must have been generated before");
353    __ jump(RuntimeAddress(StubRoutines::_call_stub_return_address));
354
355    return start;
356  }
357
358
359  //------------------------------------------------------------------------------------------------------------------------
360  // Continuation point for runtime calls returning with a pending exception.
361  // The pending exception check happened in the runtime or native call stub.
362  // The pending exception in Thread is converted into a Java-level exception.
363  //
364  // Contract with Java-level exception handlers:
365  // rax: exception
366  // rdx: throwing pc
367  //
368  // NOTE: At entry of this stub, exception-pc must be on stack !!
369
370  address generate_forward_exception() {
371    StubCodeMark mark(this, "StubRoutines", "forward exception");
372    address start = __ pc();
373    const Register thread = rcx;
374
375    // other registers used in this stub
376    const Register exception_oop = rax;
377    const Register handler_addr  = rbx;
378    const Register exception_pc  = rdx;
379
380    // Upon entry, the sp points to the return address returning into Java
381    // (interpreted or compiled) code; i.e., the return address becomes the
382    // throwing pc.
383    //
384    // Arguments pushed before the runtime call are still on the stack but
385    // the exception handler will reset the stack pointer -> ignore them.
386    // A potential result in registers can be ignored as well.
387
388#ifdef ASSERT
389    // make sure this code is only executed if there is a pending exception
390    { Label L;
391      __ get_thread(thread);
392      __ cmpptr(Address(thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
393      __ jcc(Assembler::notEqual, L);
394      __ stop("StubRoutines::forward exception: no pending exception (1)");
395      __ bind(L);
396    }
397#endif
398
399    // compute exception handler into rbx,
400    __ get_thread(thread);
401    __ movptr(exception_pc, Address(rsp, 0));
402    BLOCK_COMMENT("call exception_handler_for_return_address");
403    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), thread, exception_pc);
404    __ mov(handler_addr, rax);
405
406    // setup rax & rdx, remove return address & clear pending exception
407    __ get_thread(thread);
408    __ pop(exception_pc);
409    __ movptr(exception_oop, Address(thread, Thread::pending_exception_offset()));
410    __ movptr(Address(thread, Thread::pending_exception_offset()), NULL_WORD);
411
412#ifdef ASSERT
413    // make sure exception is set
414    { Label L;
415      __ testptr(exception_oop, exception_oop);
416      __ jcc(Assembler::notEqual, L);
417      __ stop("StubRoutines::forward exception: no pending exception (2)");
418      __ bind(L);
419    }
420#endif
421
422    // Verify that there is really a valid exception in RAX.
423    __ verify_oop(exception_oop);
424
425    // continue at exception handler (return address removed)
426    // rax: exception
427    // rbx: exception handler
428    // rdx: throwing pc
429    __ jmp(handler_addr);
430
431    return start;
432  }
433
434
435  //----------------------------------------------------------------------------------------------------
436  // Support for jint Atomic::xchg(jint exchange_value, volatile jint* dest)
437  //
438  // xchg exists as far back as 8086, lock needed for MP only
439  // Stack layout immediately after call:
440  //
441  // 0 [ret addr ] <--- rsp
442  // 1 [  ex     ]
443  // 2 [  dest   ]
444  //
445  // Result:   *dest <- ex, return (old *dest)
446  //
447  // Note: win32 does not currently use this code
448
449  address generate_atomic_xchg() {
450    StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
451    address start = __ pc();
452
453    __ push(rdx);
454    Address exchange(rsp, 2 * wordSize);
455    Address dest_addr(rsp, 3 * wordSize);
456    __ movl(rax, exchange);
457    __ movptr(rdx, dest_addr);
458    __ xchgl(rax, Address(rdx, 0));
459    __ pop(rdx);
460    __ ret(0);
461
462    return start;
463  }
464
465  //----------------------------------------------------------------------------------------------------
466  // Support for void verify_mxcsr()
467  //
468  // This routine is used with -Xcheck:jni to verify that native
469  // JNI code does not return to Java code without restoring the
470  // MXCSR register to our expected state.
471
472
473  address generate_verify_mxcsr() {
474    StubCodeMark mark(this, "StubRoutines", "verify_mxcsr");
475    address start = __ pc();
476
477    const Address mxcsr_save(rsp, 0);
478
479    if (CheckJNICalls && UseSSE > 0 ) {
480      Label ok_ret;
481      ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
482      __ push(rax);
483      __ subptr(rsp, wordSize);      // allocate a temp location
484      __ stmxcsr(mxcsr_save);
485      __ movl(rax, mxcsr_save);
486      __ andl(rax, MXCSR_MASK);
487      __ cmp32(rax, mxcsr_std);
488      __ jcc(Assembler::equal, ok_ret);
489
490      __ warn("MXCSR changed by native JNI code.");
491
492      __ ldmxcsr(mxcsr_std);
493
494      __ bind(ok_ret);
495      __ addptr(rsp, wordSize);
496      __ pop(rax);
497    }
498
499    __ ret(0);
500
501    return start;
502  }
503
504
505  //---------------------------------------------------------------------------
506  // Support for void verify_fpu_cntrl_wrd()
507  //
508  // This routine is used with -Xcheck:jni to verify that native
509  // JNI code does not return to Java code without restoring the
510  // FP control word to our expected state.
511
512  address generate_verify_fpu_cntrl_wrd() {
513    StubCodeMark mark(this, "StubRoutines", "verify_spcw");
514    address start = __ pc();
515
516    const Address fpu_cntrl_wrd_save(rsp, 0);
517
518    if (CheckJNICalls) {
519      Label ok_ret;
520      __ push(rax);
521      __ subptr(rsp, wordSize);      // allocate a temp location
522      __ fnstcw(fpu_cntrl_wrd_save);
523      __ movl(rax, fpu_cntrl_wrd_save);
524      __ andl(rax, FPU_CNTRL_WRD_MASK);
525      ExternalAddress fpu_std(StubRoutines::addr_fpu_cntrl_wrd_std());
526      __ cmp32(rax, fpu_std);
527      __ jcc(Assembler::equal, ok_ret);
528
529      __ warn("Floating point control word changed by native JNI code.");
530
531      __ fldcw(fpu_std);
532
533      __ bind(ok_ret);
534      __ addptr(rsp, wordSize);
535      __ pop(rax);
536    }
537
538    __ ret(0);
539
540    return start;
541  }
542
543  //---------------------------------------------------------------------------
544  // Wrapper for slow-case handling of double-to-integer conversion
545  // d2i or f2i fast case failed either because it is nan or because
546  // of under/overflow.
547  // Input:  FPU TOS: float value
548  // Output: rax, (rdx): integer (long) result
549
550  address generate_d2i_wrapper(BasicType t, address fcn) {
551    StubCodeMark mark(this, "StubRoutines", "d2i_wrapper");
552    address start = __ pc();
553
554  // Capture info about frame layout
555  enum layout { FPUState_off         = 0,
556                rbp_off              = FPUStateSizeInWords,
557                rdi_off,
558                rsi_off,
559                rcx_off,
560                rbx_off,
561                saved_argument_off,
562                saved_argument_off2, // 2nd half of double
563                framesize
564  };
565
566  assert(FPUStateSizeInWords == 27, "update stack layout");
567
568    // Save outgoing argument to stack across push_FPU_state()
569    __ subptr(rsp, wordSize * 2);
570    __ fstp_d(Address(rsp, 0));
571
572    // Save CPU & FPU state
573    __ push(rbx);
574    __ push(rcx);
575    __ push(rsi);
576    __ push(rdi);
577    __ push(rbp);
578    __ push_FPU_state();
579
580    // push_FPU_state() resets the FP top of stack
581    // Load original double into FP top of stack
582    __ fld_d(Address(rsp, saved_argument_off * wordSize));
583    // Store double into stack as outgoing argument
584    __ subptr(rsp, wordSize*2);
585    __ fst_d(Address(rsp, 0));
586
587    // Prepare FPU for doing math in C-land
588    __ empty_FPU_stack();
589    // Call the C code to massage the double.  Result in EAX
590    if (t == T_INT)
591      { BLOCK_COMMENT("SharedRuntime::d2i"); }
592    else if (t == T_LONG)
593      { BLOCK_COMMENT("SharedRuntime::d2l"); }
594    __ call_VM_leaf( fcn, 2 );
595
596    // Restore CPU & FPU state
597    __ pop_FPU_state();
598    __ pop(rbp);
599    __ pop(rdi);
600    __ pop(rsi);
601    __ pop(rcx);
602    __ pop(rbx);
603    __ addptr(rsp, wordSize * 2);
604
605    __ ret(0);
606
607    return start;
608  }
609
610
611  //----------------------------------------------------------------------------------------------------
612  // Non-destructive plausibility checks for oops
613
614  address generate_verify_oop() {
615    StubCodeMark mark(this, "StubRoutines", "verify_oop");
616    address start = __ pc();
617
618    // Incoming arguments on stack after saving rax,:
619    //
620    // [tos    ]: saved rdx
621    // [tos + 1]: saved EFLAGS
622    // [tos + 2]: return address
623    // [tos + 3]: char* error message
624    // [tos + 4]: oop   object to verify
625    // [tos + 5]: saved rax, - saved by caller and bashed
626
627    Label exit, error;
628    __ pushf();
629    __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
630    __ push(rdx);                                // save rdx
631    // make sure object is 'reasonable'
632    __ movptr(rax, Address(rsp, 4 * wordSize));    // get object
633    __ testptr(rax, rax);
634    __ jcc(Assembler::zero, exit);               // if obj is NULL it is ok
635
636    // Check if the oop is in the right area of memory
637    const int oop_mask = Universe::verify_oop_mask();
638    const int oop_bits = Universe::verify_oop_bits();
639    __ mov(rdx, rax);
640    __ andptr(rdx, oop_mask);
641    __ cmpptr(rdx, oop_bits);
642    __ jcc(Assembler::notZero, error);
643
644    // make sure klass is 'reasonable', which is not zero.
645    __ movptr(rax, Address(rax, oopDesc::klass_offset_in_bytes())); // get klass
646    __ testptr(rax, rax);
647    __ jcc(Assembler::zero, error);              // if klass is NULL it is broken
648
649    // return if everything seems ok
650    __ bind(exit);
651    __ movptr(rax, Address(rsp, 5 * wordSize));  // get saved rax, back
652    __ pop(rdx);                                 // restore rdx
653    __ popf();                                   // restore EFLAGS
654    __ ret(3 * wordSize);                        // pop arguments
655
656    // handle errors
657    __ bind(error);
658    __ movptr(rax, Address(rsp, 5 * wordSize));  // get saved rax, back
659    __ pop(rdx);                                 // get saved rdx back
660    __ popf();                                   // get saved EFLAGS off stack -- will be ignored
661    __ pusha();                                  // push registers (eip = return address & msg are already pushed)
662    BLOCK_COMMENT("call MacroAssembler::debug");
663    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
664    __ popa();
665    __ ret(3 * wordSize);                        // pop arguments
666    return start;
667  }
668
669  //
670  //  Generate pre-barrier for array stores
671  //
672  //  Input:
673  //     start   -  starting address
674  //     count   -  element count
675  void  gen_write_ref_array_pre_barrier(Register start, Register count, bool uninitialized_target) {
676    assert_different_registers(start, count);
677    BarrierSet* bs = Universe::heap()->barrier_set();
678    switch (bs->kind()) {
679      case BarrierSet::G1SATBCTLogging:
680        // With G1, don't generate the call if we statically know that the target in uninitialized
681        if (!uninitialized_target) {
682           __ pusha();                      // push registers
683           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre),
684                           start, count);
685           __ popa();
686         }
687        break;
688      case BarrierSet::CardTableForRS:
689      case BarrierSet::CardTableExtension:
690      case BarrierSet::ModRef:
691        break;
692      default      :
693        ShouldNotReachHere();
694
695    }
696  }
697
698
699  //
700  // Generate a post-barrier for an array store
701  //
702  //     start    -  starting address
703  //     count    -  element count
704  //
705  //  The two input registers are overwritten.
706  //
707  void  gen_write_ref_array_post_barrier(Register start, Register count) {
708    BarrierSet* bs = Universe::heap()->barrier_set();
709    assert_different_registers(start, count);
710    switch (bs->kind()) {
711      case BarrierSet::G1SATBCTLogging:
712        {
713          __ pusha();                      // push registers
714          __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post),
715                          start, count);
716          __ popa();
717        }
718        break;
719
720      case BarrierSet::CardTableForRS:
721      case BarrierSet::CardTableExtension:
722        {
723          CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs);
724          assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
725
726          Label L_loop;
727          const Register end = count;  // elements count; end == start+count-1
728          assert_different_registers(start, end);
729
730          __ lea(end,  Address(start, count, Address::times_ptr, -wordSize));
731          __ shrptr(start, CardTableModRefBS::card_shift);
732          __ shrptr(end,   CardTableModRefBS::card_shift);
733          __ subptr(end, start); // end --> count
734        __ BIND(L_loop);
735          intptr_t disp = (intptr_t) ct->byte_map_base;
736          Address cardtable(start, count, Address::times_1, disp);
737          __ movb(cardtable, 0);
738          __ decrement(count);
739          __ jcc(Assembler::greaterEqual, L_loop);
740        }
741        break;
742      case BarrierSet::ModRef:
743        break;
744      default      :
745        ShouldNotReachHere();
746
747    }
748  }
749
750
751  // Copy 64 bytes chunks
752  //
753  // Inputs:
754  //   from        - source array address
755  //   to_from     - destination array address - from
756  //   qword_count - 8-bytes element count, negative
757  //
758  void xmm_copy_forward(Register from, Register to_from, Register qword_count) {
759    assert( UseSSE >= 2, "supported cpu only" );
760    Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
761    if (UseAVX > 2) {
762      __ push(rbx);
763      __ movl(rbx, 0xffff);
764      __ kmovwl(k1, rbx);
765      __ pop(rbx);
766    }
767    // Copy 64-byte chunks
768    __ jmpb(L_copy_64_bytes);
769    __ align(OptoLoopAlignment);
770  __ BIND(L_copy_64_bytes_loop);
771
772    if (UseUnalignedLoadStores) {
773      if (UseAVX > 2) {
774        __ evmovdqul(xmm0, Address(from, 0), Assembler::AVX_512bit);
775        __ evmovdqul(Address(from, to_from, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
776      } else if (UseAVX == 2) {
777        __ vmovdqu(xmm0, Address(from,  0));
778        __ vmovdqu(Address(from, to_from, Address::times_1,  0), xmm0);
779        __ vmovdqu(xmm1, Address(from, 32));
780        __ vmovdqu(Address(from, to_from, Address::times_1, 32), xmm1);
781      } else {
782        __ movdqu(xmm0, Address(from, 0));
783        __ movdqu(Address(from, to_from, Address::times_1, 0), xmm0);
784        __ movdqu(xmm1, Address(from, 16));
785        __ movdqu(Address(from, to_from, Address::times_1, 16), xmm1);
786        __ movdqu(xmm2, Address(from, 32));
787        __ movdqu(Address(from, to_from, Address::times_1, 32), xmm2);
788        __ movdqu(xmm3, Address(from, 48));
789        __ movdqu(Address(from, to_from, Address::times_1, 48), xmm3);
790      }
791    } else {
792      __ movq(xmm0, Address(from, 0));
793      __ movq(Address(from, to_from, Address::times_1, 0), xmm0);
794      __ movq(xmm1, Address(from, 8));
795      __ movq(Address(from, to_from, Address::times_1, 8), xmm1);
796      __ movq(xmm2, Address(from, 16));
797      __ movq(Address(from, to_from, Address::times_1, 16), xmm2);
798      __ movq(xmm3, Address(from, 24));
799      __ movq(Address(from, to_from, Address::times_1, 24), xmm3);
800      __ movq(xmm4, Address(from, 32));
801      __ movq(Address(from, to_from, Address::times_1, 32), xmm4);
802      __ movq(xmm5, Address(from, 40));
803      __ movq(Address(from, to_from, Address::times_1, 40), xmm5);
804      __ movq(xmm6, Address(from, 48));
805      __ movq(Address(from, to_from, Address::times_1, 48), xmm6);
806      __ movq(xmm7, Address(from, 56));
807      __ movq(Address(from, to_from, Address::times_1, 56), xmm7);
808    }
809
810    __ addl(from, 64);
811  __ BIND(L_copy_64_bytes);
812    __ subl(qword_count, 8);
813    __ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop);
814
815    if (UseUnalignedLoadStores && (UseAVX == 2)) {
816      // clean upper bits of YMM registers
817      __ vpxor(xmm0, xmm0);
818      __ vpxor(xmm1, xmm1);
819    }
820    __ addl(qword_count, 8);
821    __ jccb(Assembler::zero, L_exit);
822    //
823    // length is too short, just copy qwords
824    //
825  __ BIND(L_copy_8_bytes);
826    __ movq(xmm0, Address(from, 0));
827    __ movq(Address(from, to_from, Address::times_1), xmm0);
828    __ addl(from, 8);
829    __ decrement(qword_count);
830    __ jcc(Assembler::greater, L_copy_8_bytes);
831  __ BIND(L_exit);
832  }
833
834  // Copy 64 bytes chunks
835  //
836  // Inputs:
837  //   from        - source array address
838  //   to_from     - destination array address - from
839  //   qword_count - 8-bytes element count, negative
840  //
841  void mmx_copy_forward(Register from, Register to_from, Register qword_count) {
842    assert( VM_Version::supports_mmx(), "supported cpu only" );
843    Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
844    // Copy 64-byte chunks
845    __ jmpb(L_copy_64_bytes);
846    __ align(OptoLoopAlignment);
847  __ BIND(L_copy_64_bytes_loop);
848    __ movq(mmx0, Address(from, 0));
849    __ movq(mmx1, Address(from, 8));
850    __ movq(mmx2, Address(from, 16));
851    __ movq(Address(from, to_from, Address::times_1, 0), mmx0);
852    __ movq(mmx3, Address(from, 24));
853    __ movq(Address(from, to_from, Address::times_1, 8), mmx1);
854    __ movq(mmx4, Address(from, 32));
855    __ movq(Address(from, to_from, Address::times_1, 16), mmx2);
856    __ movq(mmx5, Address(from, 40));
857    __ movq(Address(from, to_from, Address::times_1, 24), mmx3);
858    __ movq(mmx6, Address(from, 48));
859    __ movq(Address(from, to_from, Address::times_1, 32), mmx4);
860    __ movq(mmx7, Address(from, 56));
861    __ movq(Address(from, to_from, Address::times_1, 40), mmx5);
862    __ movq(Address(from, to_from, Address::times_1, 48), mmx6);
863    __ movq(Address(from, to_from, Address::times_1, 56), mmx7);
864    __ addptr(from, 64);
865  __ BIND(L_copy_64_bytes);
866    __ subl(qword_count, 8);
867    __ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop);
868    __ addl(qword_count, 8);
869    __ jccb(Assembler::zero, L_exit);
870    //
871    // length is too short, just copy qwords
872    //
873  __ BIND(L_copy_8_bytes);
874    __ movq(mmx0, Address(from, 0));
875    __ movq(Address(from, to_from, Address::times_1), mmx0);
876    __ addptr(from, 8);
877    __ decrement(qword_count);
878    __ jcc(Assembler::greater, L_copy_8_bytes);
879  __ BIND(L_exit);
880    __ emms();
881  }
882
883  address generate_disjoint_copy(BasicType t, bool aligned,
884                                 Address::ScaleFactor sf,
885                                 address* entry, const char *name,
886                                 bool dest_uninitialized = false) {
887    __ align(CodeEntryAlignment);
888    StubCodeMark mark(this, "StubRoutines", name);
889    address start = __ pc();
890
891    Label L_0_count, L_exit, L_skip_align1, L_skip_align2, L_copy_byte;
892    Label L_copy_2_bytes, L_copy_4_bytes, L_copy_64_bytes;
893
894    int shift = Address::times_ptr - sf;
895
896    const Register from     = rsi;  // source array address
897    const Register to       = rdi;  // destination array address
898    const Register count    = rcx;  // elements count
899    const Register to_from  = to;   // (to - from)
900    const Register saved_to = rdx;  // saved destination array address
901
902    __ enter(); // required for proper stackwalking of RuntimeStub frame
903    __ push(rsi);
904    __ push(rdi);
905    __ movptr(from , Address(rsp, 12+ 4));
906    __ movptr(to   , Address(rsp, 12+ 8));
907    __ movl(count, Address(rsp, 12+ 12));
908
909    if (entry != NULL) {
910      *entry = __ pc(); // Entry point from conjoint arraycopy stub.
911      BLOCK_COMMENT("Entry:");
912    }
913
914    if (t == T_OBJECT) {
915      __ testl(count, count);
916      __ jcc(Assembler::zero, L_0_count);
917      gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
918      __ mov(saved_to, to);          // save 'to'
919    }
920
921    __ subptr(to, from); // to --> to_from
922    __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
923    __ jcc(Assembler::below, L_copy_4_bytes); // use unsigned cmp
924    if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
925      // align source address at 4 bytes address boundary
926      if (t == T_BYTE) {
927        // One byte misalignment happens only for byte arrays
928        __ testl(from, 1);
929        __ jccb(Assembler::zero, L_skip_align1);
930        __ movb(rax, Address(from, 0));
931        __ movb(Address(from, to_from, Address::times_1, 0), rax);
932        __ increment(from);
933        __ decrement(count);
934      __ BIND(L_skip_align1);
935      }
936      // Two bytes misalignment happens only for byte and short (char) arrays
937      __ testl(from, 2);
938      __ jccb(Assembler::zero, L_skip_align2);
939      __ movw(rax, Address(from, 0));
940      __ movw(Address(from, to_from, Address::times_1, 0), rax);
941      __ addptr(from, 2);
942      __ subl(count, 1<<(shift-1));
943    __ BIND(L_skip_align2);
944    }
945    if (!VM_Version::supports_mmx()) {
946      __ mov(rax, count);      // save 'count'
947      __ shrl(count, shift); // bytes count
948      __ addptr(to_from, from);// restore 'to'
949      __ rep_mov();
950      __ subptr(to_from, from);// restore 'to_from'
951      __ mov(count, rax);      // restore 'count'
952      __ jmpb(L_copy_2_bytes); // all dwords were copied
953    } else {
954      if (!UseUnalignedLoadStores) {
955        // align to 8 bytes, we know we are 4 byte aligned to start
956        __ testptr(from, 4);
957        __ jccb(Assembler::zero, L_copy_64_bytes);
958        __ movl(rax, Address(from, 0));
959        __ movl(Address(from, to_from, Address::times_1, 0), rax);
960        __ addptr(from, 4);
961        __ subl(count, 1<<shift);
962      }
963    __ BIND(L_copy_64_bytes);
964      __ mov(rax, count);
965      __ shrl(rax, shift+1);  // 8 bytes chunk count
966      //
967      // Copy 8-byte chunks through MMX registers, 8 per iteration of the loop
968      //
969      if (UseXMMForArrayCopy) {
970        xmm_copy_forward(from, to_from, rax);
971      } else {
972        mmx_copy_forward(from, to_from, rax);
973      }
974    }
975    // copy tailing dword
976  __ BIND(L_copy_4_bytes);
977    __ testl(count, 1<<shift);
978    __ jccb(Assembler::zero, L_copy_2_bytes);
979    __ movl(rax, Address(from, 0));
980    __ movl(Address(from, to_from, Address::times_1, 0), rax);
981    if (t == T_BYTE || t == T_SHORT) {
982      __ addptr(from, 4);
983    __ BIND(L_copy_2_bytes);
984      // copy tailing word
985      __ testl(count, 1<<(shift-1));
986      __ jccb(Assembler::zero, L_copy_byte);
987      __ movw(rax, Address(from, 0));
988      __ movw(Address(from, to_from, Address::times_1, 0), rax);
989      if (t == T_BYTE) {
990        __ addptr(from, 2);
991      __ BIND(L_copy_byte);
992        // copy tailing byte
993        __ testl(count, 1);
994        __ jccb(Assembler::zero, L_exit);
995        __ movb(rax, Address(from, 0));
996        __ movb(Address(from, to_from, Address::times_1, 0), rax);
997      __ BIND(L_exit);
998      } else {
999      __ BIND(L_copy_byte);
1000      }
1001    } else {
1002    __ BIND(L_copy_2_bytes);
1003    }
1004
1005    if (t == T_OBJECT) {
1006      __ movl(count, Address(rsp, 12+12)); // reread 'count'
1007      __ mov(to, saved_to); // restore 'to'
1008      gen_write_ref_array_post_barrier(to, count);
1009    __ BIND(L_0_count);
1010    }
1011    inc_copy_counter_np(t);
1012    __ pop(rdi);
1013    __ pop(rsi);
1014    __ leave(); // required for proper stackwalking of RuntimeStub frame
1015    __ vzeroupper();
1016    __ xorptr(rax, rax); // return 0
1017    __ ret(0);
1018    return start;
1019  }
1020
1021
1022  address generate_fill(BasicType t, bool aligned, const char *name) {
1023    __ align(CodeEntryAlignment);
1024    StubCodeMark mark(this, "StubRoutines", name);
1025    address start = __ pc();
1026
1027    BLOCK_COMMENT("Entry:");
1028
1029    const Register to       = rdi;  // source array address
1030    const Register value    = rdx;  // value
1031    const Register count    = rsi;  // elements count
1032
1033    __ enter(); // required for proper stackwalking of RuntimeStub frame
1034    __ push(rsi);
1035    __ push(rdi);
1036    __ movptr(to   , Address(rsp, 12+ 4));
1037    __ movl(value, Address(rsp, 12+ 8));
1038    __ movl(count, Address(rsp, 12+ 12));
1039
1040    __ generate_fill(t, aligned, to, value, count, rax, xmm0);
1041
1042    __ pop(rdi);
1043    __ pop(rsi);
1044    __ leave(); // required for proper stackwalking of RuntimeStub frame
1045    __ ret(0);
1046    return start;
1047  }
1048
1049  address generate_conjoint_copy(BasicType t, bool aligned,
1050                                 Address::ScaleFactor sf,
1051                                 address nooverlap_target,
1052                                 address* entry, const char *name,
1053                                 bool dest_uninitialized = false) {
1054    __ align(CodeEntryAlignment);
1055    StubCodeMark mark(this, "StubRoutines", name);
1056    address start = __ pc();
1057
1058    Label L_0_count, L_exit, L_skip_align1, L_skip_align2, L_copy_byte;
1059    Label L_copy_2_bytes, L_copy_4_bytes, L_copy_8_bytes, L_copy_8_bytes_loop;
1060
1061    int shift = Address::times_ptr - sf;
1062
1063    const Register src   = rax;  // source array address
1064    const Register dst   = rdx;  // destination array address
1065    const Register from  = rsi;  // source array address
1066    const Register to    = rdi;  // destination array address
1067    const Register count = rcx;  // elements count
1068    const Register end   = rax;  // array end address
1069
1070    __ enter(); // required for proper stackwalking of RuntimeStub frame
1071    __ push(rsi);
1072    __ push(rdi);
1073    __ movptr(src  , Address(rsp, 12+ 4));   // from
1074    __ movptr(dst  , Address(rsp, 12+ 8));   // to
1075    __ movl2ptr(count, Address(rsp, 12+12)); // count
1076
1077    if (entry != NULL) {
1078      *entry = __ pc(); // Entry point from generic arraycopy stub.
1079      BLOCK_COMMENT("Entry:");
1080    }
1081
1082    // nooverlap_target expects arguments in rsi and rdi.
1083    __ mov(from, src);
1084    __ mov(to  , dst);
1085
1086    // arrays overlap test: dispatch to disjoint stub if necessary.
1087    RuntimeAddress nooverlap(nooverlap_target);
1088    __ cmpptr(dst, src);
1089    __ lea(end, Address(src, count, sf, 0)); // src + count * elem_size
1090    __ jump_cc(Assembler::belowEqual, nooverlap);
1091    __ cmpptr(dst, end);
1092    __ jump_cc(Assembler::aboveEqual, nooverlap);
1093
1094    if (t == T_OBJECT) {
1095      __ testl(count, count);
1096      __ jcc(Assembler::zero, L_0_count);
1097      gen_write_ref_array_pre_barrier(dst, count, dest_uninitialized);
1098    }
1099
1100    // copy from high to low
1101    __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
1102    __ jcc(Assembler::below, L_copy_4_bytes); // use unsigned cmp
1103    if (t == T_BYTE || t == T_SHORT) {
1104      // Align the end of destination array at 4 bytes address boundary
1105      __ lea(end, Address(dst, count, sf, 0));
1106      if (t == T_BYTE) {
1107        // One byte misalignment happens only for byte arrays
1108        __ testl(end, 1);
1109        __ jccb(Assembler::zero, L_skip_align1);
1110        __ decrement(count);
1111        __ movb(rdx, Address(from, count, sf, 0));
1112        __ movb(Address(to, count, sf, 0), rdx);
1113      __ BIND(L_skip_align1);
1114      }
1115      // Two bytes misalignment happens only for byte and short (char) arrays
1116      __ testl(end, 2);
1117      __ jccb(Assembler::zero, L_skip_align2);
1118      __ subptr(count, 1<<(shift-1));
1119      __ movw(rdx, Address(from, count, sf, 0));
1120      __ movw(Address(to, count, sf, 0), rdx);
1121    __ BIND(L_skip_align2);
1122      __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
1123      __ jcc(Assembler::below, L_copy_4_bytes);
1124    }
1125
1126    if (!VM_Version::supports_mmx()) {
1127      __ std();
1128      __ mov(rax, count); // Save 'count'
1129      __ mov(rdx, to);    // Save 'to'
1130      __ lea(rsi, Address(from, count, sf, -4));
1131      __ lea(rdi, Address(to  , count, sf, -4));
1132      __ shrptr(count, shift); // bytes count
1133      __ rep_mov();
1134      __ cld();
1135      __ mov(count, rax); // restore 'count'
1136      __ andl(count, (1<<shift)-1);      // mask the number of rest elements
1137      __ movptr(from, Address(rsp, 12+4)); // reread 'from'
1138      __ mov(to, rdx);   // restore 'to'
1139      __ jmpb(L_copy_2_bytes); // all dword were copied
1140   } else {
1141      // Align to 8 bytes the end of array. It is aligned to 4 bytes already.
1142      __ testptr(end, 4);
1143      __ jccb(Assembler::zero, L_copy_8_bytes);
1144      __ subl(count, 1<<shift);
1145      __ movl(rdx, Address(from, count, sf, 0));
1146      __ movl(Address(to, count, sf, 0), rdx);
1147      __ jmpb(L_copy_8_bytes);
1148
1149      __ align(OptoLoopAlignment);
1150      // Move 8 bytes
1151    __ BIND(L_copy_8_bytes_loop);
1152      if (UseXMMForArrayCopy) {
1153        __ movq(xmm0, Address(from, count, sf, 0));
1154        __ movq(Address(to, count, sf, 0), xmm0);
1155      } else {
1156        __ movq(mmx0, Address(from, count, sf, 0));
1157        __ movq(Address(to, count, sf, 0), mmx0);
1158      }
1159    __ BIND(L_copy_8_bytes);
1160      __ subl(count, 2<<shift);
1161      __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
1162      __ addl(count, 2<<shift);
1163      if (!UseXMMForArrayCopy) {
1164        __ emms();
1165      }
1166    }
1167  __ BIND(L_copy_4_bytes);
1168    // copy prefix qword
1169    __ testl(count, 1<<shift);
1170    __ jccb(Assembler::zero, L_copy_2_bytes);
1171    __ movl(rdx, Address(from, count, sf, -4));
1172    __ movl(Address(to, count, sf, -4), rdx);
1173
1174    if (t == T_BYTE || t == T_SHORT) {
1175        __ subl(count, (1<<shift));
1176      __ BIND(L_copy_2_bytes);
1177        // copy prefix dword
1178        __ testl(count, 1<<(shift-1));
1179        __ jccb(Assembler::zero, L_copy_byte);
1180        __ movw(rdx, Address(from, count, sf, -2));
1181        __ movw(Address(to, count, sf, -2), rdx);
1182        if (t == T_BYTE) {
1183          __ subl(count, 1<<(shift-1));
1184        __ BIND(L_copy_byte);
1185          // copy prefix byte
1186          __ testl(count, 1);
1187          __ jccb(Assembler::zero, L_exit);
1188          __ movb(rdx, Address(from, 0));
1189          __ movb(Address(to, 0), rdx);
1190        __ BIND(L_exit);
1191        } else {
1192        __ BIND(L_copy_byte);
1193        }
1194    } else {
1195    __ BIND(L_copy_2_bytes);
1196    }
1197    if (t == T_OBJECT) {
1198      __ movl2ptr(count, Address(rsp, 12+12)); // reread count
1199      gen_write_ref_array_post_barrier(to, count);
1200    __ BIND(L_0_count);
1201    }
1202    inc_copy_counter_np(t);
1203    __ pop(rdi);
1204    __ pop(rsi);
1205    __ leave(); // required for proper stackwalking of RuntimeStub frame
1206    __ xorptr(rax, rax); // return 0
1207    __ ret(0);
1208    return start;
1209  }
1210
1211
1212  address generate_disjoint_long_copy(address* entry, const char *name) {
1213    __ align(CodeEntryAlignment);
1214    StubCodeMark mark(this, "StubRoutines", name);
1215    address start = __ pc();
1216
1217    Label L_copy_8_bytes, L_copy_8_bytes_loop;
1218    const Register from       = rax;  // source array address
1219    const Register to         = rdx;  // destination array address
1220    const Register count      = rcx;  // elements count
1221    const Register to_from    = rdx;  // (to - from)
1222
1223    __ enter(); // required for proper stackwalking of RuntimeStub frame
1224    __ movptr(from , Address(rsp, 8+0));       // from
1225    __ movptr(to   , Address(rsp, 8+4));       // to
1226    __ movl2ptr(count, Address(rsp, 8+8));     // count
1227
1228    *entry = __ pc(); // Entry point from conjoint arraycopy stub.
1229    BLOCK_COMMENT("Entry:");
1230
1231    __ subptr(to, from); // to --> to_from
1232    if (VM_Version::supports_mmx()) {
1233      if (UseXMMForArrayCopy) {
1234        xmm_copy_forward(from, to_from, count);
1235      } else {
1236        mmx_copy_forward(from, to_from, count);
1237      }
1238    } else {
1239      __ jmpb(L_copy_8_bytes);
1240      __ align(OptoLoopAlignment);
1241    __ BIND(L_copy_8_bytes_loop);
1242      __ fild_d(Address(from, 0));
1243      __ fistp_d(Address(from, to_from, Address::times_1));
1244      __ addptr(from, 8);
1245    __ BIND(L_copy_8_bytes);
1246      __ decrement(count);
1247      __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
1248    }
1249    inc_copy_counter_np(T_LONG);
1250    __ leave(); // required for proper stackwalking of RuntimeStub frame
1251    __ vzeroupper();
1252    __ xorptr(rax, rax); // return 0
1253    __ ret(0);
1254    return start;
1255  }
1256
1257  address generate_conjoint_long_copy(address nooverlap_target,
1258                                      address* entry, const char *name) {
1259    __ align(CodeEntryAlignment);
1260    StubCodeMark mark(this, "StubRoutines", name);
1261    address start = __ pc();
1262
1263    Label L_copy_8_bytes, L_copy_8_bytes_loop;
1264    const Register from       = rax;  // source array address
1265    const Register to         = rdx;  // destination array address
1266    const Register count      = rcx;  // elements count
1267    const Register end_from   = rax;  // source array end address
1268
1269    __ enter(); // required for proper stackwalking of RuntimeStub frame
1270    __ movptr(from , Address(rsp, 8+0));       // from
1271    __ movptr(to   , Address(rsp, 8+4));       // to
1272    __ movl2ptr(count, Address(rsp, 8+8));     // count
1273
1274    *entry = __ pc(); // Entry point from generic arraycopy stub.
1275    BLOCK_COMMENT("Entry:");
1276
1277    // arrays overlap test
1278    __ cmpptr(to, from);
1279    RuntimeAddress nooverlap(nooverlap_target);
1280    __ jump_cc(Assembler::belowEqual, nooverlap);
1281    __ lea(end_from, Address(from, count, Address::times_8, 0));
1282    __ cmpptr(to, end_from);
1283    __ movptr(from, Address(rsp, 8));  // from
1284    __ jump_cc(Assembler::aboveEqual, nooverlap);
1285
1286    __ jmpb(L_copy_8_bytes);
1287
1288    __ align(OptoLoopAlignment);
1289  __ BIND(L_copy_8_bytes_loop);
1290    if (VM_Version::supports_mmx()) {
1291      if (UseXMMForArrayCopy) {
1292        __ movq(xmm0, Address(from, count, Address::times_8));
1293        __ movq(Address(to, count, Address::times_8), xmm0);
1294      } else {
1295        __ movq(mmx0, Address(from, count, Address::times_8));
1296        __ movq(Address(to, count, Address::times_8), mmx0);
1297      }
1298    } else {
1299      __ fild_d(Address(from, count, Address::times_8));
1300      __ fistp_d(Address(to, count, Address::times_8));
1301    }
1302  __ BIND(L_copy_8_bytes);
1303    __ decrement(count);
1304    __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
1305
1306    if (VM_Version::supports_mmx() && !UseXMMForArrayCopy) {
1307      __ emms();
1308    }
1309    inc_copy_counter_np(T_LONG);
1310    __ leave(); // required for proper stackwalking of RuntimeStub frame
1311    __ xorptr(rax, rax); // return 0
1312    __ ret(0);
1313    return start;
1314  }
1315
1316
1317  // Helper for generating a dynamic type check.
1318  // The sub_klass must be one of {rbx, rdx, rsi}.
1319  // The temp is killed.
1320  void generate_type_check(Register sub_klass,
1321                           Address& super_check_offset_addr,
1322                           Address& super_klass_addr,
1323                           Register temp,
1324                           Label* L_success, Label* L_failure) {
1325    BLOCK_COMMENT("type_check:");
1326
1327    Label L_fallthrough;
1328#define LOCAL_JCC(assembler_con, label_ptr)                             \
1329    if (label_ptr != NULL)  __ jcc(assembler_con, *(label_ptr));        \
1330    else                    __ jcc(assembler_con, L_fallthrough) /*omit semi*/
1331
1332    // The following is a strange variation of the fast path which requires
1333    // one less register, because needed values are on the argument stack.
1334    // __ check_klass_subtype_fast_path(sub_klass, *super_klass*, temp,
1335    //                                  L_success, L_failure, NULL);
1336    assert_different_registers(sub_klass, temp);
1337
1338    int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1339
1340    // if the pointers are equal, we are done (e.g., String[] elements)
1341    __ cmpptr(sub_klass, super_klass_addr);
1342    LOCAL_JCC(Assembler::equal, L_success);
1343
1344    // check the supertype display:
1345    __ movl2ptr(temp, super_check_offset_addr);
1346    Address super_check_addr(sub_klass, temp, Address::times_1, 0);
1347    __ movptr(temp, super_check_addr); // load displayed supertype
1348    __ cmpptr(temp, super_klass_addr); // test the super type
1349    LOCAL_JCC(Assembler::equal, L_success);
1350
1351    // if it was a primary super, we can just fail immediately
1352    __ cmpl(super_check_offset_addr, sc_offset);
1353    LOCAL_JCC(Assembler::notEqual, L_failure);
1354
1355    // The repne_scan instruction uses fixed registers, which will get spilled.
1356    // We happen to know this works best when super_klass is in rax.
1357    Register super_klass = temp;
1358    __ movptr(super_klass, super_klass_addr);
1359    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg,
1360                                     L_success, L_failure);
1361
1362    __ bind(L_fallthrough);
1363
1364    if (L_success == NULL) { BLOCK_COMMENT("L_success:"); }
1365    if (L_failure == NULL) { BLOCK_COMMENT("L_failure:"); }
1366
1367#undef LOCAL_JCC
1368  }
1369
1370  //
1371  //  Generate checkcasting array copy stub
1372  //
1373  //  Input:
1374  //    4(rsp)   - source array address
1375  //    8(rsp)   - destination array address
1376  //   12(rsp)   - element count, can be zero
1377  //   16(rsp)   - size_t ckoff (super_check_offset)
1378  //   20(rsp)   - oop ckval (super_klass)
1379  //
1380  //  Output:
1381  //    rax, ==  0  -  success
1382  //    rax, == -1^K - failure, where K is partial transfer count
1383  //
1384  address generate_checkcast_copy(const char *name, address* entry, bool dest_uninitialized = false) {
1385    __ align(CodeEntryAlignment);
1386    StubCodeMark mark(this, "StubRoutines", name);
1387    address start = __ pc();
1388
1389    Label L_load_element, L_store_element, L_do_card_marks, L_done;
1390
1391    // register use:
1392    //  rax, rdx, rcx -- loop control (end_from, end_to, count)
1393    //  rdi, rsi      -- element access (oop, klass)
1394    //  rbx,           -- temp
1395    const Register from       = rax;    // source array address
1396    const Register to         = rdx;    // destination array address
1397    const Register length     = rcx;    // elements count
1398    const Register elem       = rdi;    // each oop copied
1399    const Register elem_klass = rsi;    // each elem._klass (sub_klass)
1400    const Register temp       = rbx;    // lone remaining temp
1401
1402    __ enter(); // required for proper stackwalking of RuntimeStub frame
1403
1404    __ push(rsi);
1405    __ push(rdi);
1406    __ push(rbx);
1407
1408    Address   from_arg(rsp, 16+ 4);     // from
1409    Address     to_arg(rsp, 16+ 8);     // to
1410    Address length_arg(rsp, 16+12);     // elements count
1411    Address  ckoff_arg(rsp, 16+16);     // super_check_offset
1412    Address  ckval_arg(rsp, 16+20);     // super_klass
1413
1414    // Load up:
1415    __ movptr(from,     from_arg);
1416    __ movptr(to,         to_arg);
1417    __ movl2ptr(length, length_arg);
1418
1419    if (entry != NULL) {
1420      *entry = __ pc(); // Entry point from generic arraycopy stub.
1421      BLOCK_COMMENT("Entry:");
1422    }
1423
1424    //---------------------------------------------------------------
1425    // Assembler stub will be used for this call to arraycopy
1426    // if the two arrays are subtypes of Object[] but the
1427    // destination array type is not equal to or a supertype
1428    // of the source type.  Each element must be separately
1429    // checked.
1430
1431    // Loop-invariant addresses.  They are exclusive end pointers.
1432    Address end_from_addr(from, length, Address::times_ptr, 0);
1433    Address   end_to_addr(to,   length, Address::times_ptr, 0);
1434
1435    Register end_from = from;           // re-use
1436    Register end_to   = to;             // re-use
1437    Register count    = length;         // re-use
1438
1439    // Loop-variant addresses.  They assume post-incremented count < 0.
1440    Address from_element_addr(end_from, count, Address::times_ptr, 0);
1441    Address   to_element_addr(end_to,   count, Address::times_ptr, 0);
1442    Address elem_klass_addr(elem, oopDesc::klass_offset_in_bytes());
1443
1444    // Copy from low to high addresses, indexed from the end of each array.
1445    gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
1446    __ lea(end_from, end_from_addr);
1447    __ lea(end_to,   end_to_addr);
1448    assert(length == count, "");        // else fix next line:
1449    __ negptr(count);                   // negate and test the length
1450    __ jccb(Assembler::notZero, L_load_element);
1451
1452    // Empty array:  Nothing to do.
1453    __ xorptr(rax, rax);                  // return 0 on (trivial) success
1454    __ jmp(L_done);
1455
1456    // ======== begin loop ========
1457    // (Loop is rotated; its entry is L_load_element.)
1458    // Loop control:
1459    //   for (count = -count; count != 0; count++)
1460    // Base pointers src, dst are biased by 8*count,to last element.
1461    __ align(OptoLoopAlignment);
1462
1463    __ BIND(L_store_element);
1464    __ movptr(to_element_addr, elem);     // store the oop
1465    __ increment(count);                // increment the count toward zero
1466    __ jccb(Assembler::zero, L_do_card_marks);
1467
1468    // ======== loop entry is here ========
1469    __ BIND(L_load_element);
1470    __ movptr(elem, from_element_addr);   // load the oop
1471    __ testptr(elem, elem);
1472    __ jccb(Assembler::zero, L_store_element);
1473
1474    // (Could do a trick here:  Remember last successful non-null
1475    // element stored and make a quick oop equality check on it.)
1476
1477    __ movptr(elem_klass, elem_klass_addr); // query the object klass
1478    generate_type_check(elem_klass, ckoff_arg, ckval_arg, temp,
1479                        &L_store_element, NULL);
1480    // (On fall-through, we have failed the element type check.)
1481    // ======== end loop ========
1482
1483    // It was a real error; we must depend on the caller to finish the job.
1484    // Register "count" = -1 * number of *remaining* oops, length_arg = *total* oops.
1485    // Emit GC store barriers for the oops we have copied (length_arg + count),
1486    // and report their number to the caller.
1487    assert_different_registers(to, count, rax);
1488    Label L_post_barrier;
1489    __ addl(count, length_arg);         // transfers = (length - remaining)
1490    __ movl2ptr(rax, count);            // save the value
1491    __ notptr(rax);                     // report (-1^K) to caller (does not affect flags)
1492    __ jccb(Assembler::notZero, L_post_barrier);
1493    __ jmp(L_done); // K == 0, nothing was copied, skip post barrier
1494
1495    // Come here on success only.
1496    __ BIND(L_do_card_marks);
1497    __ xorptr(rax, rax);                // return 0 on success
1498    __ movl2ptr(count, length_arg);
1499
1500    __ BIND(L_post_barrier);
1501    __ movptr(to, to_arg);              // reload
1502    gen_write_ref_array_post_barrier(to, count);
1503
1504    // Common exit point (success or failure).
1505    __ BIND(L_done);
1506    __ pop(rbx);
1507    __ pop(rdi);
1508    __ pop(rsi);
1509    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1510    __ leave(); // required for proper stackwalking of RuntimeStub frame
1511    __ ret(0);
1512
1513    return start;
1514  }
1515
1516  //
1517  //  Generate 'unsafe' array copy stub
1518  //  Though just as safe as the other stubs, it takes an unscaled
1519  //  size_t argument instead of an element count.
1520  //
1521  //  Input:
1522  //    4(rsp)   - source array address
1523  //    8(rsp)   - destination array address
1524  //   12(rsp)   - byte count, can be zero
1525  //
1526  //  Output:
1527  //    rax, ==  0  -  success
1528  //    rax, == -1  -  need to call System.arraycopy
1529  //
1530  // Examines the alignment of the operands and dispatches
1531  // to a long, int, short, or byte copy loop.
1532  //
1533  address generate_unsafe_copy(const char *name,
1534                               address byte_copy_entry,
1535                               address short_copy_entry,
1536                               address int_copy_entry,
1537                               address long_copy_entry) {
1538
1539    Label L_long_aligned, L_int_aligned, L_short_aligned;
1540
1541    __ align(CodeEntryAlignment);
1542    StubCodeMark mark(this, "StubRoutines", name);
1543    address start = __ pc();
1544
1545    const Register from       = rax;  // source array address
1546    const Register to         = rdx;  // destination array address
1547    const Register count      = rcx;  // elements count
1548
1549    __ enter(); // required for proper stackwalking of RuntimeStub frame
1550    __ push(rsi);
1551    __ push(rdi);
1552    Address  from_arg(rsp, 12+ 4);      // from
1553    Address    to_arg(rsp, 12+ 8);      // to
1554    Address count_arg(rsp, 12+12);      // byte count
1555
1556    // Load up:
1557    __ movptr(from ,  from_arg);
1558    __ movptr(to   ,    to_arg);
1559    __ movl2ptr(count, count_arg);
1560
1561    // bump this on entry, not on exit:
1562    inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1563
1564    const Register bits = rsi;
1565    __ mov(bits, from);
1566    __ orptr(bits, to);
1567    __ orptr(bits, count);
1568
1569    __ testl(bits, BytesPerLong-1);
1570    __ jccb(Assembler::zero, L_long_aligned);
1571
1572    __ testl(bits, BytesPerInt-1);
1573    __ jccb(Assembler::zero, L_int_aligned);
1574
1575    __ testl(bits, BytesPerShort-1);
1576    __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
1577
1578    __ BIND(L_short_aligned);
1579    __ shrptr(count, LogBytesPerShort); // size => short_count
1580    __ movl(count_arg, count);          // update 'count'
1581    __ jump(RuntimeAddress(short_copy_entry));
1582
1583    __ BIND(L_int_aligned);
1584    __ shrptr(count, LogBytesPerInt); // size => int_count
1585    __ movl(count_arg, count);          // update 'count'
1586    __ jump(RuntimeAddress(int_copy_entry));
1587
1588    __ BIND(L_long_aligned);
1589    __ shrptr(count, LogBytesPerLong); // size => qword_count
1590    __ movl(count_arg, count);          // update 'count'
1591    __ pop(rdi); // Do pops here since jlong_arraycopy stub does not do it.
1592    __ pop(rsi);
1593    __ jump(RuntimeAddress(long_copy_entry));
1594
1595    return start;
1596  }
1597
1598
1599  // Perform range checks on the proposed arraycopy.
1600  // Smashes src_pos and dst_pos.  (Uses them up for temps.)
1601  void arraycopy_range_checks(Register src,
1602                              Register src_pos,
1603                              Register dst,
1604                              Register dst_pos,
1605                              Address& length,
1606                              Label& L_failed) {
1607    BLOCK_COMMENT("arraycopy_range_checks:");
1608    const Register src_end = src_pos;   // source array end position
1609    const Register dst_end = dst_pos;   // destination array end position
1610    __ addl(src_end, length); // src_pos + length
1611    __ addl(dst_end, length); // dst_pos + length
1612
1613    //  if (src_pos + length > arrayOop(src)->length() ) FAIL;
1614    __ cmpl(src_end, Address(src, arrayOopDesc::length_offset_in_bytes()));
1615    __ jcc(Assembler::above, L_failed);
1616
1617    //  if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
1618    __ cmpl(dst_end, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1619    __ jcc(Assembler::above, L_failed);
1620
1621    BLOCK_COMMENT("arraycopy_range_checks done");
1622  }
1623
1624
1625  //
1626  //  Generate generic array copy stubs
1627  //
1628  //  Input:
1629  //     4(rsp)    -  src oop
1630  //     8(rsp)    -  src_pos
1631  //    12(rsp)    -  dst oop
1632  //    16(rsp)    -  dst_pos
1633  //    20(rsp)    -  element count
1634  //
1635  //  Output:
1636  //    rax, ==  0  -  success
1637  //    rax, == -1^K - failure, where K is partial transfer count
1638  //
1639  address generate_generic_copy(const char *name,
1640                                address entry_jbyte_arraycopy,
1641                                address entry_jshort_arraycopy,
1642                                address entry_jint_arraycopy,
1643                                address entry_oop_arraycopy,
1644                                address entry_jlong_arraycopy,
1645                                address entry_checkcast_arraycopy) {
1646    Label L_failed, L_failed_0, L_objArray;
1647
1648    { int modulus = CodeEntryAlignment;
1649      int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
1650      int advance = target - (__ offset() % modulus);
1651      if (advance < 0)  advance += modulus;
1652      if (advance > 0)  __ nop(advance);
1653    }
1654    StubCodeMark mark(this, "StubRoutines", name);
1655
1656    // Short-hop target to L_failed.  Makes for denser prologue code.
1657    __ BIND(L_failed_0);
1658    __ jmp(L_failed);
1659    assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");
1660
1661    __ align(CodeEntryAlignment);
1662    address start = __ pc();
1663
1664    __ enter(); // required for proper stackwalking of RuntimeStub frame
1665    __ push(rsi);
1666    __ push(rdi);
1667
1668    // bump this on entry, not on exit:
1669    inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1670
1671    // Input values
1672    Address SRC     (rsp, 12+ 4);
1673    Address SRC_POS (rsp, 12+ 8);
1674    Address DST     (rsp, 12+12);
1675    Address DST_POS (rsp, 12+16);
1676    Address LENGTH  (rsp, 12+20);
1677
1678    //-----------------------------------------------------------------------
1679    // Assembler stub will be used for this call to arraycopy
1680    // if the following conditions are met:
1681    //
1682    // (1) src and dst must not be null.
1683    // (2) src_pos must not be negative.
1684    // (3) dst_pos must not be negative.
1685    // (4) length  must not be negative.
1686    // (5) src klass and dst klass should be the same and not NULL.
1687    // (6) src and dst should be arrays.
1688    // (7) src_pos + length must not exceed length of src.
1689    // (8) dst_pos + length must not exceed length of dst.
1690    //
1691
1692    const Register src     = rax;       // source array oop
1693    const Register src_pos = rsi;
1694    const Register dst     = rdx;       // destination array oop
1695    const Register dst_pos = rdi;
1696    const Register length  = rcx;       // transfer count
1697
1698    //  if (src == NULL) return -1;
1699    __ movptr(src, SRC);      // src oop
1700    __ testptr(src, src);
1701    __ jccb(Assembler::zero, L_failed_0);
1702
1703    //  if (src_pos < 0) return -1;
1704    __ movl2ptr(src_pos, SRC_POS);  // src_pos
1705    __ testl(src_pos, src_pos);
1706    __ jccb(Assembler::negative, L_failed_0);
1707
1708    //  if (dst == NULL) return -1;
1709    __ movptr(dst, DST);      // dst oop
1710    __ testptr(dst, dst);
1711    __ jccb(Assembler::zero, L_failed_0);
1712
1713    //  if (dst_pos < 0) return -1;
1714    __ movl2ptr(dst_pos, DST_POS);  // dst_pos
1715    __ testl(dst_pos, dst_pos);
1716    __ jccb(Assembler::negative, L_failed_0);
1717
1718    //  if (length < 0) return -1;
1719    __ movl2ptr(length, LENGTH);   // length
1720    __ testl(length, length);
1721    __ jccb(Assembler::negative, L_failed_0);
1722
1723    //  if (src->klass() == NULL) return -1;
1724    Address src_klass_addr(src, oopDesc::klass_offset_in_bytes());
1725    Address dst_klass_addr(dst, oopDesc::klass_offset_in_bytes());
1726    const Register rcx_src_klass = rcx;    // array klass
1727    __ movptr(rcx_src_klass, Address(src, oopDesc::klass_offset_in_bytes()));
1728
1729#ifdef ASSERT
1730    //  assert(src->klass() != NULL);
1731    BLOCK_COMMENT("assert klasses not null");
1732    { Label L1, L2;
1733      __ testptr(rcx_src_klass, rcx_src_klass);
1734      __ jccb(Assembler::notZero, L2);   // it is broken if klass is NULL
1735      __ bind(L1);
1736      __ stop("broken null klass");
1737      __ bind(L2);
1738      __ cmpptr(dst_klass_addr, (int32_t)NULL_WORD);
1739      __ jccb(Assembler::equal, L1);      // this would be broken also
1740      BLOCK_COMMENT("assert done");
1741    }
1742#endif //ASSERT
1743
1744    // Load layout helper (32-bits)
1745    //
1746    //  |array_tag|     | header_size | element_type |     |log2_element_size|
1747    // 32        30    24            16              8     2                 0
1748    //
1749    //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
1750    //
1751
1752    int lh_offset = in_bytes(Klass::layout_helper_offset());
1753    Address src_klass_lh_addr(rcx_src_klass, lh_offset);
1754
1755    // Handle objArrays completely differently...
1756    jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
1757    __ cmpl(src_klass_lh_addr, objArray_lh);
1758    __ jcc(Assembler::equal, L_objArray);
1759
1760    //  if (src->klass() != dst->klass()) return -1;
1761    __ cmpptr(rcx_src_klass, dst_klass_addr);
1762    __ jccb(Assembler::notEqual, L_failed_0);
1763
1764    const Register rcx_lh = rcx;  // layout helper
1765    assert(rcx_lh == rcx_src_klass, "known alias");
1766    __ movl(rcx_lh, src_klass_lh_addr);
1767
1768    //  if (!src->is_Array()) return -1;
1769    __ cmpl(rcx_lh, Klass::_lh_neutral_value);
1770    __ jcc(Assembler::greaterEqual, L_failed_0); // signed cmp
1771
1772    // At this point, it is known to be a typeArray (array_tag 0x3).
1773#ifdef ASSERT
1774    { Label L;
1775      __ cmpl(rcx_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
1776      __ jcc(Assembler::greaterEqual, L); // signed cmp
1777      __ stop("must be a primitive array");
1778      __ bind(L);
1779    }
1780#endif
1781
1782    assert_different_registers(src, src_pos, dst, dst_pos, rcx_lh);
1783    arraycopy_range_checks(src, src_pos, dst, dst_pos, LENGTH, L_failed);
1784
1785    // TypeArrayKlass
1786    //
1787    // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
1788    // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
1789    //
1790    const Register rsi_offset = rsi; // array offset
1791    const Register src_array  = src; // src array offset
1792    const Register dst_array  = dst; // dst array offset
1793    const Register rdi_elsize = rdi; // log2 element size
1794
1795    __ mov(rsi_offset, rcx_lh);
1796    __ shrptr(rsi_offset, Klass::_lh_header_size_shift);
1797    __ andptr(rsi_offset, Klass::_lh_header_size_mask);   // array_offset
1798    __ addptr(src_array, rsi_offset);  // src array offset
1799    __ addptr(dst_array, rsi_offset);  // dst array offset
1800    __ andptr(rcx_lh, Klass::_lh_log2_element_size_mask); // log2 elsize
1801
1802    // next registers should be set before the jump to corresponding stub
1803    const Register from       = src; // source array address
1804    const Register to         = dst; // destination array address
1805    const Register count      = rcx; // elements count
1806    // some of them should be duplicated on stack
1807#define FROM   Address(rsp, 12+ 4)
1808#define TO     Address(rsp, 12+ 8)   // Not used now
1809#define COUNT  Address(rsp, 12+12)   // Only for oop arraycopy
1810
1811    BLOCK_COMMENT("scale indexes to element size");
1812    __ movl2ptr(rsi, SRC_POS);  // src_pos
1813    __ shlptr(rsi);             // src_pos << rcx (log2 elsize)
1814    assert(src_array == from, "");
1815    __ addptr(from, rsi);       // from = src_array + SRC_POS << log2 elsize
1816    __ movl2ptr(rdi, DST_POS);  // dst_pos
1817    __ shlptr(rdi);             // dst_pos << rcx (log2 elsize)
1818    assert(dst_array == to, "");
1819    __ addptr(to,  rdi);        // to   = dst_array + DST_POS << log2 elsize
1820    __ movptr(FROM, from);      // src_addr
1821    __ mov(rdi_elsize, rcx_lh); // log2 elsize
1822    __ movl2ptr(count, LENGTH); // elements count
1823
1824    BLOCK_COMMENT("choose copy loop based on element size");
1825    __ cmpl(rdi_elsize, 0);
1826
1827    __ jump_cc(Assembler::equal, RuntimeAddress(entry_jbyte_arraycopy));
1828    __ cmpl(rdi_elsize, LogBytesPerShort);
1829    __ jump_cc(Assembler::equal, RuntimeAddress(entry_jshort_arraycopy));
1830    __ cmpl(rdi_elsize, LogBytesPerInt);
1831    __ jump_cc(Assembler::equal, RuntimeAddress(entry_jint_arraycopy));
1832#ifdef ASSERT
1833    __ cmpl(rdi_elsize, LogBytesPerLong);
1834    __ jccb(Assembler::notEqual, L_failed);
1835#endif
1836    __ pop(rdi); // Do pops here since jlong_arraycopy stub does not do it.
1837    __ pop(rsi);
1838    __ jump(RuntimeAddress(entry_jlong_arraycopy));
1839
1840  __ BIND(L_failed);
1841    __ xorptr(rax, rax);
1842    __ notptr(rax); // return -1
1843    __ pop(rdi);
1844    __ pop(rsi);
1845    __ leave(); // required for proper stackwalking of RuntimeStub frame
1846    __ ret(0);
1847
1848    // ObjArrayKlass
1849  __ BIND(L_objArray);
1850    // live at this point:  rcx_src_klass, src[_pos], dst[_pos]
1851
1852    Label L_plain_copy, L_checkcast_copy;
1853    //  test array classes for subtyping
1854    __ cmpptr(rcx_src_klass, dst_klass_addr); // usual case is exact equality
1855    __ jccb(Assembler::notEqual, L_checkcast_copy);
1856
1857    // Identically typed arrays can be copied without element-wise checks.
1858    assert_different_registers(src, src_pos, dst, dst_pos, rcx_src_klass);
1859    arraycopy_range_checks(src, src_pos, dst, dst_pos, LENGTH, L_failed);
1860
1861  __ BIND(L_plain_copy);
1862    __ movl2ptr(count, LENGTH); // elements count
1863    __ movl2ptr(src_pos, SRC_POS);  // reload src_pos
1864    __ lea(from, Address(src, src_pos, Address::times_ptr,
1865                 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
1866    __ movl2ptr(dst_pos, DST_POS);  // reload dst_pos
1867    __ lea(to,   Address(dst, dst_pos, Address::times_ptr,
1868                 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
1869    __ movptr(FROM,  from);   // src_addr
1870    __ movptr(TO,    to);     // dst_addr
1871    __ movl(COUNT, count);  // count
1872    __ jump(RuntimeAddress(entry_oop_arraycopy));
1873
1874  __ BIND(L_checkcast_copy);
1875    // live at this point:  rcx_src_klass, dst[_pos], src[_pos]
1876    {
1877      // Handy offsets:
1878      int  ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
1879      int sco_offset = in_bytes(Klass::super_check_offset_offset());
1880
1881      Register rsi_dst_klass = rsi;
1882      Register rdi_temp      = rdi;
1883      assert(rsi_dst_klass == src_pos, "expected alias w/ src_pos");
1884      assert(rdi_temp      == dst_pos, "expected alias w/ dst_pos");
1885      Address dst_klass_lh_addr(rsi_dst_klass, lh_offset);
1886
1887      // Before looking at dst.length, make sure dst is also an objArray.
1888      __ movptr(rsi_dst_klass, dst_klass_addr);
1889      __ cmpl(dst_klass_lh_addr, objArray_lh);
1890      __ jccb(Assembler::notEqual, L_failed);
1891
1892      // It is safe to examine both src.length and dst.length.
1893      __ movl2ptr(src_pos, SRC_POS);        // reload rsi
1894      arraycopy_range_checks(src, src_pos, dst, dst_pos, LENGTH, L_failed);
1895      // (Now src_pos and dst_pos are killed, but not src and dst.)
1896
1897      // We'll need this temp (don't forget to pop it after the type check).
1898      __ push(rbx);
1899      Register rbx_src_klass = rbx;
1900
1901      __ mov(rbx_src_klass, rcx_src_klass); // spill away from rcx
1902      __ movptr(rsi_dst_klass, dst_klass_addr);
1903      Address super_check_offset_addr(rsi_dst_klass, sco_offset);
1904      Label L_fail_array_check;
1905      generate_type_check(rbx_src_klass,
1906                          super_check_offset_addr, dst_klass_addr,
1907                          rdi_temp, NULL, &L_fail_array_check);
1908      // (On fall-through, we have passed the array type check.)
1909      __ pop(rbx);
1910      __ jmp(L_plain_copy);
1911
1912      __ BIND(L_fail_array_check);
1913      // Reshuffle arguments so we can call checkcast_arraycopy:
1914
1915      // match initial saves for checkcast_arraycopy
1916      // push(rsi);    // already done; see above
1917      // push(rdi);    // already done; see above
1918      // push(rbx);    // already done; see above
1919
1920      // Marshal outgoing arguments now, freeing registers.
1921      Address   from_arg(rsp, 16+ 4);   // from
1922      Address     to_arg(rsp, 16+ 8);   // to
1923      Address length_arg(rsp, 16+12);   // elements count
1924      Address  ckoff_arg(rsp, 16+16);   // super_check_offset
1925      Address  ckval_arg(rsp, 16+20);   // super_klass
1926
1927      Address SRC_POS_arg(rsp, 16+ 8);
1928      Address DST_POS_arg(rsp, 16+16);
1929      Address  LENGTH_arg(rsp, 16+20);
1930      // push rbx, changed the incoming offsets (why not just use rbp,??)
1931      // assert(SRC_POS_arg.disp() == SRC_POS.disp() + 4, "");
1932
1933      __ movptr(rbx, Address(rsi_dst_klass, ek_offset));
1934      __ movl2ptr(length, LENGTH_arg);    // reload elements count
1935      __ movl2ptr(src_pos, SRC_POS_arg);  // reload src_pos
1936      __ movl2ptr(dst_pos, DST_POS_arg);  // reload dst_pos
1937
1938      __ movptr(ckval_arg, rbx);          // destination element type
1939      __ movl(rbx, Address(rbx, sco_offset));
1940      __ movl(ckoff_arg, rbx);          // corresponding class check offset
1941
1942      __ movl(length_arg, length);      // outgoing length argument
1943
1944      __ lea(from, Address(src, src_pos, Address::times_ptr,
1945                            arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
1946      __ movptr(from_arg, from);
1947
1948      __ lea(to, Address(dst, dst_pos, Address::times_ptr,
1949                          arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
1950      __ movptr(to_arg, to);
1951      __ jump(RuntimeAddress(entry_checkcast_arraycopy));
1952    }
1953
1954    return start;
1955  }
1956
1957  void generate_arraycopy_stubs() {
1958    address entry;
1959    address entry_jbyte_arraycopy;
1960    address entry_jshort_arraycopy;
1961    address entry_jint_arraycopy;
1962    address entry_oop_arraycopy;
1963    address entry_jlong_arraycopy;
1964    address entry_checkcast_arraycopy;
1965
1966    StubRoutines::_arrayof_jbyte_disjoint_arraycopy =
1967        generate_disjoint_copy(T_BYTE,  true, Address::times_1, &entry,
1968                               "arrayof_jbyte_disjoint_arraycopy");
1969    StubRoutines::_arrayof_jbyte_arraycopy =
1970        generate_conjoint_copy(T_BYTE,  true, Address::times_1,  entry,
1971                               NULL, "arrayof_jbyte_arraycopy");
1972    StubRoutines::_jbyte_disjoint_arraycopy =
1973        generate_disjoint_copy(T_BYTE, false, Address::times_1, &entry,
1974                               "jbyte_disjoint_arraycopy");
1975    StubRoutines::_jbyte_arraycopy =
1976        generate_conjoint_copy(T_BYTE, false, Address::times_1,  entry,
1977                               &entry_jbyte_arraycopy, "jbyte_arraycopy");
1978
1979    StubRoutines::_arrayof_jshort_disjoint_arraycopy =
1980        generate_disjoint_copy(T_SHORT,  true, Address::times_2, &entry,
1981                               "arrayof_jshort_disjoint_arraycopy");
1982    StubRoutines::_arrayof_jshort_arraycopy =
1983        generate_conjoint_copy(T_SHORT,  true, Address::times_2,  entry,
1984                               NULL, "arrayof_jshort_arraycopy");
1985    StubRoutines::_jshort_disjoint_arraycopy =
1986        generate_disjoint_copy(T_SHORT, false, Address::times_2, &entry,
1987                               "jshort_disjoint_arraycopy");
1988    StubRoutines::_jshort_arraycopy =
1989        generate_conjoint_copy(T_SHORT, false, Address::times_2,  entry,
1990                               &entry_jshort_arraycopy, "jshort_arraycopy");
1991
1992    // Next arrays are always aligned on 4 bytes at least.
1993    StubRoutines::_jint_disjoint_arraycopy =
1994        generate_disjoint_copy(T_INT, true, Address::times_4, &entry,
1995                               "jint_disjoint_arraycopy");
1996    StubRoutines::_jint_arraycopy =
1997        generate_conjoint_copy(T_INT, true, Address::times_4,  entry,
1998                               &entry_jint_arraycopy, "jint_arraycopy");
1999
2000    StubRoutines::_oop_disjoint_arraycopy =
2001        generate_disjoint_copy(T_OBJECT, true, Address::times_ptr, &entry,
2002                               "oop_disjoint_arraycopy");
2003    StubRoutines::_oop_arraycopy =
2004        generate_conjoint_copy(T_OBJECT, true, Address::times_ptr,  entry,
2005                               &entry_oop_arraycopy, "oop_arraycopy");
2006
2007    StubRoutines::_oop_disjoint_arraycopy_uninit =
2008        generate_disjoint_copy(T_OBJECT, true, Address::times_ptr, &entry,
2009                               "oop_disjoint_arraycopy_uninit",
2010                               /*dest_uninitialized*/true);
2011    StubRoutines::_oop_arraycopy_uninit =
2012        generate_conjoint_copy(T_OBJECT, true, Address::times_ptr,  entry,
2013                               NULL, "oop_arraycopy_uninit",
2014                               /*dest_uninitialized*/true);
2015
2016    StubRoutines::_jlong_disjoint_arraycopy =
2017        generate_disjoint_long_copy(&entry, "jlong_disjoint_arraycopy");
2018    StubRoutines::_jlong_arraycopy =
2019        generate_conjoint_long_copy(entry, &entry_jlong_arraycopy,
2020                                    "jlong_arraycopy");
2021
2022    StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2023    StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2024    StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2025    StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2026    StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2027    StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2028
2029    StubRoutines::_arrayof_jint_disjoint_arraycopy       = StubRoutines::_jint_disjoint_arraycopy;
2030    StubRoutines::_arrayof_oop_disjoint_arraycopy        = StubRoutines::_oop_disjoint_arraycopy;
2031    StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = StubRoutines::_oop_disjoint_arraycopy_uninit;
2032    StubRoutines::_arrayof_jlong_disjoint_arraycopy      = StubRoutines::_jlong_disjoint_arraycopy;
2033
2034    StubRoutines::_arrayof_jint_arraycopy       = StubRoutines::_jint_arraycopy;
2035    StubRoutines::_arrayof_oop_arraycopy        = StubRoutines::_oop_arraycopy;
2036    StubRoutines::_arrayof_oop_arraycopy_uninit = StubRoutines::_oop_arraycopy_uninit;
2037    StubRoutines::_arrayof_jlong_arraycopy      = StubRoutines::_jlong_arraycopy;
2038
2039    StubRoutines::_checkcast_arraycopy =
2040        generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2041    StubRoutines::_checkcast_arraycopy_uninit =
2042        generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, /*dest_uninitialized*/true);
2043
2044    StubRoutines::_unsafe_arraycopy =
2045        generate_unsafe_copy("unsafe_arraycopy",
2046                               entry_jbyte_arraycopy,
2047                               entry_jshort_arraycopy,
2048                               entry_jint_arraycopy,
2049                               entry_jlong_arraycopy);
2050
2051    StubRoutines::_generic_arraycopy =
2052        generate_generic_copy("generic_arraycopy",
2053                               entry_jbyte_arraycopy,
2054                               entry_jshort_arraycopy,
2055                               entry_jint_arraycopy,
2056                               entry_oop_arraycopy,
2057                               entry_jlong_arraycopy,
2058                               entry_checkcast_arraycopy);
2059  }
2060
2061  // AES intrinsic stubs
2062  enum {AESBlockSize = 16};
2063
2064  address generate_key_shuffle_mask() {
2065    __ align(16);
2066    StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
2067    address start = __ pc();
2068    __ emit_data(0x00010203, relocInfo::none, 0 );
2069    __ emit_data(0x04050607, relocInfo::none, 0 );
2070    __ emit_data(0x08090a0b, relocInfo::none, 0 );
2071    __ emit_data(0x0c0d0e0f, relocInfo::none, 0 );
2072    return start;
2073  }
2074
2075  address generate_counter_shuffle_mask() {
2076    __ align(16);
2077    StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask");
2078    address start = __ pc();
2079    __ emit_data(0x0c0d0e0f, relocInfo::none, 0);
2080    __ emit_data(0x08090a0b, relocInfo::none, 0);
2081    __ emit_data(0x04050607, relocInfo::none, 0);
2082    __ emit_data(0x00010203, relocInfo::none, 0);
2083    return start;
2084  }
2085
2086  // Utility routine for loading a 128-bit key word in little endian format
2087  // can optionally specify that the shuffle mask is already in an xmmregister
2088  void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
2089    __ movdqu(xmmdst, Address(key, offset));
2090    if (xmm_shuf_mask != NULL) {
2091      __ pshufb(xmmdst, xmm_shuf_mask);
2092    } else {
2093      __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2094    }
2095  }
2096
2097  // aesenc using specified key+offset
2098  // can optionally specify that the shuffle mask is already in an xmmregister
2099  void aes_enc_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
2100    load_key(xmmtmp, key, offset, xmm_shuf_mask);
2101    __ aesenc(xmmdst, xmmtmp);
2102  }
2103
2104  // aesdec using specified key+offset
2105  // can optionally specify that the shuffle mask is already in an xmmregister
2106  void aes_dec_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
2107    load_key(xmmtmp, key, offset, xmm_shuf_mask);
2108    __ aesdec(xmmdst, xmmtmp);
2109  }
2110
2111  // Utility routine for increase 128bit counter (iv in CTR mode)
2112  //  XMM_128bit,  D3, D2, D1, D0
2113  void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) {
2114    __ pextrd(reg, xmmdst, 0x0);
2115    __ addl(reg, inc_delta);
2116    __ pinsrd(xmmdst, reg, 0x0);
2117    __ jcc(Assembler::carryClear, next_block); // jump if no carry
2118
2119    __ pextrd(reg, xmmdst, 0x01); // Carry-> D1
2120    __ addl(reg, 0x01);
2121    __ pinsrd(xmmdst, reg, 0x01);
2122    __ jcc(Assembler::carryClear, next_block); // jump if no carry
2123
2124    __ pextrd(reg, xmmdst, 0x02); // Carry-> D2
2125    __ addl(reg, 0x01);
2126    __ pinsrd(xmmdst, reg, 0x02);
2127    __ jcc(Assembler::carryClear, next_block); // jump if no carry
2128
2129    __ pextrd(reg, xmmdst, 0x03); // Carry -> D3
2130    __ addl(reg, 0x01);
2131    __ pinsrd(xmmdst, reg, 0x03);
2132
2133    __ BIND(next_block);          // next instruction
2134  }
2135
2136
2137  // Arguments:
2138  //
2139  // Inputs:
2140  //   c_rarg0   - source byte array address
2141  //   c_rarg1   - destination byte array address
2142  //   c_rarg2   - K (key) in little endian int array
2143  //
2144  address generate_aescrypt_encryptBlock() {
2145    assert(UseAES, "need AES instructions and misaligned SSE support");
2146    __ align(CodeEntryAlignment);
2147    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2148    Label L_doLast;
2149    address start = __ pc();
2150
2151    const Register from        = rdx;      // source array address
2152    const Register to          = rdx;      // destination array address
2153    const Register key         = rcx;      // key array address
2154    const Register keylen      = rax;
2155    const Address  from_param(rbp, 8+0);
2156    const Address  to_param  (rbp, 8+4);
2157    const Address  key_param (rbp, 8+8);
2158
2159    const XMMRegister xmm_result = xmm0;
2160    const XMMRegister xmm_key_shuf_mask = xmm1;
2161    const XMMRegister xmm_temp1  = xmm2;
2162    const XMMRegister xmm_temp2  = xmm3;
2163    const XMMRegister xmm_temp3  = xmm4;
2164    const XMMRegister xmm_temp4  = xmm5;
2165
2166    __ enter();   // required for proper stackwalking of RuntimeStub frame
2167
2168    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
2169    // context for the registers used, where all instructions below are using 128-bit mode
2170    // On EVEX without VL and BW, these instructions will all be AVX.
2171    if (VM_Version::supports_avx512vlbw()) {
2172      __ movl(rdx, 0xffff);
2173      __ kmovdl(k1, rdx);
2174    }
2175
2176    __ movptr(from, from_param);
2177    __ movptr(key, key_param);
2178
2179    // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
2180    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2181
2182    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2183    __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input
2184    __ movptr(to, to_param);
2185
2186    // For encryption, the java expanded key ordering is just what we need
2187
2188    load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
2189    __ pxor(xmm_result, xmm_temp1);
2190
2191    load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
2192    load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
2193    load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
2194    load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
2195
2196    __ aesenc(xmm_result, xmm_temp1);
2197    __ aesenc(xmm_result, xmm_temp2);
2198    __ aesenc(xmm_result, xmm_temp3);
2199    __ aesenc(xmm_result, xmm_temp4);
2200
2201    load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
2202    load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
2203    load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
2204    load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
2205
2206    __ aesenc(xmm_result, xmm_temp1);
2207    __ aesenc(xmm_result, xmm_temp2);
2208    __ aesenc(xmm_result, xmm_temp3);
2209    __ aesenc(xmm_result, xmm_temp4);
2210
2211    load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
2212    load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
2213
2214    __ cmpl(keylen, 44);
2215    __ jccb(Assembler::equal, L_doLast);
2216
2217    __ aesenc(xmm_result, xmm_temp1);
2218    __ aesenc(xmm_result, xmm_temp2);
2219
2220    load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
2221    load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
2222
2223    __ cmpl(keylen, 52);
2224    __ jccb(Assembler::equal, L_doLast);
2225
2226    __ aesenc(xmm_result, xmm_temp1);
2227    __ aesenc(xmm_result, xmm_temp2);
2228
2229    load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
2230    load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
2231
2232    __ BIND(L_doLast);
2233    __ aesenc(xmm_result, xmm_temp1);
2234    __ aesenclast(xmm_result, xmm_temp2);
2235    __ movdqu(Address(to, 0), xmm_result);        // store the result
2236    __ xorptr(rax, rax); // return 0
2237    __ leave(); // required for proper stackwalking of RuntimeStub frame
2238    __ ret(0);
2239
2240    return start;
2241  }
2242
2243
2244  // Arguments:
2245  //
2246  // Inputs:
2247  //   c_rarg0   - source byte array address
2248  //   c_rarg1   - destination byte array address
2249  //   c_rarg2   - K (key) in little endian int array
2250  //
2251  address generate_aescrypt_decryptBlock() {
2252    assert(UseAES, "need AES instructions and misaligned SSE support");
2253    __ align(CodeEntryAlignment);
2254    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2255    Label L_doLast;
2256    address start = __ pc();
2257
2258    const Register from        = rdx;      // source array address
2259    const Register to          = rdx;      // destination array address
2260    const Register key         = rcx;      // key array address
2261    const Register keylen      = rax;
2262    const Address  from_param(rbp, 8+0);
2263    const Address  to_param  (rbp, 8+4);
2264    const Address  key_param (rbp, 8+8);
2265
2266    const XMMRegister xmm_result = xmm0;
2267    const XMMRegister xmm_key_shuf_mask = xmm1;
2268    const XMMRegister xmm_temp1  = xmm2;
2269    const XMMRegister xmm_temp2  = xmm3;
2270    const XMMRegister xmm_temp3  = xmm4;
2271    const XMMRegister xmm_temp4  = xmm5;
2272
2273    __ enter(); // required for proper stackwalking of RuntimeStub frame
2274
2275    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
2276    // context for the registers used, where all instructions below are using 128-bit mode
2277    // On EVEX without VL and BW, these instructions will all be AVX.
2278    if (VM_Version::supports_avx512vlbw()) {
2279      __ movl(rdx, 0xffff);
2280      __ kmovdl(k1, rdx);
2281    }
2282
2283    __ movptr(from, from_param);
2284    __ movptr(key, key_param);
2285
2286    // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
2287    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2288
2289    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2290    __ movdqu(xmm_result, Address(from, 0));
2291    __ movptr(to, to_param);
2292
2293    // for decryption java expanded key ordering is rotated one position from what we want
2294    // so we start from 0x10 here and hit 0x00 last
2295    // we don't know if the key is aligned, hence not using load-execute form
2296    load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
2297    load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
2298    load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
2299    load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
2300
2301    __ pxor  (xmm_result, xmm_temp1);
2302    __ aesdec(xmm_result, xmm_temp2);
2303    __ aesdec(xmm_result, xmm_temp3);
2304    __ aesdec(xmm_result, xmm_temp4);
2305
2306    load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
2307    load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
2308    load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
2309    load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
2310
2311    __ aesdec(xmm_result, xmm_temp1);
2312    __ aesdec(xmm_result, xmm_temp2);
2313    __ aesdec(xmm_result, xmm_temp3);
2314    __ aesdec(xmm_result, xmm_temp4);
2315
2316    load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
2317    load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
2318    load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);
2319
2320    __ cmpl(keylen, 44);
2321    __ jccb(Assembler::equal, L_doLast);
2322
2323    __ aesdec(xmm_result, xmm_temp1);
2324    __ aesdec(xmm_result, xmm_temp2);
2325
2326    load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
2327    load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
2328
2329    __ cmpl(keylen, 52);
2330    __ jccb(Assembler::equal, L_doLast);
2331
2332    __ aesdec(xmm_result, xmm_temp1);
2333    __ aesdec(xmm_result, xmm_temp2);
2334
2335    load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
2336    load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
2337
2338    __ BIND(L_doLast);
2339    __ aesdec(xmm_result, xmm_temp1);
2340    __ aesdec(xmm_result, xmm_temp2);
2341
2342    // for decryption the aesdeclast operation is always on key+0x00
2343    __ aesdeclast(xmm_result, xmm_temp3);
2344    __ movdqu(Address(to, 0), xmm_result);  // store the result
2345    __ xorptr(rax, rax); // return 0
2346    __ leave(); // required for proper stackwalking of RuntimeStub frame
2347    __ ret(0);
2348
2349    return start;
2350  }
2351
2352  void handleSOERegisters(bool saving) {
2353    const int saveFrameSizeInBytes = 4 * wordSize;
2354    const Address saved_rbx     (rbp, -3 * wordSize);
2355    const Address saved_rsi     (rbp, -2 * wordSize);
2356    const Address saved_rdi     (rbp, -1 * wordSize);
2357
2358    if (saving) {
2359      __ subptr(rsp, saveFrameSizeInBytes);
2360      __ movptr(saved_rsi, rsi);
2361      __ movptr(saved_rdi, rdi);
2362      __ movptr(saved_rbx, rbx);
2363    } else {
2364      // restoring
2365      __ movptr(rsi, saved_rsi);
2366      __ movptr(rdi, saved_rdi);
2367      __ movptr(rbx, saved_rbx);
2368    }
2369  }
2370
2371  // Arguments:
2372  //
2373  // Inputs:
2374  //   c_rarg0   - source byte array address
2375  //   c_rarg1   - destination byte array address
2376  //   c_rarg2   - K (key) in little endian int array
2377  //   c_rarg3   - r vector byte array address
2378  //   c_rarg4   - input length
2379  //
2380  // Output:
2381  //   rax       - input length
2382  //
2383  address generate_cipherBlockChaining_encryptAESCrypt() {
2384    assert(UseAES, "need AES instructions and misaligned SSE support");
2385    __ align(CodeEntryAlignment);
2386    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2387    address start = __ pc();
2388
2389    Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
2390    const Register from        = rsi;      // source array address
2391    const Register to          = rdx;      // destination array address
2392    const Register key         = rcx;      // key array address
2393    const Register rvec        = rdi;      // r byte array initialized from initvector array address
2394                                           // and left with the results of the last encryption block
2395    const Register len_reg     = rbx;      // src len (must be multiple of blocksize 16)
2396    const Register pos         = rax;
2397
2398    // xmm register assignments for the loops below
2399    const XMMRegister xmm_result = xmm0;
2400    const XMMRegister xmm_temp   = xmm1;
2401    // first 6 keys preloaded into xmm2-xmm7
2402    const int XMM_REG_NUM_KEY_FIRST = 2;
2403    const int XMM_REG_NUM_KEY_LAST  = 7;
2404    const XMMRegister xmm_key0   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
2405
2406    __ enter(); // required for proper stackwalking of RuntimeStub frame
2407    handleSOERegisters(true /*saving*/);
2408
2409    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
2410    // context for the registers used, where all instructions below are using 128-bit mode
2411    // On EVEX without VL and BW, these instructions will all be AVX.
2412    if (VM_Version::supports_avx512vlbw()) {
2413      __ movl(rdx, 0xffff);
2414      __ kmovdl(k1, rdx);
2415    }
2416
2417    // load registers from incoming parameters
2418    const Address  from_param(rbp, 8+0);
2419    const Address  to_param  (rbp, 8+4);
2420    const Address  key_param (rbp, 8+8);
2421    const Address  rvec_param (rbp, 8+12);
2422    const Address  len_param  (rbp, 8+16);
2423    __ movptr(from , from_param);
2424    __ movptr(to   , to_param);
2425    __ movptr(key  , key_param);
2426    __ movptr(rvec , rvec_param);
2427    __ movptr(len_reg , len_param);
2428
2429    const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
2430    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2431    // load up xmm regs 2 thru 7 with keys 0-5
2432    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
2433      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
2434      offset += 0x10;
2435    }
2436
2437    __ movdqu(xmm_result, Address(rvec, 0x00));   // initialize xmm_result with r vec
2438
2439    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
2440    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2441    __ cmpl(rax, 44);
2442    __ jcc(Assembler::notEqual, L_key_192_256);
2443
2444    // 128 bit code follows here
2445    __ movl(pos, 0);
2446    __ align(OptoLoopAlignment);
2447    __ BIND(L_loopTop_128);
2448    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
2449    __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector
2450
2451    __ pxor  (xmm_result, xmm_key0);                                // do the aes rounds
2452    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
2453      __ aesenc(xmm_result, as_XMMRegister(rnum));
2454    }
2455    for (int key_offset = 0x60; key_offset <= 0x90; key_offset += 0x10) {
2456      aes_enc_key(xmm_result, xmm_temp, key, key_offset);
2457    }
2458    load_key(xmm_temp, key, 0xa0);
2459    __ aesenclast(xmm_result, xmm_temp);
2460
2461    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
2462    // no need to store r to memory until we exit
2463    __ addptr(pos, AESBlockSize);
2464    __ subptr(len_reg, AESBlockSize);
2465    __ jcc(Assembler::notEqual, L_loopTop_128);
2466
2467    __ BIND(L_exit);
2468    __ movdqu(Address(rvec, 0), xmm_result);     // final value of r stored in rvec of CipherBlockChaining object
2469
2470    handleSOERegisters(false /*restoring*/);
2471    __ movptr(rax, len_param); // return length
2472    __ leave();                                  // required for proper stackwalking of RuntimeStub frame
2473    __ ret(0);
2474
2475    __ BIND(L_key_192_256);
2476    // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
2477    __ cmpl(rax, 52);
2478    __ jcc(Assembler::notEqual, L_key_256);
2479
2480    // 192-bit code follows here (could be changed to use more xmm registers)
2481    __ movl(pos, 0);
2482    __ align(OptoLoopAlignment);
2483    __ BIND(L_loopTop_192);
2484    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
2485    __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector
2486
2487    __ pxor  (xmm_result, xmm_key0);                                // do the aes rounds
2488    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
2489      __ aesenc(xmm_result, as_XMMRegister(rnum));
2490    }
2491    for (int key_offset = 0x60; key_offset <= 0xb0; key_offset += 0x10) {
2492      aes_enc_key(xmm_result, xmm_temp, key, key_offset);
2493    }
2494    load_key(xmm_temp, key, 0xc0);
2495    __ aesenclast(xmm_result, xmm_temp);
2496
2497    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);   // store into the next 16 bytes of output
2498    // no need to store r to memory until we exit
2499    __ addptr(pos, AESBlockSize);
2500    __ subptr(len_reg, AESBlockSize);
2501    __ jcc(Assembler::notEqual, L_loopTop_192);
2502    __ jmp(L_exit);
2503
2504    __ BIND(L_key_256);
2505    // 256-bit code follows here (could be changed to use more xmm registers)
2506    __ movl(pos, 0);
2507    __ align(OptoLoopAlignment);
2508    __ BIND(L_loopTop_256);
2509    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
2510    __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector
2511
2512    __ pxor  (xmm_result, xmm_key0);                                // do the aes rounds
2513    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
2514      __ aesenc(xmm_result, as_XMMRegister(rnum));
2515    }
2516    for (int key_offset = 0x60; key_offset <= 0xd0; key_offset += 0x10) {
2517      aes_enc_key(xmm_result, xmm_temp, key, key_offset);
2518    }
2519    load_key(xmm_temp, key, 0xe0);
2520    __ aesenclast(xmm_result, xmm_temp);
2521
2522    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);   // store into the next 16 bytes of output
2523    // no need to store r to memory until we exit
2524    __ addptr(pos, AESBlockSize);
2525    __ subptr(len_reg, AESBlockSize);
2526    __ jcc(Assembler::notEqual, L_loopTop_256);
2527    __ jmp(L_exit);
2528
2529    return start;
2530  }
2531
2532
2533  // CBC AES Decryption.
2534  // In 32-bit stub, because of lack of registers we do not try to parallelize 4 blocks at a time.
2535  //
2536  // Arguments:
2537  //
2538  // Inputs:
2539  //   c_rarg0   - source byte array address
2540  //   c_rarg1   - destination byte array address
2541  //   c_rarg2   - K (key) in little endian int array
2542  //   c_rarg3   - r vector byte array address
2543  //   c_rarg4   - input length
2544  //
2545  // Output:
2546  //   rax       - input length
2547  //
2548
2549  address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
2550    assert(UseAES, "need AES instructions and misaligned SSE support");
2551    __ align(CodeEntryAlignment);
2552    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2553    address start = __ pc();
2554
2555    const Register from        = rsi;      // source array address
2556    const Register to          = rdx;      // destination array address
2557    const Register key         = rcx;      // key array address
2558    const Register rvec        = rdi;      // r byte array initialized from initvector array address
2559                                           // and left with the results of the last encryption block
2560    const Register len_reg     = rbx;      // src len (must be multiple of blocksize 16)
2561    const Register pos         = rax;
2562
2563    const int PARALLEL_FACTOR = 4;
2564    const int ROUNDS[3] = { 10, 12, 14 }; //aes rounds for key128, key192, key256
2565
2566    Label L_exit;
2567    Label L_singleBlock_loopTop[3]; //128, 192, 256
2568    Label L_multiBlock_loopTop[3]; //128, 192, 256
2569
2570    const XMMRegister xmm_prev_block_cipher = xmm0; // holds cipher of previous block
2571    const XMMRegister xmm_key_shuf_mask = xmm1;
2572
2573    const XMMRegister xmm_key_tmp0 = xmm2;
2574    const XMMRegister xmm_key_tmp1 = xmm3;
2575
2576    // registers holding the six results in the parallelized loop
2577    const XMMRegister xmm_result0 = xmm4;
2578    const XMMRegister xmm_result1 = xmm5;
2579    const XMMRegister xmm_result2 = xmm6;
2580    const XMMRegister xmm_result3 = xmm7;
2581
2582    __ enter(); // required for proper stackwalking of RuntimeStub frame
2583    handleSOERegisters(true /*saving*/);
2584
2585    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
2586    // context for the registers used, where all instructions below are using 128-bit mode
2587    // On EVEX without VL and BW, these instructions will all be AVX.
2588    if (VM_Version::supports_avx512vlbw()) {
2589      __ movl(rdx, 0xffff);
2590      __ kmovdl(k1, rdx);
2591    }
2592
2593    // load registers from incoming parameters
2594    const Address  from_param(rbp, 8+0);
2595    const Address  to_param  (rbp, 8+4);
2596    const Address  key_param (rbp, 8+8);
2597    const Address  rvec_param (rbp, 8+12);
2598    const Address  len_param  (rbp, 8+16);
2599
2600    __ movptr(from , from_param);
2601    __ movptr(to   , to_param);
2602    __ movptr(key  , key_param);
2603    __ movptr(rvec , rvec_param);
2604    __ movptr(len_reg , len_param);
2605
2606    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2607    __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // initialize with initial rvec
2608
2609    __ xorptr(pos, pos);
2610
2611    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
2612    // rvec is reused
2613    __ movl(rvec, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2614    __ cmpl(rvec, 52);
2615    __ jcc(Assembler::equal, L_multiBlock_loopTop[1]);
2616    __ cmpl(rvec, 60);
2617    __ jcc(Assembler::equal, L_multiBlock_loopTop[2]);
2618
2619#define DoFour(opc, src_reg)           \
2620  __ opc(xmm_result0, src_reg);         \
2621  __ opc(xmm_result1, src_reg);         \
2622  __ opc(xmm_result2, src_reg);         \
2623  __ opc(xmm_result3, src_reg);         \
2624
2625    for (int k = 0; k < 3; ++k) {
2626      __ align(OptoLoopAlignment);
2627      __ BIND(L_multiBlock_loopTop[k]);
2628      __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
2629      __ jcc(Assembler::less, L_singleBlock_loopTop[k]);
2630
2631      __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); // get next 4 blocks into xmmresult registers
2632      __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
2633      __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
2634      __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
2635
2636      // the java expanded key ordering is rotated one position from what we want
2637      // so we start from 0x10 here and hit 0x00 last
2638      load_key(xmm_key_tmp0, key, 0x10, xmm_key_shuf_mask);
2639      DoFour(pxor, xmm_key_tmp0); //xor with first key
2640      // do the aes dec rounds
2641      for (int rnum = 1; rnum <= ROUNDS[k];) {
2642        //load two keys at a time
2643        //k1->0x20, ..., k9->0xa0, k10->0x00
2644        load_key(xmm_key_tmp1, key, (rnum + 1) * 0x10, xmm_key_shuf_mask);
2645        load_key(xmm_key_tmp0, key, ((rnum + 2) % (ROUNDS[k] + 1)) * 0x10, xmm_key_shuf_mask); // hit 0x00 last!
2646        DoFour(aesdec, xmm_key_tmp1);
2647        rnum++;
2648        if (rnum != ROUNDS[k]) {
2649          DoFour(aesdec, xmm_key_tmp0);
2650        }
2651        else {
2652          DoFour(aesdeclast, xmm_key_tmp0);
2653        }
2654        rnum++;
2655      }
2656
2657      // for each result, xor with the r vector of previous cipher block
2658      __ pxor(xmm_result0, xmm_prev_block_cipher);
2659      __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0 * AESBlockSize));
2660      __ pxor(xmm_result1, xmm_prev_block_cipher);
2661      __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1 * AESBlockSize));
2662      __ pxor(xmm_result2, xmm_prev_block_cipher);
2663      __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2 * AESBlockSize));
2664      __ pxor(xmm_result3, xmm_prev_block_cipher);
2665      __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3 * AESBlockSize)); // this will carry over to next set of blocks
2666
2667            // store 4 results into the next 64 bytes of output
2668       __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
2669       __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
2670       __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
2671       __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
2672
2673       __ addptr(pos, 4 * AESBlockSize);
2674       __ subptr(len_reg, 4 * AESBlockSize);
2675       __ jmp(L_multiBlock_loopTop[k]);
2676
2677       //singleBlock starts here
2678       __ align(OptoLoopAlignment);
2679       __ BIND(L_singleBlock_loopTop[k]);
2680       __ cmpptr(len_reg, 0); // any blocks left?
2681       __ jcc(Assembler::equal, L_exit);
2682       __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
2683       __ movdqa(xmm_result1, xmm_result0);
2684
2685       load_key(xmm_key_tmp0, key, 0x10, xmm_key_shuf_mask);
2686       __ pxor(xmm_result0, xmm_key_tmp0);
2687       // do the aes dec rounds
2688       for (int rnum = 1; rnum < ROUNDS[k]; rnum++) {
2689         // the java expanded key ordering is rotated one position from what we want
2690         load_key(xmm_key_tmp0, key, (rnum + 1) * 0x10, xmm_key_shuf_mask);
2691         __ aesdec(xmm_result0, xmm_key_tmp0);
2692       }
2693       load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
2694       __ aesdeclast(xmm_result0, xmm_key_tmp0);
2695       __ pxor(xmm_result0, xmm_prev_block_cipher); // xor with the current r vector
2696       __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result0); // store into the next 16 bytes of output
2697       // no need to store r to memory until we exit
2698       __ movdqa(xmm_prev_block_cipher, xmm_result1); // set up next r vector with cipher input from this block
2699
2700       __ addptr(pos, AESBlockSize);
2701       __ subptr(len_reg, AESBlockSize);
2702       __ jmp(L_singleBlock_loopTop[k]);
2703    }//for 128/192/256
2704
2705    __ BIND(L_exit);
2706    __ movptr(rvec, rvec_param);                        // restore this since reused earlier
2707    __ movdqu(Address(rvec, 0), xmm_prev_block_cipher); // final value of r stored in rvec of CipherBlockChaining object
2708    handleSOERegisters(false /*restoring*/);
2709    __ movptr(rax, len_param);                          // return length
2710    __ leave();                                         // required for proper stackwalking of RuntimeStub frame
2711    __ ret(0);
2712
2713    return start;
2714  }
2715
2716  // CTR AES crypt.
2717  // In 32-bit stub, parallelize 4 blocks at a time
2718  // Arguments:
2719  //
2720  // Inputs:
2721  //   c_rarg0   - source byte array address
2722  //   c_rarg1   - destination byte array address
2723  //   c_rarg2   - K (key) in little endian int array
2724  //   c_rarg3   - counter vector byte array address
2725  //   c_rarg4   - input length
2726  //
2727  // Output:
2728  //   rax       - input length
2729  //
2730  address generate_counterMode_AESCrypt_Parallel() {
2731    assert(UseAES, "need AES instructions and misaligned SSE support");
2732    __ align(CodeEntryAlignment);
2733    StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
2734    address start = __ pc();
2735    const Register from        = rsi;      // source array address
2736    const Register to          = rdx;      // destination array address
2737    const Register key         = rcx;      // key array address
2738    const Register counter     = rdi;      // counter byte array initialized from initvector array address
2739                                           // and updated with the incremented counter in the end
2740    const Register len_reg     = rbx;
2741    const Register pos         = rax;
2742
2743    __ enter(); // required for proper stackwalking of RuntimeStub frame
2744    handleSOERegisters(true /*saving*/); // save rbx, rsi, rdi
2745
2746    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
2747    // context for the registers used, where all instructions below are using 128-bit mode
2748    // On EVEX without VL and BW, these instructions will all be AVX.
2749    if (VM_Version::supports_avx512vlbw()) {
2750      __ movl(rdx, 0xffff);
2751      __ kmovdl(k1, rdx);
2752    }
2753
2754    // load registers from incoming parameters
2755    const Address  from_param(rbp, 8+0);
2756    const Address  to_param  (rbp, 8+4);
2757    const Address  key_param (rbp, 8+8);
2758    const Address  rvec_param (rbp, 8+12);
2759    const Address  len_param  (rbp, 8+16);
2760    const Address  saved_counter_param(rbp, 8 + 20);
2761    const Address  used_addr_param(rbp, 8 + 24);
2762
2763    __ movptr(from , from_param);
2764    __ movptr(to   , to_param);
2765    __ movptr(len_reg , len_param);
2766
2767    // Use the partially used encrpyted counter from last invocation
2768    Label L_exit_preLoop, L_preLoop_start;
2769
2770    // Use the registers 'counter' and 'key' here in this preloop
2771    // to hold of last 2 params 'used' and 'saved_encCounter_start'
2772    Register used = counter;
2773    Register saved_encCounter_start = key;
2774    Register used_addr = saved_encCounter_start;
2775
2776    __ movptr(used_addr, used_addr_param);
2777    __ movptr(used, Address(used_addr, 0));
2778    __ movptr(saved_encCounter_start, saved_counter_param);
2779
2780    __ BIND(L_preLoop_start);
2781    __ cmpptr(used, 16);
2782    __ jcc(Assembler::aboveEqual, L_exit_preLoop);
2783    __ cmpptr(len_reg, 0);
2784    __ jcc(Assembler::lessEqual, L_exit_preLoop);
2785    __ movb(rax, Address(saved_encCounter_start, used));
2786    __ xorb(rax, Address(from, 0));
2787    __ movb(Address(to, 0), rax);
2788    __ addptr(from, 1);
2789    __ addptr(to, 1);
2790    __ addptr(used, 1);
2791    __ subptr(len_reg, 1);
2792
2793    __ jmp(L_preLoop_start);
2794
2795    __ BIND(L_exit_preLoop);
2796    __ movptr(used_addr, used_addr_param);
2797    __ movptr(used_addr, used_addr_param);
2798    __ movl(Address(used_addr, 0), used);
2799
2800    // load the parameters 'key' and 'counter'
2801    __ movptr(key, key_param);
2802    __ movptr(counter, rvec_param);
2803
2804    // xmm register assignments for the loops below
2805    const XMMRegister xmm_curr_counter      = xmm0;
2806    const XMMRegister xmm_counter_shuf_mask = xmm1;  // need to be reloaded
2807    const XMMRegister xmm_key_shuf_mask     = xmm2;  // need to be reloaded
2808    const XMMRegister xmm_key               = xmm3;
2809    const XMMRegister xmm_result0           = xmm4;
2810    const XMMRegister xmm_result1           = xmm5;
2811    const XMMRegister xmm_result2           = xmm6;
2812    const XMMRegister xmm_result3           = xmm7;
2813    const XMMRegister xmm_from0             = xmm1;   //reuse XMM register
2814    const XMMRegister xmm_from1             = xmm2;
2815    const XMMRegister xmm_from2             = xmm3;
2816    const XMMRegister xmm_from3             = xmm4;
2817
2818    //for key_128, key_192, key_256
2819    const int rounds[3] = {10, 12, 14};
2820    Label L_singleBlockLoopTop[3];
2821    Label L_multiBlock_loopTop[3];
2822    Label L_key192_top, L_key256_top;
2823    Label L_incCounter[3][4]; // 3: different key length,  4: 4 blocks at a time
2824    Label L_incCounter_single[3]; //for single block, key128, key192, key256
2825    Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
2826    Label L_processTail_extr[3], L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];
2827
2828    Label L_exit;
2829    const int PARALLEL_FACTOR = 4;  //because of the limited register number
2830
2831    // initialize counter with initial counter
2832    __ movdqu(xmm_curr_counter, Address(counter, 0x00));
2833    __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
2834    __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled for increase
2835
2836    // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
2837    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2838    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2839    __ cmpl(rax, 52);
2840    __ jcc(Assembler::equal, L_key192_top);
2841    __ cmpl(rax, 60);
2842    __ jcc(Assembler::equal, L_key256_top);
2843
2844    //key128 begins here
2845    __ movptr(pos, 0); // init pos before L_multiBlock_loopTop
2846
2847#define CTR_DoFour(opc, src_reg)               \
2848    __ opc(xmm_result0, src_reg);              \
2849    __ opc(xmm_result1, src_reg);              \
2850    __ opc(xmm_result2, src_reg);              \
2851    __ opc(xmm_result3, src_reg);
2852
2853    // k == 0 :  generate code for key_128
2854    // k == 1 :  generate code for key_192
2855    // k == 2 :  generate code for key_256
2856    for (int k = 0; k < 3; ++k) {
2857      //multi blocks starts here
2858      __ align(OptoLoopAlignment);
2859      __ BIND(L_multiBlock_loopTop[k]);
2860      __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left
2861      __ jcc(Assembler::less, L_singleBlockLoopTop[k]);
2862
2863      __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2864      __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
2865
2866      //load, then increase counters
2867      CTR_DoFour(movdqa, xmm_curr_counter);
2868      __ push(rbx);
2869      inc_counter(rbx, xmm_result1, 0x01, L_incCounter[k][0]);
2870      inc_counter(rbx, xmm_result2, 0x02, L_incCounter[k][1]);
2871      inc_counter(rbx, xmm_result3, 0x03, L_incCounter[k][2]);
2872      inc_counter(rbx, xmm_curr_counter, 0x04, L_incCounter[k][3]);
2873      __ pop (rbx);
2874
2875      load_key(xmm_key, key, 0x00, xmm_key_shuf_mask); // load Round 0 key. interleaving for better performance
2876
2877      CTR_DoFour(pshufb, xmm_counter_shuf_mask); // after increased, shuffled counters back for PXOR
2878      CTR_DoFour(pxor, xmm_key);   //PXOR with Round 0 key
2879
2880      for (int i = 1; i < rounds[k]; ++i) {
2881        load_key(xmm_key, key, (0x10 * i), xmm_key_shuf_mask);
2882        CTR_DoFour(aesenc, xmm_key);
2883      }
2884      load_key(xmm_key, key, (0x10 * rounds[k]), xmm_key_shuf_mask);
2885      CTR_DoFour(aesenclast, xmm_key);
2886
2887      // get next PARALLEL_FACTOR blocks into xmm_from registers
2888      __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
2889      __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
2890      __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
2891
2892      // PXOR with input text
2893      __ pxor(xmm_result0, xmm_from0); //result0 is xmm4
2894      __ pxor(xmm_result1, xmm_from1);
2895      __ pxor(xmm_result2, xmm_from2);
2896
2897      // store PARALLEL_FACTOR results into the next 64 bytes of output
2898      __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
2899      __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
2900      __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
2901
2902      // do it here after xmm_result0 is saved, because xmm_from3 reuse the same register of xmm_result0.
2903      __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
2904      __ pxor(xmm_result3, xmm_from3);
2905      __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
2906
2907      __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // increase the length of crypt text
2908      __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length
2909      __ jmp(L_multiBlock_loopTop[k]);
2910
2911      // singleBlock starts here
2912      __ align(OptoLoopAlignment);
2913      __ BIND(L_singleBlockLoopTop[k]);
2914      __ cmpptr(len_reg, 0);
2915      __ jcc(Assembler::equal, L_exit);
2916      __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2917      __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
2918      __ movdqa(xmm_result0, xmm_curr_counter);
2919      load_key(xmm_key, key, 0x00, xmm_key_shuf_mask);
2920      __ push(rbx);//rbx is used for increasing counter
2921      inc_counter(rbx, xmm_curr_counter, 0x01, L_incCounter_single[k]);
2922      __ pop (rbx);
2923      __ pshufb(xmm_result0, xmm_counter_shuf_mask);
2924      __ pxor(xmm_result0, xmm_key);
2925      for (int i = 1; i < rounds[k]; i++) {
2926        load_key(xmm_key, key, (0x10 * i), xmm_key_shuf_mask);
2927        __ aesenc(xmm_result0, xmm_key);
2928      }
2929      load_key(xmm_key, key, (0x10 * rounds[k]), xmm_key_shuf_mask);
2930      __ aesenclast(xmm_result0, xmm_key);
2931      __ cmpptr(len_reg, AESBlockSize);
2932      __ jcc(Assembler::less, L_processTail_insr[k]);
2933        __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
2934        __ pxor(xmm_result0, xmm_from0);
2935        __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
2936        __ addptr(pos, AESBlockSize);
2937        __ subptr(len_reg, AESBlockSize);
2938        __ jmp(L_singleBlockLoopTop[k]);
2939
2940      __ BIND(L_processTail_insr[k]);                                               // Process the tail part of the input array
2941        __ addptr(pos, len_reg);                                                    // 1. Insert bytes from src array into xmm_from0 register
2942        __ testptr(len_reg, 8);
2943        __ jcc(Assembler::zero, L_processTail_4_insr[k]);
2944          __ subptr(pos,8);
2945          __ pinsrd(xmm_from0, Address(from, pos), 0);
2946          __ pinsrd(xmm_from0, Address(from, pos, Address::times_1, 4), 1);
2947        __ BIND(L_processTail_4_insr[k]);
2948        __ testptr(len_reg, 4);
2949        __ jcc(Assembler::zero, L_processTail_2_insr[k]);
2950          __ subptr(pos,4);
2951          __ pslldq(xmm_from0, 4);
2952          __ pinsrd(xmm_from0, Address(from, pos), 0);
2953        __ BIND(L_processTail_2_insr[k]);
2954        __ testptr(len_reg, 2);
2955        __ jcc(Assembler::zero, L_processTail_1_insr[k]);
2956          __ subptr(pos, 2);
2957          __ pslldq(xmm_from0, 2);
2958          __ pinsrw(xmm_from0, Address(from, pos), 0);
2959        __ BIND(L_processTail_1_insr[k]);
2960        __ testptr(len_reg, 1);
2961        __ jcc(Assembler::zero, L_processTail_exit_insr[k]);
2962          __ subptr(pos, 1);
2963          __ pslldq(xmm_from0, 1);
2964          __ pinsrb(xmm_from0, Address(from, pos), 0);
2965        __ BIND(L_processTail_exit_insr[k]);
2966
2967        __ movptr(saved_encCounter_start, saved_counter_param);
2968        __ movdqu(Address(saved_encCounter_start, 0), xmm_result0);               // 2. Perform pxor of the encrypted counter and plaintext Bytes.
2969        __ pxor(xmm_result0, xmm_from0);                                          //    Also the encrypted counter is saved for next invocation.
2970
2971        __ testptr(len_reg, 8);
2972        __ jcc(Assembler::zero, L_processTail_4_extr[k]);                        // 3. Extract bytes from xmm_result0 into the dest. array
2973          __ pextrd(Address(to, pos), xmm_result0, 0);
2974          __ pextrd(Address(to, pos, Address::times_1, 4), xmm_result0, 1);
2975          __ psrldq(xmm_result0, 8);
2976          __ addptr(pos, 8);
2977        __ BIND(L_processTail_4_extr[k]);
2978        __ testptr(len_reg, 4);
2979        __ jcc(Assembler::zero, L_processTail_2_extr[k]);
2980          __ pextrd(Address(to, pos), xmm_result0, 0);
2981          __ psrldq(xmm_result0, 4);
2982          __ addptr(pos, 4);
2983        __ BIND(L_processTail_2_extr[k]);
2984        __ testptr(len_reg, 2);
2985        __ jcc(Assembler::zero, L_processTail_1_extr[k]);
2986          __ pextrb(Address(to, pos), xmm_result0, 0);
2987          __ pextrb(Address(to, pos, Address::times_1, 1), xmm_result0, 1);
2988          __ psrldq(xmm_result0, 2);
2989          __ addptr(pos, 2);
2990        __ BIND(L_processTail_1_extr[k]);
2991        __ testptr(len_reg, 1);
2992        __ jcc(Assembler::zero, L_processTail_exit_extr[k]);
2993          __ pextrb(Address(to, pos), xmm_result0, 0);
2994
2995        __ BIND(L_processTail_exit_extr[k]);
2996        __ movptr(used_addr, used_addr_param);
2997        __ movl(Address(used_addr, 0), len_reg);
2998        __ jmp(L_exit);
2999    }
3000
3001    __ BIND(L_exit);
3002    __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
3003    __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back.
3004    __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back
3005    handleSOERegisters(false /*restoring*/);
3006    __ movptr(rax, len_param); // return length
3007    __ leave();                // required for proper stackwalking of RuntimeStub frame
3008    __ ret(0);
3009
3010    __ BIND (L_key192_top);
3011    __ movptr(pos, 0); // init pos before L_multiBlock_loopTop
3012    __ jmp(L_multiBlock_loopTop[1]); //key192
3013
3014    __ BIND (L_key256_top);
3015    __ movptr(pos, 0); // init pos before L_multiBlock_loopTop
3016    __ jmp(L_multiBlock_loopTop[2]); //key192
3017
3018    return start;
3019  }
3020
3021  address generate_upper_word_mask() {
3022    __ align(64);
3023    StubCodeMark mark(this, "StubRoutines", "upper_word_mask");
3024    address start = __ pc();
3025    __ emit_data(0x00000000, relocInfo::none, 0);
3026    __ emit_data(0x00000000, relocInfo::none, 0);
3027    __ emit_data(0x00000000, relocInfo::none, 0);
3028    __ emit_data(0xFFFFFFFF, relocInfo::none, 0);
3029    return start;
3030  }
3031
3032  address generate_shuffle_byte_flip_mask() {
3033    __ align(64);
3034    StubCodeMark mark(this, "StubRoutines", "shuffle_byte_flip_mask");
3035    address start = __ pc();
3036    __ emit_data(0x0c0d0e0f, relocInfo::none, 0);
3037    __ emit_data(0x08090a0b, relocInfo::none, 0);
3038    __ emit_data(0x04050607, relocInfo::none, 0);
3039    __ emit_data(0x00010203, relocInfo::none, 0);
3040    return start;
3041  }
3042
3043  // ofs and limit are use for multi-block byte array.
3044  // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
3045  address generate_sha1_implCompress(bool multi_block, const char *name) {
3046    __ align(CodeEntryAlignment);
3047    StubCodeMark mark(this, "StubRoutines", name);
3048    address start = __ pc();
3049
3050    Register buf   = rax;
3051    Register state = rdx;
3052    Register ofs   = rcx;
3053    Register limit = rdi;
3054
3055    const Address  buf_param(rbp, 8 + 0);
3056    const Address  state_param(rbp, 8 + 4);
3057    const Address  ofs_param(rbp, 8 + 8);
3058    const Address  limit_param(rbp, 8 + 12);
3059
3060    const XMMRegister abcd = xmm0;
3061    const XMMRegister e0 = xmm1;
3062    const XMMRegister e1 = xmm2;
3063    const XMMRegister msg0 = xmm3;
3064
3065    const XMMRegister msg1 = xmm4;
3066    const XMMRegister msg2 = xmm5;
3067    const XMMRegister msg3 = xmm6;
3068    const XMMRegister shuf_mask = xmm7;
3069
3070    __ enter();
3071    __ subptr(rsp, 8 * wordSize);
3072    if (multi_block) {
3073      __ push(limit);
3074    }
3075    __ movptr(buf, buf_param);
3076    __ movptr(state, state_param);
3077    if (multi_block) {
3078      __ movptr(ofs, ofs_param);
3079      __ movptr(limit, limit_param);
3080    }
3081
3082    __ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask,
3083      buf, state, ofs, limit, rsp, multi_block);
3084
3085    if (multi_block) {
3086      __ pop(limit);
3087    }
3088    __ addptr(rsp, 8 * wordSize);
3089    __ leave();
3090    __ ret(0);
3091    return start;
3092  }
3093
3094  address generate_pshuffle_byte_flip_mask() {
3095    __ align(64);
3096    StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask");
3097    address start = __ pc();
3098    __ emit_data(0x00010203, relocInfo::none, 0);
3099    __ emit_data(0x04050607, relocInfo::none, 0);
3100    __ emit_data(0x08090a0b, relocInfo::none, 0);
3101    __ emit_data(0x0c0d0e0f, relocInfo::none, 0);
3102    return start;
3103  }
3104
3105  // ofs and limit are use for multi-block byte array.
3106  // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
3107 address generate_sha256_implCompress(bool multi_block, const char *name) {
3108    __ align(CodeEntryAlignment);
3109    StubCodeMark mark(this, "StubRoutines", name);
3110    address start = __ pc();
3111
3112    Register buf = rbx;
3113    Register state = rsi;
3114    Register ofs = rdx;
3115    Register limit = rcx;
3116
3117    const Address  buf_param(rbp, 8 + 0);
3118    const Address  state_param(rbp, 8 + 4);
3119    const Address  ofs_param(rbp, 8 + 8);
3120    const Address  limit_param(rbp, 8 + 12);
3121
3122    const XMMRegister msg = xmm0;
3123    const XMMRegister state0 = xmm1;
3124    const XMMRegister state1 = xmm2;
3125    const XMMRegister msgtmp0 = xmm3;
3126
3127    const XMMRegister msgtmp1 = xmm4;
3128    const XMMRegister msgtmp2 = xmm5;
3129    const XMMRegister msgtmp3 = xmm6;
3130    const XMMRegister msgtmp4 = xmm7;
3131
3132    __ enter();
3133    __ subptr(rsp, 8 * wordSize);
3134    handleSOERegisters(true /*saving*/);
3135    __ movptr(buf, buf_param);
3136    __ movptr(state, state_param);
3137    if (multi_block) {
3138     __ movptr(ofs, ofs_param);
3139     __ movptr(limit, limit_param);
3140    }
3141
3142    __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
3143      buf, state, ofs, limit, rsp, multi_block);
3144
3145    handleSOERegisters(false);
3146    __ addptr(rsp, 8 * wordSize);
3147    __ leave();
3148    __ ret(0);
3149    return start;
3150  }
3151
3152  // byte swap x86 long
3153  address generate_ghash_long_swap_mask() {
3154    __ align(CodeEntryAlignment);
3155    StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
3156    address start = __ pc();
3157    __ emit_data(0x0b0a0908, relocInfo::none, 0);
3158    __ emit_data(0x0f0e0d0c, relocInfo::none, 0);
3159    __ emit_data(0x03020100, relocInfo::none, 0);
3160    __ emit_data(0x07060504, relocInfo::none, 0);
3161
3162  return start;
3163  }
3164
3165  // byte swap x86 byte array
3166  address generate_ghash_byte_swap_mask() {
3167    __ align(CodeEntryAlignment);
3168    StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
3169    address start = __ pc();
3170    __ emit_data(0x0c0d0e0f, relocInfo::none, 0);
3171    __ emit_data(0x08090a0b, relocInfo::none, 0);
3172    __ emit_data(0x04050607, relocInfo::none, 0);
3173    __ emit_data(0x00010203, relocInfo::none, 0);
3174  return start;
3175  }
3176
3177  /* Single and multi-block ghash operations */
3178  address generate_ghash_processBlocks() {
3179    assert(UseGHASHIntrinsics, "need GHASH intrinsics and CLMUL support");
3180    __ align(CodeEntryAlignment);
3181    Label L_ghash_loop, L_exit;
3182    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
3183    address start = __ pc();
3184
3185    const Register state        = rdi;
3186    const Register subkeyH      = rsi;
3187    const Register data         = rdx;
3188    const Register blocks       = rcx;
3189
3190    const Address  state_param(rbp, 8+0);
3191    const Address  subkeyH_param(rbp, 8+4);
3192    const Address  data_param(rbp, 8+8);
3193    const Address  blocks_param(rbp, 8+12);
3194
3195    const XMMRegister xmm_temp0 = xmm0;
3196    const XMMRegister xmm_temp1 = xmm1;
3197    const XMMRegister xmm_temp2 = xmm2;
3198    const XMMRegister xmm_temp3 = xmm3;
3199    const XMMRegister xmm_temp4 = xmm4;
3200    const XMMRegister xmm_temp5 = xmm5;
3201    const XMMRegister xmm_temp6 = xmm6;
3202    const XMMRegister xmm_temp7 = xmm7;
3203
3204    __ enter();
3205    handleSOERegisters(true);  // Save registers
3206
3207    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
3208    // context for the registers used, where all instructions below are using 128-bit mode
3209    // On EVEX without VL and BW, these instructions will all be AVX.
3210    if (VM_Version::supports_avx512vlbw()) {
3211      __ movl(rdx, 0xffff);
3212      __ kmovdl(k1, rdx);
3213    }
3214
3215    __ movptr(state, state_param);
3216    __ movptr(subkeyH, subkeyH_param);
3217    __ movptr(data, data_param);
3218    __ movptr(blocks, blocks_param);
3219
3220    __ movdqu(xmm_temp0, Address(state, 0));
3221    __ pshufb(xmm_temp0, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
3222
3223    __ movdqu(xmm_temp1, Address(subkeyH, 0));
3224    __ pshufb(xmm_temp1, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
3225
3226    __ BIND(L_ghash_loop);
3227    __ movdqu(xmm_temp2, Address(data, 0));
3228    __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
3229
3230    __ pxor(xmm_temp0, xmm_temp2);
3231
3232    //
3233    // Multiply with the hash key
3234    //
3235    __ movdqu(xmm_temp3, xmm_temp0);
3236    __ pclmulqdq(xmm_temp3, xmm_temp1, 0);      // xmm3 holds a0*b0
3237    __ movdqu(xmm_temp4, xmm_temp0);
3238    __ pclmulqdq(xmm_temp4, xmm_temp1, 16);     // xmm4 holds a0*b1
3239
3240    __ movdqu(xmm_temp5, xmm_temp0);
3241    __ pclmulqdq(xmm_temp5, xmm_temp1, 1);      // xmm5 holds a1*b0
3242    __ movdqu(xmm_temp6, xmm_temp0);
3243    __ pclmulqdq(xmm_temp6, xmm_temp1, 17);     // xmm6 holds a1*b1
3244
3245    __ pxor(xmm_temp4, xmm_temp5);      // xmm4 holds a0*b1 + a1*b0
3246
3247    __ movdqu(xmm_temp5, xmm_temp4);    // move the contents of xmm4 to xmm5
3248    __ psrldq(xmm_temp4, 8);    // shift by xmm4 64 bits to the right
3249    __ pslldq(xmm_temp5, 8);    // shift by xmm5 64 bits to the left
3250    __ pxor(xmm_temp3, xmm_temp5);
3251    __ pxor(xmm_temp6, xmm_temp4);      // Register pair <xmm6:xmm3> holds the result
3252                                        // of the carry-less multiplication of
3253                                        // xmm0 by xmm1.
3254
3255    // We shift the result of the multiplication by one bit position
3256    // to the left to cope for the fact that the bits are reversed.
3257    __ movdqu(xmm_temp7, xmm_temp3);
3258    __ movdqu(xmm_temp4, xmm_temp6);
3259    __ pslld (xmm_temp3, 1);
3260    __ pslld(xmm_temp6, 1);
3261    __ psrld(xmm_temp7, 31);
3262    __ psrld(xmm_temp4, 31);
3263    __ movdqu(xmm_temp5, xmm_temp7);
3264    __ pslldq(xmm_temp4, 4);
3265    __ pslldq(xmm_temp7, 4);
3266    __ psrldq(xmm_temp5, 12);
3267    __ por(xmm_temp3, xmm_temp7);
3268    __ por(xmm_temp6, xmm_temp4);
3269    __ por(xmm_temp6, xmm_temp5);
3270
3271    //
3272    // First phase of the reduction
3273    //
3274    // Move xmm3 into xmm4, xmm5, xmm7 in order to perform the shifts
3275    // independently.
3276    __ movdqu(xmm_temp7, xmm_temp3);
3277    __ movdqu(xmm_temp4, xmm_temp3);
3278    __ movdqu(xmm_temp5, xmm_temp3);
3279    __ pslld(xmm_temp7, 31);    // packed right shift shifting << 31
3280    __ pslld(xmm_temp4, 30);    // packed right shift shifting << 30
3281    __ pslld(xmm_temp5, 25);    // packed right shift shifting << 25
3282    __ pxor(xmm_temp7, xmm_temp4);      // xor the shifted versions
3283    __ pxor(xmm_temp7, xmm_temp5);
3284    __ movdqu(xmm_temp4, xmm_temp7);
3285    __ pslldq(xmm_temp7, 12);
3286    __ psrldq(xmm_temp4, 4);
3287    __ pxor(xmm_temp3, xmm_temp7);      // first phase of the reduction complete
3288
3289    //
3290    // Second phase of the reduction
3291    //
3292    // Make 3 copies of xmm3 in xmm2, xmm5, xmm7 for doing these
3293    // shift operations.
3294    __ movdqu(xmm_temp2, xmm_temp3);
3295    __ movdqu(xmm_temp7, xmm_temp3);
3296    __ movdqu(xmm_temp5, xmm_temp3);
3297    __ psrld(xmm_temp2, 1);     // packed left shifting >> 1
3298    __ psrld(xmm_temp7, 2);     // packed left shifting >> 2
3299    __ psrld(xmm_temp5, 7);     // packed left shifting >> 7
3300    __ pxor(xmm_temp2, xmm_temp7);      // xor the shifted versions
3301    __ pxor(xmm_temp2, xmm_temp5);
3302    __ pxor(xmm_temp2, xmm_temp4);
3303    __ pxor(xmm_temp3, xmm_temp2);
3304    __ pxor(xmm_temp6, xmm_temp3);      // the result is in xmm6
3305
3306    __ decrement(blocks);
3307    __ jcc(Assembler::zero, L_exit);
3308    __ movdqu(xmm_temp0, xmm_temp6);
3309    __ addptr(data, 16);
3310    __ jmp(L_ghash_loop);
3311
3312    __ BIND(L_exit);
3313       // Byte swap 16-byte result
3314    __ pshufb(xmm_temp6, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
3315    __ movdqu(Address(state, 0), xmm_temp6);   // store the result
3316
3317    handleSOERegisters(false);  // restore registers
3318    __ leave();
3319    __ ret(0);
3320    return start;
3321  }
3322
3323  /**
3324   *  Arguments:
3325   *
3326   * Inputs:
3327   *   rsp(4)   - int crc
3328   *   rsp(8)   - byte* buf
3329   *   rsp(12)  - int length
3330   *
3331   * Ouput:
3332   *       rax   - int crc result
3333   */
3334  address generate_updateBytesCRC32() {
3335    assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");
3336
3337    __ align(CodeEntryAlignment);
3338    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3339
3340    address start = __ pc();
3341
3342    const Register crc   = rdx;  // crc
3343    const Register buf   = rsi;  // source java byte array address
3344    const Register len   = rcx;  // length
3345    const Register table = rdi;  // crc_table address (reuse register)
3346    const Register tmp   = rbx;
3347    assert_different_registers(crc, buf, len, table, tmp, rax);
3348
3349    BLOCK_COMMENT("Entry:");
3350    __ enter(); // required for proper stackwalking of RuntimeStub frame
3351    __ push(rsi);
3352    __ push(rdi);
3353    __ push(rbx);
3354
3355    Address crc_arg(rbp, 8 + 0);
3356    Address buf_arg(rbp, 8 + 4);
3357    Address len_arg(rbp, 8 + 8);
3358
3359    // Load up:
3360    __ movl(crc,   crc_arg);
3361    __ movptr(buf, buf_arg);
3362    __ movl(len,   len_arg);
3363
3364    __ kernel_crc32(crc, buf, len, table, tmp);
3365
3366    __ movl(rax, crc);
3367    __ pop(rbx);
3368    __ pop(rdi);
3369    __ pop(rsi);
3370    __ vzeroupper();
3371    __ leave(); // required for proper stackwalking of RuntimeStub frame
3372    __ ret(0);
3373
3374    return start;
3375  }
3376
3377  /**
3378  *  Arguments:
3379  *
3380  * Inputs:
3381  *   rsp(4)   - int crc
3382  *   rsp(8)   - byte* buf
3383  *   rsp(12)  - int length
3384  *   rsp(16)  - table_start - optional (present only when doing a library_calll,
3385  *              not used by x86 algorithm)
3386  *
3387  * Ouput:
3388  *       rax  - int crc result
3389  */
3390  address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {
3391    assert(UseCRC32CIntrinsics, "need SSE4_2");
3392    __ align(CodeEntryAlignment);
3393    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3394    address start = __ pc();
3395    const Register crc = rax;  // crc
3396    const Register buf = rcx;  // source java byte array address
3397    const Register len = rdx;  // length
3398    const Register d = rbx;
3399    const Register g = rsi;
3400    const Register h = rdi;
3401    const Register empty = 0; // will never be used, in order not
3402                              // to change a signature for crc32c_IPL_Alg2_Alt2
3403                              // between 64/32 I'm just keeping it here
3404    assert_different_registers(crc, buf, len, d, g, h);
3405
3406    BLOCK_COMMENT("Entry:");
3407    __ enter(); // required for proper stackwalking of RuntimeStub frame
3408    Address crc_arg(rsp, 4 + 4 + 0); // ESP+4 +
3409                                     // we need to add additional 4 because __ enter
3410                                     // have just pushed ebp on a stack
3411    Address buf_arg(rsp, 4 + 4 + 4);
3412    Address len_arg(rsp, 4 + 4 + 8);
3413      // Load up:
3414      __ movl(crc, crc_arg);
3415      __ movl(buf, buf_arg);
3416      __ movl(len, len_arg);
3417      __ push(d);
3418      __ push(g);
3419      __ push(h);
3420      __ crc32c_ipl_alg2_alt2(crc, buf, len,
3421                              d, g, h,
3422                              empty, empty, empty,
3423                              xmm0, xmm1, xmm2,
3424                              is_pclmulqdq_supported);
3425      __ pop(h);
3426      __ pop(g);
3427      __ pop(d);
3428    __ vzeroupper();
3429    __ leave(); // required for proper stackwalking of RuntimeStub frame
3430    __ ret(0);
3431
3432    return start;
3433  }
3434
3435 address generate_libmExp() {
3436    address start = __ pc();
3437
3438    const XMMRegister x0  = xmm0;
3439    const XMMRegister x1  = xmm1;
3440    const XMMRegister x2  = xmm2;
3441    const XMMRegister x3  = xmm3;
3442
3443    const XMMRegister x4  = xmm4;
3444    const XMMRegister x5  = xmm5;
3445    const XMMRegister x6  = xmm6;
3446    const XMMRegister x7  = xmm7;
3447
3448    const Register tmp   = rbx;
3449
3450    BLOCK_COMMENT("Entry:");
3451    __ enter(); // required for proper stackwalking of RuntimeStub frame
3452    __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
3453    __ leave(); // required for proper stackwalking of RuntimeStub frame
3454    __ ret(0);
3455
3456    return start;
3457
3458  }
3459
3460 address generate_libmLog() {
3461   address start = __ pc();
3462
3463   const XMMRegister x0 = xmm0;
3464   const XMMRegister x1 = xmm1;
3465   const XMMRegister x2 = xmm2;
3466   const XMMRegister x3 = xmm3;
3467
3468   const XMMRegister x4 = xmm4;
3469   const XMMRegister x5 = xmm5;
3470   const XMMRegister x6 = xmm6;
3471   const XMMRegister x7 = xmm7;
3472
3473   const Register tmp = rbx;
3474
3475   BLOCK_COMMENT("Entry:");
3476   __ enter(); // required for proper stackwalking of RuntimeStub frame
3477   __ fast_log(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
3478   __ leave(); // required for proper stackwalking of RuntimeStub frame
3479   __ ret(0);
3480
3481   return start;
3482
3483 }
3484
3485 address generate_libmLog10() {
3486   address start = __ pc();
3487
3488   const XMMRegister x0 = xmm0;
3489   const XMMRegister x1 = xmm1;
3490   const XMMRegister x2 = xmm2;
3491   const XMMRegister x3 = xmm3;
3492
3493   const XMMRegister x4 = xmm4;
3494   const XMMRegister x5 = xmm5;
3495   const XMMRegister x6 = xmm6;
3496   const XMMRegister x7 = xmm7;
3497
3498   const Register tmp = rbx;
3499
3500   BLOCK_COMMENT("Entry:");
3501   __ enter(); // required for proper stackwalking of RuntimeStub frame
3502   __ fast_log10(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
3503   __ leave(); // required for proper stackwalking of RuntimeStub frame
3504   __ ret(0);
3505
3506   return start;
3507
3508 }
3509
3510 address generate_libmPow() {
3511   address start = __ pc();
3512
3513   const XMMRegister x0 = xmm0;
3514   const XMMRegister x1 = xmm1;
3515   const XMMRegister x2 = xmm2;
3516   const XMMRegister x3 = xmm3;
3517
3518   const XMMRegister x4 = xmm4;
3519   const XMMRegister x5 = xmm5;
3520   const XMMRegister x6 = xmm6;
3521   const XMMRegister x7 = xmm7;
3522
3523   const Register tmp = rbx;
3524
3525   BLOCK_COMMENT("Entry:");
3526   __ enter(); // required for proper stackwalking of RuntimeStub frame
3527   __ fast_pow(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
3528   __ leave(); // required for proper stackwalking of RuntimeStub frame
3529   __ ret(0);
3530
3531   return start;
3532
3533 }
3534
3535 address generate_libm_reduce_pi04l() {
3536   address start = __ pc();
3537
3538   BLOCK_COMMENT("Entry:");
3539   __ libm_reduce_pi04l(rax, rcx, rdx, rbx, rsi, rdi, rbp, rsp);
3540
3541   return start;
3542
3543 }
3544
3545 address generate_libm_sin_cos_huge() {
3546   address start = __ pc();
3547
3548   const XMMRegister x0 = xmm0;
3549   const XMMRegister x1 = xmm1;
3550
3551   BLOCK_COMMENT("Entry:");
3552   __ libm_sincos_huge(x0, x1, rax, rcx, rdx, rbx, rsi, rdi, rbp, rsp);
3553
3554   return start;
3555
3556 }
3557
3558 address generate_libmSin() {
3559   address start = __ pc();
3560
3561   const XMMRegister x0 = xmm0;
3562   const XMMRegister x1 = xmm1;
3563   const XMMRegister x2 = xmm2;
3564   const XMMRegister x3 = xmm3;
3565
3566   const XMMRegister x4 = xmm4;
3567   const XMMRegister x5 = xmm5;
3568   const XMMRegister x6 = xmm6;
3569   const XMMRegister x7 = xmm7;
3570
3571   BLOCK_COMMENT("Entry:");
3572   __ enter(); // required for proper stackwalking of RuntimeStub frame
3573   __ fast_sin(x0, x1, x2, x3, x4, x5, x6, x7, rax, rbx, rdx);
3574   __ leave(); // required for proper stackwalking of RuntimeStub frame
3575   __ ret(0);
3576
3577   return start;
3578
3579 }
3580
3581 address generate_libmCos() {
3582   address start = __ pc();
3583
3584   const XMMRegister x0 = xmm0;
3585   const XMMRegister x1 = xmm1;
3586   const XMMRegister x2 = xmm2;
3587   const XMMRegister x3 = xmm3;
3588
3589   const XMMRegister x4 = xmm4;
3590   const XMMRegister x5 = xmm5;
3591   const XMMRegister x6 = xmm6;
3592   const XMMRegister x7 = xmm7;
3593
3594   const Register tmp = rbx;
3595
3596   BLOCK_COMMENT("Entry:");
3597   __ enter(); // required for proper stackwalking of RuntimeStub frame
3598   __ fast_cos(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
3599   __ leave(); // required for proper stackwalking of RuntimeStub frame
3600   __ ret(0);
3601
3602   return start;
3603
3604 }
3605
3606 address generate_libm_tan_cot_huge() {
3607   address start = __ pc();
3608
3609   const XMMRegister x0 = xmm0;
3610   const XMMRegister x1 = xmm1;
3611
3612   BLOCK_COMMENT("Entry:");
3613   __ libm_tancot_huge(x0, x1, rax, rcx, rdx, rbx, rsi, rdi, rbp, rsp);
3614
3615   return start;
3616
3617 }
3618
3619 address generate_libmTan() {
3620   address start = __ pc();
3621
3622   const XMMRegister x0 = xmm0;
3623   const XMMRegister x1 = xmm1;
3624   const XMMRegister x2 = xmm2;
3625   const XMMRegister x3 = xmm3;
3626
3627   const XMMRegister x4 = xmm4;
3628   const XMMRegister x5 = xmm5;
3629   const XMMRegister x6 = xmm6;
3630   const XMMRegister x7 = xmm7;
3631
3632   const Register tmp = rbx;
3633
3634   BLOCK_COMMENT("Entry:");
3635   __ enter(); // required for proper stackwalking of RuntimeStub frame
3636   __ fast_tan(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
3637   __ leave(); // required for proper stackwalking of RuntimeStub frame
3638   __ ret(0);
3639
3640   return start;
3641
3642 }
3643
3644  // Safefetch stubs.
3645  void generate_safefetch(const char* name, int size, address* entry,
3646                          address* fault_pc, address* continuation_pc) {
3647    // safefetch signatures:
3648    //   int      SafeFetch32(int*      adr, int      errValue);
3649    //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3650
3651    StubCodeMark mark(this, "StubRoutines", name);
3652
3653    // Entry point, pc or function descriptor.
3654    *entry = __ pc();
3655
3656    __ movl(rax, Address(rsp, 0x8));
3657    __ movl(rcx, Address(rsp, 0x4));
3658    // Load *adr into eax, may fault.
3659    *fault_pc = __ pc();
3660    switch (size) {
3661      case 4:
3662        // int32_t
3663        __ movl(rax, Address(rcx, 0));
3664        break;
3665      case 8:
3666        // int64_t
3667        Unimplemented();
3668        break;
3669      default:
3670        ShouldNotReachHere();
3671    }
3672
3673    // Return errValue or *adr.
3674    *continuation_pc = __ pc();
3675    __ ret(0);
3676  }
3677
3678 public:
3679  // Information about frame layout at time of blocking runtime call.
3680  // Note that we only have to preserve callee-saved registers since
3681  // the compilers are responsible for supplying a continuation point
3682  // if they expect all registers to be preserved.
3683  enum layout {
3684    thread_off,    // last_java_sp
3685    arg1_off,
3686    arg2_off,
3687    rbp_off,       // callee saved register
3688    ret_pc,
3689    framesize
3690  };
3691
3692 private:
3693
3694#undef  __
3695#define __ masm->
3696
3697  //------------------------------------------------------------------------------------------------------------------------
3698  // Continuation point for throwing of implicit exceptions that are not handled in
3699  // the current activation. Fabricates an exception oop and initiates normal
3700  // exception dispatching in this frame.
3701  //
3702  // Previously the compiler (c2) allowed for callee save registers on Java calls.
3703  // This is no longer true after adapter frames were removed but could possibly
3704  // be brought back in the future if the interpreter code was reworked and it
3705  // was deemed worthwhile. The comment below was left to describe what must
3706  // happen here if callee saves were resurrected. As it stands now this stub
3707  // could actually be a vanilla BufferBlob and have now oopMap at all.
3708  // Since it doesn't make much difference we've chosen to leave it the
3709  // way it was in the callee save days and keep the comment.
3710
3711  // If we need to preserve callee-saved values we need a callee-saved oop map and
3712  // therefore have to make these stubs into RuntimeStubs rather than BufferBlobs.
3713  // If the compiler needs all registers to be preserved between the fault
3714  // point and the exception handler then it must assume responsibility for that in
3715  // AbstractCompiler::continuation_for_implicit_null_exception or
3716  // continuation_for_implicit_division_by_zero_exception. All other implicit
3717  // exceptions (e.g., NullPointerException or AbstractMethodError on entry) are
3718  // either at call sites or otherwise assume that stack unwinding will be initiated,
3719  // so caller saved registers were assumed volatile in the compiler.
3720  address generate_throw_exception(const char* name, address runtime_entry,
3721                                   Register arg1 = noreg, Register arg2 = noreg) {
3722
3723    int insts_size = 256;
3724    int locs_size  = 32;
3725
3726    CodeBuffer code(name, insts_size, locs_size);
3727    OopMapSet* oop_maps  = new OopMapSet();
3728    MacroAssembler* masm = new MacroAssembler(&code);
3729
3730    address start = __ pc();
3731
3732    // This is an inlined and slightly modified version of call_VM
3733    // which has the ability to fetch the return PC out of
3734    // thread-local storage and also sets up last_Java_sp slightly
3735    // differently than the real call_VM
3736    Register java_thread = rbx;
3737    __ get_thread(java_thread);
3738
3739    __ enter(); // required for proper stackwalking of RuntimeStub frame
3740
3741    // pc and rbp, already pushed
3742    __ subptr(rsp, (framesize-2) * wordSize); // prolog
3743
3744    // Frame is now completed as far as size and linkage.
3745
3746    int frame_complete = __ pc() - start;
3747
3748    // push java thread (becomes first argument of C function)
3749    __ movptr(Address(rsp, thread_off * wordSize), java_thread);
3750    if (arg1 != noreg) {
3751      __ movptr(Address(rsp, arg1_off * wordSize), arg1);
3752    }
3753    if (arg2 != noreg) {
3754      assert(arg1 != noreg, "missing reg arg");
3755      __ movptr(Address(rsp, arg2_off * wordSize), arg2);
3756    }
3757
3758    // Set up last_Java_sp and last_Java_fp
3759    __ set_last_Java_frame(java_thread, rsp, rbp, NULL);
3760
3761    // Call runtime
3762    BLOCK_COMMENT("call runtime_entry");
3763    __ call(RuntimeAddress(runtime_entry));
3764    // Generate oop map
3765    OopMap* map =  new OopMap(framesize, 0);
3766    oop_maps->add_gc_map(__ pc() - start, map);
3767
3768    // restore the thread (cannot use the pushed argument since arguments
3769    // may be overwritten by C code generated by an optimizing compiler);
3770    // however can use the register value directly if it is callee saved.
3771    __ get_thread(java_thread);
3772
3773    __ reset_last_Java_frame(java_thread, true);
3774
3775    __ leave(); // required for proper stackwalking of RuntimeStub frame
3776
3777    // check for pending exceptions
3778#ifdef ASSERT
3779    Label L;
3780    __ cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3781    __ jcc(Assembler::notEqual, L);
3782    __ should_not_reach_here();
3783    __ bind(L);
3784#endif /* ASSERT */
3785    __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3786
3787
3788    RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, framesize, oop_maps, false);
3789    return stub->entry_point();
3790  }
3791
3792
3793  void create_control_words() {
3794    // Round to nearest, 53-bit mode, exceptions masked
3795    StubRoutines::_fpu_cntrl_wrd_std   = 0x027F;
3796    // Round to zero, 53-bit mode, exception mased
3797    StubRoutines::_fpu_cntrl_wrd_trunc = 0x0D7F;
3798    // Round to nearest, 24-bit mode, exceptions masked
3799    StubRoutines::_fpu_cntrl_wrd_24    = 0x007F;
3800    // Round to nearest, 64-bit mode, exceptions masked
3801    StubRoutines::_fpu_cntrl_wrd_64    = 0x037F;
3802    // Round to nearest, 64-bit mode, exceptions masked
3803    StubRoutines::_mxcsr_std           = 0x1F80;
3804    // Note: the following two constants are 80-bit values
3805    //       layout is critical for correct loading by FPU.
3806    // Bias for strict fp multiply/divide
3807    StubRoutines::_fpu_subnormal_bias1[0]= 0x00000000; // 2^(-15360) == 0x03ff 8000 0000 0000 0000
3808    StubRoutines::_fpu_subnormal_bias1[1]= 0x80000000;
3809    StubRoutines::_fpu_subnormal_bias1[2]= 0x03ff;
3810    // Un-Bias for strict fp multiply/divide
3811    StubRoutines::_fpu_subnormal_bias2[0]= 0x00000000; // 2^(+15360) == 0x7bff 8000 0000 0000 0000
3812    StubRoutines::_fpu_subnormal_bias2[1]= 0x80000000;
3813    StubRoutines::_fpu_subnormal_bias2[2]= 0x7bff;
3814  }
3815
3816  //---------------------------------------------------------------------------
3817  // Initialization
3818
3819  void generate_initial() {
3820    // Generates all stubs and initializes the entry points
3821
3822    //------------------------------------------------------------------------------------------------------------------------
3823    // entry points that exist in all platforms
3824    // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller than
3825    //       the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp.
3826    StubRoutines::_forward_exception_entry      = generate_forward_exception();
3827
3828    StubRoutines::_call_stub_entry              =
3829      generate_call_stub(StubRoutines::_call_stub_return_address);
3830    // is referenced by megamorphic call
3831    StubRoutines::_catch_exception_entry        = generate_catch_exception();
3832
3833    // These are currently used by Solaris/Intel
3834    StubRoutines::_atomic_xchg_entry            = generate_atomic_xchg();
3835
3836    // platform dependent
3837    create_control_words();
3838
3839    StubRoutines::x86::_verify_mxcsr_entry                 = generate_verify_mxcsr();
3840    StubRoutines::x86::_verify_fpu_cntrl_wrd_entry         = generate_verify_fpu_cntrl_wrd();
3841    StubRoutines::_d2i_wrapper                              = generate_d2i_wrapper(T_INT,
3842                                                                                   CAST_FROM_FN_PTR(address, SharedRuntime::d2i));
3843    StubRoutines::_d2l_wrapper                              = generate_d2i_wrapper(T_LONG,
3844                                                                                   CAST_FROM_FN_PTR(address, SharedRuntime::d2l));
3845
3846    // Build this early so it's available for the interpreter
3847    StubRoutines::_throw_StackOverflowError_entry          = generate_throw_exception("StackOverflowError throw_exception",
3848                                                                                      CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
3849    StubRoutines::_throw_delayed_StackOverflowError_entry  = generate_throw_exception("delayed StackOverflowError throw_exception",
3850                                                                                      CAST_FROM_FN_PTR(address, SharedRuntime::throw_delayed_StackOverflowError));
3851
3852    if (UseCRC32Intrinsics) {
3853      // set table address before stub generation which use it
3854      StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
3855      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
3856    }
3857
3858    if (UseCRC32CIntrinsics) {
3859      bool supports_clmul = VM_Version::supports_clmul();
3860      StubRoutines::x86::generate_CRC32C_table(supports_clmul);
3861      StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
3862      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
3863    }
3864    if (VM_Version::supports_sse2() && UseLibmIntrinsic && InlineIntrinsics) {
3865      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
3866          vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos) ||
3867          vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
3868        StubRoutines::x86::_L_2il0floatpacket_0_adr = (address)StubRoutines::x86::_L_2il0floatpacket_0;
3869        StubRoutines::x86::_Pi4Inv_adr = (address)StubRoutines::x86::_Pi4Inv;
3870        StubRoutines::x86::_Pi4x3_adr = (address)StubRoutines::x86::_Pi4x3;
3871        StubRoutines::x86::_Pi4x4_adr = (address)StubRoutines::x86::_Pi4x4;
3872        StubRoutines::x86::_ones_adr = (address)StubRoutines::x86::_ones;
3873      }
3874      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dexp)) {
3875        StubRoutines::_dexp = generate_libmExp();
3876      }
3877      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
3878        StubRoutines::_dlog = generate_libmLog();
3879      }
3880      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog10)) {
3881        StubRoutines::_dlog10 = generate_libmLog10();
3882      }
3883      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dpow)) {
3884        StubRoutines::_dpow = generate_libmPow();
3885      }
3886      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
3887        vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos) ||
3888        vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
3889        StubRoutines::_dlibm_reduce_pi04l = generate_libm_reduce_pi04l();
3890      }
3891      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
3892        vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
3893        StubRoutines::_dlibm_sin_cos_huge = generate_libm_sin_cos_huge();
3894      }
3895      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
3896        StubRoutines::_dsin = generate_libmSin();
3897      }
3898      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
3899        StubRoutines::_dcos = generate_libmCos();
3900      }
3901      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
3902        StubRoutines::_dlibm_tan_cot_huge = generate_libm_tan_cot_huge();
3903        StubRoutines::_dtan = generate_libmTan();
3904      }
3905    }
3906  }
3907
3908  void generate_all() {
3909    // Generates all stubs and initializes the entry points
3910
3911    // These entry points require SharedInfo::stack0 to be set up in non-core builds
3912    // and need to be relocatable, so they each fabricate a RuntimeStub internally.
3913    StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError));
3914    StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError));
3915    StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call));
3916
3917    //------------------------------------------------------------------------------------------------------------------------
3918    // entry points that are platform specific
3919
3920    // support for verify_oop (must happen after universe_init)
3921    StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
3922
3923    // arraycopy stubs used by compilers
3924    generate_arraycopy_stubs();
3925
3926    // don't bother generating these AES intrinsic stubs unless global flag is set
3927    if (UseAESIntrinsics) {
3928      StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // might be needed by the others
3929
3930      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
3931      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
3932      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
3933      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
3934    }
3935
3936    if (UseAESCTRIntrinsics) {
3937      StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();
3938      StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
3939    }
3940
3941    if (UseSHA1Intrinsics) {
3942      StubRoutines::x86::_upper_word_mask_addr = generate_upper_word_mask();
3943      StubRoutines::x86::_shuffle_byte_flip_mask_addr = generate_shuffle_byte_flip_mask();
3944      StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
3945      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
3946    }
3947    if (UseSHA256Intrinsics) {
3948      StubRoutines::x86::_k256_adr = (address)StubRoutines::x86::_k256;
3949      StubRoutines::x86::_pshuffle_byte_flip_mask_addr = generate_pshuffle_byte_flip_mask();
3950      StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
3951      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
3952    }
3953
3954    // Generate GHASH intrinsics code
3955    if (UseGHASHIntrinsics) {
3956      StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
3957      StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
3958      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
3959    }
3960
3961    // Safefetch stubs.
3962    generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
3963                                                   &StubRoutines::_safefetch32_fault_pc,
3964                                                   &StubRoutines::_safefetch32_continuation_pc);
3965    StubRoutines::_safefetchN_entry           = StubRoutines::_safefetch32_entry;
3966    StubRoutines::_safefetchN_fault_pc        = StubRoutines::_safefetch32_fault_pc;
3967    StubRoutines::_safefetchN_continuation_pc = StubRoutines::_safefetch32_continuation_pc;
3968  }
3969
3970
3971 public:
3972  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
3973    if (all) {
3974      generate_all();
3975    } else {
3976      generate_initial();
3977    }
3978  }
3979}; // end class declaration
3980
3981
3982void StubGenerator_generate(CodeBuffer* code, bool all) {
3983  StubGenerator g(code, all);
3984}
3985