sharedRuntime_x86_64.cpp revision 13244:ebbb31f0437e
1/*
2 * Copyright (c) 2003, 2017, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25#include "precompiled.hpp"
26#ifndef _WINDOWS
27#include "alloca.h"
28#endif
29#include "asm/macroAssembler.hpp"
30#include "asm/macroAssembler.inline.hpp"
31#include "code/debugInfoRec.hpp"
32#include "code/icBuffer.hpp"
33#include "code/vtableStubs.hpp"
34#include "interpreter/interpreter.hpp"
35#include "logging/log.hpp"
36#include "memory/resourceArea.hpp"
37#include "oops/compiledICHolder.hpp"
38#include "runtime/sharedRuntime.hpp"
39#include "runtime/vframeArray.hpp"
40#include "vmreg_x86.inline.hpp"
41#ifdef COMPILER1
42#include "c1/c1_Runtime1.hpp"
43#endif
44#ifdef COMPILER2
45#include "opto/runtime.hpp"
46#endif
47#if INCLUDE_JVMCI
48#include "jvmci/jvmciJavaClasses.hpp"
49#endif
50#include "vm_version_x86.hpp"
51
52#define __ masm->
53
54const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
55
56class SimpleRuntimeFrame {
57
58  public:
59
60  // Most of the runtime stubs have this simple frame layout.
61  // This class exists to make the layout shared in one place.
62  // Offsets are for compiler stack slots, which are jints.
63  enum layout {
64    // The frame sender code expects that rbp will be in the "natural" place and
65    // will override any oopMap setting for it. We must therefore force the layout
66    // so that it agrees with the frame sender code.
67    rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
68    rbp_off2,
69    return_off, return_off2,
70    framesize
71  };
72};
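// A worked instance (illustrative only, assuming frame::arg_reg_save_area_bytes == 0,
// the non-Windows case): rbp_off = 0, rbp_off2 = 1, return_off = 2, return_off2 = 3,
// framesize = 4 compiler slots, i.e. two 8-byte words holding the saved rbp and the
// return address.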
73
74class RegisterSaver {
75  // Capture info about frame layout.  Layout offsets are in jint
76  // units because compiler frame slots are jints.
77#define XSAVE_AREA_BEGIN 160
78#define XSAVE_AREA_YMM_BEGIN 576
79#define XSAVE_AREA_ZMM_BEGIN 1152
80#define XSAVE_AREA_UPPERBANK 1664
81#define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
82#define DEF_YMM_OFFS(regnum) ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
83#define DEF_ZMM_OFFS(regnum) zmm ## regnum ## _off = zmm_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
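// For reference, an illustrative expansion of the macros above:
//   DEF_XMM_OFFS(1)  =>  xmm1_off = xmm_off + (1)*16/BytesPerInt, xmm1H_off
// i.e. each XMM slot pair advances by 16 bytes (4 jint slots) within the fxsave
// image, while DEF_ZMM_OFFS strides by 64 bytes starting at register 16.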
84  enum layout {
85    fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
86    xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,            // offset in fxsave save area
87    DEF_XMM_OFFS(0),
88    DEF_XMM_OFFS(1),
89    // 2..15 are implied in range usage
90    ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
91    DEF_YMM_OFFS(0),
92    DEF_YMM_OFFS(1),
93    // 2..15 are implied in range usage
94    zmm_high = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
95    zmm_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
96    DEF_ZMM_OFFS(16),
97    DEF_ZMM_OFFS(17),
98    // 18..31 are implied in range usage
99    fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
100    fpu_stateH_end,
101    r15_off, r15H_off,
102    r14_off, r14H_off,
103    r13_off, r13H_off,
104    r12_off, r12H_off,
105    r11_off, r11H_off,
106    r10_off, r10H_off,
107    r9_off,  r9H_off,
108    r8_off,  r8H_off,
109    rdi_off, rdiH_off,
110    rsi_off, rsiH_off,
111    ignore_off, ignoreH_off,  // extra copy of rbp
112    rsp_off, rspH_off,
113    rbx_off, rbxH_off,
114    rdx_off, rdxH_off,
115    rcx_off, rcxH_off,
116    rax_off, raxH_off,
117    // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
118    align_off, alignH_off,
119    flags_off, flagsH_off,
120    // The frame sender code expects that rbp will be in the "natural" place and
121    // will override any oopMap setting for it. We must therefore force the layout
122    // so that it agrees with the frame sender code.
123    rbp_off, rbpH_off,        // copy of rbp we will restore
124    return_off, returnH_off,  // slot for return address
125    reg_save_size             // size in compiler stack slots
126  };
127
128 public:
129  static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors = false);
130  static void restore_live_registers(MacroAssembler* masm, bool restore_vectors = false);
131
132  // Offsets into the register save area
133  // Used by deoptimization when it is managing result register
134  // values on its own
135
136  static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
137  static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
138  static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
139  static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
140  static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
141
142  // During deoptimization only the result registers need to be restored,
143  // all the other values have already been extracted.
144  static void restore_result_registers(MacroAssembler* masm);
145};
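// A sketch of the typical usage pattern in the stubs below (illustrative, not a
// verbatim excerpt):
//   int frame_size_in_words;
//   OopMap* map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
//   // ... emit the VM call and record `map` at the call's pc offset ...
//   RegisterSaver::restore_live_registers(masm);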
146
147OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) {
148  int off = 0;
149  int num_xmm_regs = XMMRegisterImpl::number_of_registers;
150  if (UseAVX < 3) {
151    num_xmm_regs = num_xmm_regs/2;
152  }
153#if defined(COMPILER2) || INCLUDE_JVMCI
154  if (save_vectors) {
155    assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
156    assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
157  }
158#else
159  assert(!save_vectors, "vectors are generated only by C2 and JVMCI");
160#endif
161
162  // Always make the frame size 16-byte aligned; both vector and non-vector save frames are allocated this way.
163  int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
164  // OopMap frame size is in compiler stack slots (jint's) not bytes or words
165  int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
166  // CodeBlob frame size is in words.
167  int frame_size_in_words = frame_size_in_bytes / wordSize;
168  *total_frame_words = frame_size_in_words;
169
170  // Save registers, fpu state, and flags.
171  // We assume caller has already pushed the return address onto the
172  // stack, so rsp is 8-byte aligned here.
173  // We push rbp twice in this sequence because we want the real rbp
174  // to be under the return address, just as a normal enter would leave it.
175
176  __ enter();          // rsp becomes 16-byte aligned here
177  __ push_CPU_state(); // Push a multiple of 16 bytes
178
179  // push_CPU_state already handles this on EVEX-enabled targets
180  if (save_vectors) {
181    // Save upper half of YMM registers(0..15)
182    int base_addr = XSAVE_AREA_YMM_BEGIN;
183    for (int n = 0; n < 16; n++) {
184      __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
185    }
186    if (VM_Version::supports_evex()) {
187      // Save upper half of ZMM registers(0..15)
188      base_addr = XSAVE_AREA_ZMM_BEGIN;
189      for (int n = 0; n < 16; n++) {
190        __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
191      }
192      // Save full ZMM registers(16..num_xmm_regs)
193      base_addr = XSAVE_AREA_UPPERBANK;
194      off = 0;
195      int vector_len = Assembler::AVX_512bit;
196      for (int n = 16; n < num_xmm_regs; n++) {
197        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
198      }
199    }
200  } else {
201    if (VM_Version::supports_evex()) {
202      // Save upper bank of ZMM registers(16..31) for double/float usage
203      int base_addr = XSAVE_AREA_UPPERBANK;
204      off = 0;
205      for (int n = 16; n < num_xmm_regs; n++) {
206        __ movsd(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n));
207      }
208    }
209  }
210  __ vzeroupper();
211  if (frame::arg_reg_save_area_bytes != 0) {
212    // Allocate argument register save area
213    __ subptr(rsp, frame::arg_reg_save_area_bytes);
214  }
215
216  // Set an oopmap for the call site.  This oopmap will map all
217  // oop-registers and debug-info registers as callee-saved.  This
218  // will allow deoptimization at this safepoint to find all possible
219  // debug-info recordings, as well as let GC find all oops.
220
221  OopMapSet *oop_maps = new OopMapSet();
222  OopMap* map = new OopMap(frame_size_in_slots, 0);
223
224#define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
225
226  map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
227  map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
228  map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
229  map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
230  // rbp location is known implicitly by the frame sender code, needs no oopmap
231  // and the location where rbp was saved is ignored
232  map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
233  map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
234  map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
235  map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
236  map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
237  map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
238  map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
239  map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
240  map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
241  map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
242  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
243  // on EVEX-enabled targets it is already included in the XSAVE area
244  off = xmm0_off;
245  int delta = xmm1_off - off;
246  for (int n = 0; n < 16; n++) {
247    XMMRegister xmm_name = as_XMMRegister(n);
248    map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
249    off += delta;
250  }
251  if (UseAVX > 2) {
252    // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
253    off = zmm16_off;
254    delta = zmm17_off - off;
255    for (int n = 16; n < num_xmm_regs; n++) {
256      XMMRegister zmm_name = as_XMMRegister(n);
257      map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
258      off += delta;
259    }
260  }
261
262#if defined(COMPILER2) || INCLUDE_JVMCI
263  if (save_vectors) {
264    off = ymm0_off;
265    int delta = ymm1_off - off;
266    for (int n = 0; n < 16; n++) {
267      XMMRegister ymm_name = as_XMMRegister(n);
268      map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
269      off += delta;
270    }
271  }
272#endif // COMPILER2 || INCLUDE_JVMCI
273
274  // %%% These should all be a waste but we'll keep things as they were for now
275  if (true) {
276    map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
277    map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
278    map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
279    map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
280    // rbp location is known implicitly by the frame sender code, needs no oopmap
281    map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
282    map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
283    map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
284    map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
285    map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
286    map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
287    map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
288    map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
289    map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
290    map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
291    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
292    // on EVEX-enabled targets it is already included in the XSAVE area
293    off = xmm0H_off;
294    delta = xmm1H_off - off;
295    for (int n = 0; n < 16; n++) {
296      XMMRegister xmm_name = as_XMMRegister(n);
297      map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
298      off += delta;
299    }
300    if (UseAVX > 2) {
301      // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
302      off = zmm16H_off;
303      delta = zmm17H_off - off;
304      for (int n = 16; n < num_xmm_regs; n++) {
305        XMMRegister zmm_name = as_XMMRegister(n);
306        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
307        off += delta;
308      }
309    }
310  }
311
312  return map;
313}
314
315void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) {
316  int num_xmm_regs = XMMRegisterImpl::number_of_registers;
317  if (UseAVX < 3) {
318    num_xmm_regs = num_xmm_regs/2;
319  }
320  if (frame::arg_reg_save_area_bytes != 0) {
321    // Pop arg register save area
322    __ addptr(rsp, frame::arg_reg_save_area_bytes);
323  }
324
325#if defined(COMPILER2) || INCLUDE_JVMCI
326  if (restore_vectors) {
327    assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
328    assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
329  }
330#else
331  assert(!restore_vectors, "vectors are generated only by C2");
332#endif
333
334  __ vzeroupper();
335
336  // On EVEX enabled targets everything is handled in pop fpu state
337  if (restore_vectors) {
338    // Restore upper half of YMM registers (0..15)
339    int base_addr = XSAVE_AREA_YMM_BEGIN;
340    for (int n = 0; n < 16; n++) {
341      __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
342    }
343    if (VM_Version::supports_evex()) {
344      // Restore upper half of ZMM registers (0..15)
345      base_addr = XSAVE_AREA_ZMM_BEGIN;
346      for (int n = 0; n < 16; n++) {
347        __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
348      }
349      // Restore full ZMM registers(16..num_xmm_regs)
350      base_addr = XSAVE_AREA_UPPERBANK;
351      int vector_len = Assembler::AVX_512bit;
352      int off = 0;
353      for (int n = 16; n < num_xmm_regs; n++) {
354        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
355      }
356    }
357  } else {
358    if (VM_Version::supports_evex()) {
359      // Restore upper bank of ZMM registers(16..31) for double/float usage
360      int base_addr = XSAVE_AREA_UPPERBANK;
361      int off = 0;
362      for (int n = 16; n < num_xmm_regs; n++) {
363        __ movsd(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)));
364      }
365    }
366  }
367
368  // Recover CPU state
369  __ pop_CPU_state();
370  // Get the rbp described implicitly by the calling convention (no oopMap)
371  __ pop(rbp);
372}
373
374void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
375
376  // Just restore result register. Only used by deoptimization. By
377  // now any callee save register that needs to be restored to a c2
378  // caller of the deoptee has been extracted into the vframeArray
379  // and will be stuffed into the c2i adapter we create for later
380  // restoration so only result registers need to be restored here.
381
382  // Restore fp result register
383  __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
384  // Restore integer result register
385  __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
386  __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
387
388  // Pop all of the register save area off the stack except the return address
389  __ addptr(rsp, return_offset_in_bytes());
390}
391
392// Is vector's size (in bytes) bigger than a size saved by default?
393// 16 bytes XMM registers are saved by default using fxsave/fxrstor instructions.
394bool SharedRuntime::is_wide_vector(int size) {
395  return size > 16;
396}
397
398size_t SharedRuntime::trampoline_size() {
399  return 16;
400}
401
402void SharedRuntime::generate_trampoline(MacroAssembler *masm, address destination) {
403  __ jump(RuntimeAddress(destination));
404}
405
406// The java_calling_convention describes stack locations as ideal slots on
407// a frame with no ABI restrictions. Since we must observe ABI restrictions
408// (such as the saved rbp and return address) the slots must be biased by
409// the following value.
410static int reg2offset_in(VMReg r) {
411  // Account for saved rbp and return address
412  // This should really be in_preserve_stack_slots
413  return (r->reg2stack() + 4) * VMRegImpl::stack_slot_size;
414}
415
416static int reg2offset_out(VMReg r) {
417  return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
418}
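// A minimal worked example (assuming 4-byte VMReg stack slots): an incoming
// argument in caller stack slot 0 maps to rbp + (0 + 4) * 4 = rbp + 16, i.e.
// just past the saved rbp (8 bytes) and the return address (8 bytes).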
419
420// ---------------------------------------------------------------------------
421// Read the array of BasicTypes from a signature, and compute where the
422// arguments should go.  Values in the VMRegPair regs array refer to 4-byte
423// quantities.  Values less than VMRegImpl::stack0 are registers, those above
424// refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
425// as framesizes are fixed.
426// VMRegImpl::stack0 refers to the first slot 0(sp),
427// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.  Register
428// values up to RegisterImpl::number_of_registers are the 64-bit
429// integer registers.
430
431// Note: the INPUTS in sig_bt are in units of Java argument words, which are
432// either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
433// units regardless of build. Of course for i486 there is no 64 bit build
434
435// The Java calling convention is a "shifted" version of the C ABI.
436// By skipping the first C ABI register we can call non-static jni methods
437// with small numbers of arguments without having to shuffle the arguments
438// at all. Since we control the java ABI we ought to at least get some
439// advantage out of it.
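// As a hedged illustration of the mapping below (not an exhaustive spec): for the
// signature (int, long, double, Object) the sig_bt stream is
//   T_INT, T_LONG, T_VOID, T_DOUBLE, T_VOID, T_OBJECT
// and the loop assigns
//   T_INT    -> j_rarg0 (set1)
//   T_LONG   -> j_rarg1 (set2; the trailing T_VOID half is set_bad)
//   T_DOUBLE -> j_farg0 (set2; trailing T_VOID half is set_bad)
//   T_OBJECT -> j_rarg2 (set2)
// with no stack slots consumed, so the routine returns 0.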
440
441int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
442                                           VMRegPair *regs,
443                                           int total_args_passed,
444                                           int is_outgoing) {
445
446  // Create the mapping between argument positions and
447  // registers.
448  static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
449    j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
450  };
451  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
452    j_farg0, j_farg1, j_farg2, j_farg3,
453    j_farg4, j_farg5, j_farg6, j_farg7
454  };
455
456
457  uint int_args = 0;
458  uint fp_args = 0;
459  uint stk_args = 0; // inc by 2 each time
460
461  for (int i = 0; i < total_args_passed; i++) {
462    switch (sig_bt[i]) {
463    case T_BOOLEAN:
464    case T_CHAR:
465    case T_BYTE:
466    case T_SHORT:
467    case T_INT:
468      if (int_args < Argument::n_int_register_parameters_j) {
469        regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
470      } else {
471        regs[i].set1(VMRegImpl::stack2reg(stk_args));
472        stk_args += 2;
473      }
474      break;
475    case T_VOID:
476      // halves of T_LONG or T_DOUBLE
477      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
478      regs[i].set_bad();
479      break;
480    case T_LONG:
481      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
482      // fall through
483    case T_OBJECT:
484    case T_ARRAY:
485    case T_ADDRESS:
486      if (int_args < Argument::n_int_register_parameters_j) {
487        regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
488      } else {
489        regs[i].set2(VMRegImpl::stack2reg(stk_args));
490        stk_args += 2;
491      }
492      break;
493    case T_FLOAT:
494      if (fp_args < Argument::n_float_register_parameters_j) {
495        regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
496      } else {
497        regs[i].set1(VMRegImpl::stack2reg(stk_args));
498        stk_args += 2;
499      }
500      break;
501    case T_DOUBLE:
502      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
503      if (fp_args < Argument::n_float_register_parameters_j) {
504        regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
505      } else {
506        regs[i].set2(VMRegImpl::stack2reg(stk_args));
507        stk_args += 2;
508      }
509      break;
510    default:
511      ShouldNotReachHere();
512      break;
513    }
514  }
515
516  return align_up(stk_args, 2);
517}
518
519// Patch the callers callsite with entry to compiled code if it exists.
520static void patch_callers_callsite(MacroAssembler *masm) {
521  Label L;
522  __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
523  __ jcc(Assembler::equal, L);
524
525  // Save the current stack pointer
526  __ mov(r13, rsp);
527  // Schedule the branch target address early.
528  // Call into the VM to patch the caller, then jump to compiled callee
529  // rax isn't live so capture return address while we easily can
530  __ movptr(rax, Address(rsp, 0));
531
532  // align stack so push_CPU_state doesn't fault
533  __ andptr(rsp, -(StackAlignmentInBytes));
534  __ push_CPU_state();
535  __ vzeroupper();
536  // VM needs caller's callsite
537  // VM needs target method
538  // This needs to be a long call since we will relocate this adapter to
539  // the codeBuffer and it may not reach
540
541  // Allocate argument register save area
542  if (frame::arg_reg_save_area_bytes != 0) {
543    __ subptr(rsp, frame::arg_reg_save_area_bytes);
544  }
545  __ mov(c_rarg0, rbx);
546  __ mov(c_rarg1, rax);
547  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
548
549  // De-allocate argument register save area
550  if (frame::arg_reg_save_area_bytes != 0) {
551    __ addptr(rsp, frame::arg_reg_save_area_bytes);
552  }
553
554  __ vzeroupper();
555  __ pop_CPU_state();
556  // restore sp
557  __ mov(rsp, r13);
558  __ bind(L);
559}
560
561
562static void gen_c2i_adapter(MacroAssembler *masm,
563                            int total_args_passed,
564                            int comp_args_on_stack,
565                            const BasicType *sig_bt,
566                            const VMRegPair *regs,
567                            Label& skip_fixup) {
568  // Before we get into the guts of the C2I adapter, see if we should be here
569  // at all.  We've come from compiled code and are attempting to jump to the
570  // interpreter, which means the caller made a static call to get here
571  // (vcalls always get a compiled target if there is one).  Check for a
572  // compiled target.  If there is one, we need to patch the caller's call.
573  patch_callers_callsite(masm);
574
575  __ bind(skip_fixup);
576
577  // Since all args are passed on the stack, total_args_passed *
578  // Interpreter::stackElementSize is the space we need, plus one word because
579  // we also account for the return address location since
580  // we store it first rather than hold it in rax across all the shuffling
581
582  int extraspace = (total_args_passed * Interpreter::stackElementSize) + wordSize;
583
584  // stack is aligned, keep it that way
585  extraspace = align_up(extraspace, 2*wordSize);
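  // Illustrative arithmetic only (assuming an 8-byte Interpreter::stackElementSize):
  // with 3 arguments, extraspace = 3*8 + 8 = 32 bytes, already a multiple of
  // 2*wordSize, so the align_up above leaves it at 32.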
586
587  // Get return address
588  __ pop(rax);
589
590  // set senderSP value
591  __ mov(r13, rsp);
592
593  __ subptr(rsp, extraspace);
594
595  // Store the return address in the expected location
596  __ movptr(Address(rsp, 0), rax);
597
598  // Now write the args into the outgoing interpreter space
599  for (int i = 0; i < total_args_passed; i++) {
600    if (sig_bt[i] == T_VOID) {
601      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
602      continue;
603    }
604
605    // offset to start parameters
606    int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
607    int next_off = st_off - Interpreter::stackElementSize;
608
609    // Say 4 args:
610    // i   st_off
611    // 0   32 T_LONG
612    // 1   24 T_VOID
613    // 2   16 T_OBJECT
614    // 3    8 T_BOOL
615    // -    0 return address
616    //
617    // However, to make things extra confusing: because we can fit a long/double in
618    // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
619    // leaves one slot empty and only stores to a single slot. In this case the
620    // slot that is occupied is the T_VOID slot. See, I said it was confusing.
621
622    VMReg r_1 = regs[i].first();
623    VMReg r_2 = regs[i].second();
624    if (!r_1->is_valid()) {
625      assert(!r_2->is_valid(), "");
626      continue;
627    }
628    if (r_1->is_stack()) {
629      // memory to memory use rax
630      int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
631      if (!r_2->is_valid()) {
632        // sign extend??
633        __ movl(rax, Address(rsp, ld_off));
634        __ movptr(Address(rsp, st_off), rax);
635
636      } else {
637
638        __ movq(rax, Address(rsp, ld_off));
639
640        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
641        // T_DOUBLE and T_LONG use two slots in the interpreter
642        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
643          // ld_off == LSW, ld_off+wordSize == MSW
644          // st_off == MSW, next_off == LSW
645          __ movq(Address(rsp, next_off), rax);
646#ifdef ASSERT
647          // Overwrite the unused slot with known junk
648          __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
649          __ movptr(Address(rsp, st_off), rax);
650#endif /* ASSERT */
651        } else {
652          __ movq(Address(rsp, st_off), rax);
653        }
654      }
655    } else if (r_1->is_Register()) {
656      Register r = r_1->as_Register();
657      if (!r_2->is_valid()) {
658        // must be only an int (or less), so move only 32 bits to the slot
659        // why not sign extend??
660        __ movl(Address(rsp, st_off), r);
661      } else {
662        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
663        // T_DOUBLE and T_LONG use two slots in the interpreter
664        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
665          // long/double in gpr
666#ifdef ASSERT
667          // Overwrite the unused slot with known junk
668          __ mov64(rax, CONST64(0xdeadffffdeadaaab));
669          __ movptr(Address(rsp, st_off), rax);
670#endif /* ASSERT */
671          __ movq(Address(rsp, next_off), r);
672        } else {
673          __ movptr(Address(rsp, st_off), r);
674        }
675      }
676    } else {
677      assert(r_1->is_XMMRegister(), "");
678      if (!r_2->is_valid()) {
679        // only a float, so use just part of the slot
680        __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
681      } else {
682#ifdef ASSERT
683        // Overwrite the unused slot with known junk
684        __ mov64(rax, CONST64(0xdeadffffdeadaaac));
685        __ movptr(Address(rsp, st_off), rax);
686#endif /* ASSERT */
687        __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
688      }
689    }
690  }
691
692  // Schedule the branch target address early.
693  __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
694  __ jmp(rcx);
695}
696
697static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
698                        address code_start, address code_end,
699                        Label& L_ok) {
700  Label L_fail;
701  __ lea(temp_reg, ExternalAddress(code_start));
702  __ cmpptr(pc_reg, temp_reg);
703  __ jcc(Assembler::belowEqual, L_fail);
704  __ lea(temp_reg, ExternalAddress(code_end));
705  __ cmpptr(pc_reg, temp_reg);
706  __ jcc(Assembler::below, L_ok);
707  __ bind(L_fail);
708}
709
710void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
711                                    int total_args_passed,
712                                    int comp_args_on_stack,
713                                    const BasicType *sig_bt,
714                                    const VMRegPair *regs) {
715
716  // Note: r13 contains the senderSP on entry. We must preserve it since
717  // we may do an i2c -> c2i transition if we lose a race where compiled
718  // code goes non-entrant while we get args ready.
719  // In addition we use r13 to locate all the interpreter args as
720  // we must align the stack to 16 bytes on an i2c entry else we
721  // lose alignment we expect in all compiled code and register
722  // save code can segv when fxsave instructions find an improperly
723  // aligned stack pointer.
724
725  // Adapters can be frameless because they do not require the caller
726  // to perform additional cleanup work, such as correcting the stack pointer.
727  // An i2c adapter is frameless because the *caller* frame, which is interpreted,
728  // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
729  // even if a callee has modified the stack pointer.
730  // A c2i adapter is frameless because the *callee* frame, which is interpreted,
731  // routinely repairs its caller's stack pointer (from sender_sp, which is set
732  // up via the senderSP register).
733  // In other words, if *either* the caller or callee is interpreted, we can
734  // get the stack pointer repaired after a call.
735  // This is why c2i and i2c adapters cannot be indefinitely composed.
736  // In particular, if a c2i adapter were to somehow call an i2c adapter,
737  // both caller and callee would be compiled methods, and neither would
738  // clean up the stack pointer changes performed by the two adapters.
739  // If this happens, control eventually transfers back to the compiled
740  // caller, but with an uncorrected stack, causing delayed havoc.
741
742  // Pick up the return address
743  __ movptr(rax, Address(rsp, 0));
744
745  if (VerifyAdapterCalls &&
746      (Interpreter::code() != NULL || StubRoutines::code1() != NULL)) {
747    // So, let's test for cascading c2i/i2c adapters right now.
748    //  assert(Interpreter::contains($return_addr) ||
749    //         StubRoutines::contains($return_addr),
750    //         "i2c adapter must return to an interpreter frame");
751    __ block_comment("verify_i2c { ");
752    Label L_ok;
753    if (Interpreter::code() != NULL)
754      range_check(masm, rax, r11,
755                  Interpreter::code()->code_start(), Interpreter::code()->code_end(),
756                  L_ok);
757    if (StubRoutines::code1() != NULL)
758      range_check(masm, rax, r11,
759                  StubRoutines::code1()->code_begin(), StubRoutines::code1()->code_end(),
760                  L_ok);
761    if (StubRoutines::code2() != NULL)
762      range_check(masm, rax, r11,
763                  StubRoutines::code2()->code_begin(), StubRoutines::code2()->code_end(),
764                  L_ok);
765    const char* msg = "i2c adapter must return to an interpreter frame";
766    __ block_comment(msg);
767    __ stop(msg);
768    __ bind(L_ok);
769    __ block_comment("} verify_i2ce ");
770  }
771
772  // Must preserve original SP for loading incoming arguments because
773  // we need to align the outgoing SP for compiled code.
774  __ movptr(r11, rsp);
775
776  // Cut-out for having no stack args.  Since up to 6 int/oop args are passed
777  // in registers, we will occasionally have no stack args.
778  int comp_words_on_stack = 0;
779  if (comp_args_on_stack) {
780    // Sig words on the stack are greater-than VMRegImpl::stack0.  Those in
781    // registers are below.  By subtracting stack0, we either get a negative
782    // number (all values in registers) or the maximum stack slot accessed.
783
784    // Convert 4-byte c2 stack slots to words.
785    comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
786    // Round up to minimum stack alignment, in wordSize
787    comp_words_on_stack = align_up(comp_words_on_stack, 2);
788    __ subptr(rsp, comp_words_on_stack * wordSize);
789  }
790
791
792  // Ensure compiled code always sees stack at proper alignment
793  __ andptr(rsp, -16);
794
795  // Push the return address, which misaligns the stack exactly the way the youngest
796  // frame always sees it relative to the placement of the call instruction.
797  __ push(rax);
798
799  // Put saved SP in another register
800  const Register saved_sp = rax;
801  __ movptr(saved_sp, r11);
802
803  // Will jump to the compiled code just as if compiled code was doing it.
804  // Pre-load the register-jump target early, to schedule it better.
805  __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
806
807#if INCLUDE_JVMCI
808  if (EnableJVMCI || UseAOT) {
809    // check if this call should be routed towards a specific entry point
810    __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
811    Label no_alternative_target;
812    __ jcc(Assembler::equal, no_alternative_target);
813    __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
814    __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
815    __ bind(no_alternative_target);
816  }
817#endif // INCLUDE_JVMCI
818
819  // Now generate the shuffle code.  Pick up all register args and move the
820  // rest through the floating point stack top.
821  for (int i = 0; i < total_args_passed; i++) {
822    if (sig_bt[i] == T_VOID) {
823      // Longs and doubles are passed in native word order, but misaligned
824      // in the 32-bit build.
825      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
826      continue;
827    }
828
829    // Pick up 0, 1 or 2 words from SP+offset.
830
831    assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
832            "scrambled load targets?");
833    // Load in argument order going down.
834    int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
835    // Point to interpreter value (vs. tag)
836    int next_off = ld_off - Interpreter::stackElementSize;
837    //
838    //
839    //
840    VMReg r_1 = regs[i].first();
841    VMReg r_2 = regs[i].second();
842    if (!r_1->is_valid()) {
843      assert(!r_2->is_valid(), "");
844      continue;
845    }
846    if (r_1->is_stack()) {
847      // Convert stack slot to an SP offset (+ wordSize to account for return address )
848      int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
849
850      // We can use r13 as a temp here because compiled code doesn't need r13 as an input
851      // and if we end up going through a c2i because of a miss, a reasonable value of r13
852      // will be generated.
853      if (!r_2->is_valid()) {
854        // sign extend???
855        __ movl(r13, Address(saved_sp, ld_off));
856        __ movptr(Address(rsp, st_off), r13);
857      } else {
858        //
859        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
860        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
861        // So we must adjust where to pick up the data to match the interpreter.
862        //
863        // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
864        // are accessed at negative offsets, so the LSW is at the LOW address
865
866        // ld_off is MSW so get LSW
867        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
868                           next_off : ld_off;
869        __ movq(r13, Address(saved_sp, offset));
870        // st_off is LSW (i.e. reg.first())
871        __ movq(Address(rsp, st_off), r13);
872      }
873    } else if (r_1->is_Register()) {  // Register argument
874      Register r = r_1->as_Register();
875      assert(r != rax, "must be different");
876      if (r_2->is_valid()) {
877        //
878        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
879        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
880        // So we must adjust where to pick up the data to match the interpreter.
881
882        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
883                           next_off : ld_off;
884
885        // this can be a misaligned move
886        __ movq(r, Address(saved_sp, offset));
887      } else {
888        // sign extend and use a full word?
889        __ movl(r, Address(saved_sp, ld_off));
890      }
891    } else {
892      if (!r_2->is_valid()) {
893        __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
894      } else {
895        __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
896      }
897    }
898  }
899
900  // 6243940 We might end up in handle_wrong_method if
901  // the callee is deoptimized as we race thru here. If that
902  // happens we don't want to take a safepoint because the
903  // caller frame will look interpreted and arguments are now
904  // "compiled" so it is much better to make this transition
905  // invisible to the stack walking code. Unfortunately if
906  // we try and find the callee by normal means a safepoint
907  // is possible. So we stash the desired callee in the thread
908  // and the VM will find it there should this case occur.
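  // (SharedRuntime::handle_wrong_method is expected to pick the Method* back up
  // from JavaThread::callee_target() if this race is indeed lost.)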
909
910  __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
911
912  // Put Method* where a c2i would expect it, should we end up there.
913  // This is only needed because c2 resolve stubs return Method* as a result in
914  // rax.
915  __ mov(rax, rbx);
916  __ jmp(r11);
917}
918
919// ---------------------------------------------------------------
920AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
921                                                            int total_args_passed,
922                                                            int comp_args_on_stack,
923                                                            const BasicType *sig_bt,
924                                                            const VMRegPair *regs,
925                                                            AdapterFingerPrint* fingerprint) {
926  address i2c_entry = __ pc();
927
928  gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
929
930  // -------------------------------------------------------------------------
931  // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
932  // to the interpreter.  The args start out packed in the compiled layout.  They
933  // need to be unpacked into the interpreter layout.  This will almost always
934  // require some stack space.  We grow the current (compiled) stack, then repack
935  // the args.  We  finally end in a jump to the generic interpreter entry point.
936  // On exit from the interpreter, the interpreter will restore our SP (lest the
937  // compiled code, which relies solely on SP and not RBP, get sick).
938
939  address c2i_unverified_entry = __ pc();
940  Label skip_fixup;
941  Label ok;
942
943  Register holder = rax;
944  Register receiver = j_rarg0;
945  Register temp = rbx;
946
947  {
948    __ load_klass(temp, receiver);
949    __ cmpptr(temp, Address(holder, CompiledICHolder::holder_klass_offset()));
950    __ movptr(rbx, Address(holder, CompiledICHolder::holder_method_offset()));
951    __ jcc(Assembler::equal, ok);
952    __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
953
954    __ bind(ok);
955    // Method might have been compiled since the call site was patched to
956    // interpreted; if that is the case, treat it as a miss so we can get
957    // the call site corrected.
958    __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
959    __ jcc(Assembler::equal, skip_fixup);
960    __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
961  }
962
963  address c2i_entry = __ pc();
964
965  gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
966
967  __ flush();
968  return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry);
969}
970
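// A hedged sketch of what the mapping below produces: for a native signature
// (JNIEnv*, jobject, jint), i.e. sig_bt = { T_ADDRESS, T_OBJECT, T_INT }, the
// arguments land in c_rarg0, c_rarg1 and c_rarg2. On non-Windows ABIs no outgoing
// stack slots are reserved (the routine returns 0); on _WIN64 each register arg
// also reserves a spill slot and the result is forced up to 8 slots (32 bytes of
// shadow space) by the check at the end of the routine.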
971int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
972                                         VMRegPair *regs,
973                                         VMRegPair *regs2,
974                                         int total_args_passed) {
975  assert(regs2 == NULL, "not needed on x86");
976// We return the number of VMRegImpl stack slots we need to reserve for all
977// the arguments NOT counting out_preserve_stack_slots.
978
979// NOTE: These arrays will have to change when c1 is ported
980#ifdef _WIN64
981    static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
982      c_rarg0, c_rarg1, c_rarg2, c_rarg3
983    };
984    static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
985      c_farg0, c_farg1, c_farg2, c_farg3
986    };
987#else
988    static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
989      c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
990    };
991    static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
992      c_farg0, c_farg1, c_farg2, c_farg3,
993      c_farg4, c_farg5, c_farg6, c_farg7
994    };
995#endif // _WIN64
996
997
998    uint int_args = 0;
999    uint fp_args = 0;
1000    uint stk_args = 0; // inc by 2 each time
1001
1002    for (int i = 0; i < total_args_passed; i++) {
1003      switch (sig_bt[i]) {
1004      case T_BOOLEAN:
1005      case T_CHAR:
1006      case T_BYTE:
1007      case T_SHORT:
1008      case T_INT:
1009        if (int_args < Argument::n_int_register_parameters_c) {
1010          regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1011#ifdef _WIN64
1012          fp_args++;
1013          // Allocate slots for the callee to stuff register args on the stack.
1014          stk_args += 2;
1015#endif
1016        } else {
1017          regs[i].set1(VMRegImpl::stack2reg(stk_args));
1018          stk_args += 2;
1019        }
1020        break;
1021      case T_LONG:
1022        assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1023        // fall through
1024      case T_OBJECT:
1025      case T_ARRAY:
1026      case T_ADDRESS:
1027      case T_METADATA:
1028        if (int_args < Argument::n_int_register_parameters_c) {
1029          regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1030#ifdef _WIN64
1031          fp_args++;
1032          stk_args += 2;
1033#endif
1034        } else {
1035          regs[i].set2(VMRegImpl::stack2reg(stk_args));
1036          stk_args += 2;
1037        }
1038        break;
1039      case T_FLOAT:
1040        if (fp_args < Argument::n_float_register_parameters_c) {
1041          regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1042#ifdef _WIN64
1043          int_args++;
1044          // Allocate slots for the callee to stuff register args on the stack.
1045          stk_args += 2;
1046#endif
1047        } else {
1048          regs[i].set1(VMRegImpl::stack2reg(stk_args));
1049          stk_args += 2;
1050        }
1051        break;
1052      case T_DOUBLE:
1053        assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1054        if (fp_args < Argument::n_float_register_parameters_c) {
1055          regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1056#ifdef _WIN64
1057          int_args++;
1058          // Allocate slots for the callee to stuff register args on the stack.
1059          stk_args += 2;
1060#endif
1061        } else {
1062          regs[i].set2(VMRegImpl::stack2reg(stk_args));
1063          stk_args += 2;
1064        }
1065        break;
1066      case T_VOID: // Halves of longs and doubles
1067        assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1068        regs[i].set_bad();
1069        break;
1070      default:
1071        ShouldNotReachHere();
1072        break;
1073      }
1074    }
1075#ifdef _WIN64
1076  // The Windows ABI requires that we always allocate enough stack space
1077  // for 4 64-bit registers to be stored down.
1078  if (stk_args < 8) {
1079    stk_args = 8;
1080  }
1081#endif // _WIN64
1082
1083  return stk_args;
1084}
1085
1086// On 64-bit we will store integer-like items to the stack as
1087// 64-bit items (as the SPARC ABI does) even though Java would only store
1088// 32 bits for a parameter. On 32-bit it would simply be 32 bits,
1089// so this routine does 32->32 on 32-bit and 32->64 on 64-bit.
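// For instance (illustrative): a Java int of value -1 arriving in a caller stack
// slot is loaded with movslq, so the full 64-bit word written to the outgoing slot
// is 0xFFFFFFFFFFFFFFFF rather than a zero-extended 0x00000000FFFFFFFF.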
1090static void move32_64(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
1091  if (src.first()->is_stack()) {
1092    if (dst.first()->is_stack()) {
1093      // stack to stack
1094      __ movslq(rax, Address(rbp, reg2offset_in(src.first())));
1095      __ movq(Address(rsp, reg2offset_out(dst.first())), rax);
1096    } else {
1097      // stack to reg
1098      __ movslq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first())));
1099    }
1100  } else if (dst.first()->is_stack()) {
1101    // reg to stack
1102    // Do we really have to sign extend???
1103    // __ movslq(src.first()->as_Register(), src.first()->as_Register());
1104    __ movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
1105  } else {
1106    // Do we really have to sign extend???
1107    // __ movslq(dst.first()->as_Register(), src.first()->as_Register());
1108    if (dst.first() != src.first()) {
1109      __ movq(dst.first()->as_Register(), src.first()->as_Register());
1110    }
1111  }
1112}
1113
1114static void move_ptr(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
1115  if (src.first()->is_stack()) {
1116    if (dst.first()->is_stack()) {
1117      // stack to stack
1118      __ movq(rax, Address(rbp, reg2offset_in(src.first())));
1119      __ movq(Address(rsp, reg2offset_out(dst.first())), rax);
1120    } else {
1121      // stack to reg
1122      __ movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first())));
1123    }
1124  } else if (dst.first()->is_stack()) {
1125    // reg to stack
1126    __ movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
1127  } else {
1128    if (dst.first() != src.first()) {
1129      __ movq(dst.first()->as_Register(), src.first()->as_Register());
1130    }
1131  }
1132}
1133
1134// An oop arg. Must pass a handle not the oop itself
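// Roughly (an illustrative sketch): for a non-NULL oop arriving in, say, j_rarg1,
// the oop is spilled into its reserved handle slot in this frame and rHandle becomes
// the address of that slot; for a NULL oop the cmovptr below loads the NULL back out
// of the slot, so the native callee sees a NULL jobject rather than a handle to a
// NULL slot.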
1135static void object_move(MacroAssembler* masm,
1136                        OopMap* map,
1137                        int oop_handle_offset,
1138                        int framesize_in_slots,
1139                        VMRegPair src,
1140                        VMRegPair dst,
1141                        bool is_receiver,
1142                        int* receiver_offset) {
1143
1144  // must pass a handle. First figure out the location we use as a handle
1145
1146  Register rHandle = dst.first()->is_stack() ? rax : dst.first()->as_Register();
1147
1148  // See if the oop is NULL; if it is, we need no handle
1149
1150  if (src.first()->is_stack()) {
1151
1152    // Oop is already on the stack as an argument
1153    int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
1154    map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
1155    if (is_receiver) {
1156      *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
1157    }
1158
1159    __ cmpptr(Address(rbp, reg2offset_in(src.first())), (int32_t)NULL_WORD);
1160    __ lea(rHandle, Address(rbp, reg2offset_in(src.first())));
1161    // conditionally move a NULL
1162    __ cmovptr(Assembler::equal, rHandle, Address(rbp, reg2offset_in(src.first())));
1163  } else {
1164
1165    // Oop is in a register; we must store it to the space we reserve
1166    // on the stack for oop_handles and pass a handle if oop is non-NULL
1167
1168    const Register rOop = src.first()->as_Register();
1169    int oop_slot;
1170    if (rOop == j_rarg0)
1171      oop_slot = 0;
1172    else if (rOop == j_rarg1)
1173      oop_slot = 1;
1174    else if (rOop == j_rarg2)
1175      oop_slot = 2;
1176    else if (rOop == j_rarg3)
1177      oop_slot = 3;
1178    else if (rOop == j_rarg4)
1179      oop_slot = 4;
1180    else {
1181      assert(rOop == j_rarg5, "wrong register");
1182      oop_slot = 5;
1183    }
1184
1185    oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
1186    int offset = oop_slot*VMRegImpl::stack_slot_size;
1187
1188    map->set_oop(VMRegImpl::stack2reg(oop_slot));
1189    // Store oop in handle area, may be NULL
1190    __ movptr(Address(rsp, offset), rOop);
1191    if (is_receiver) {
1192      *receiver_offset = offset;
1193    }
1194
1195    __ cmpptr(rOop, (int32_t)NULL_WORD);
1196    __ lea(rHandle, Address(rsp, offset));
1197    // conditionally move a NULL from the handle area where it was just stored
1198    __ cmovptr(Assembler::equal, rHandle, Address(rsp, offset));
1199  }
1200
1201  // If the arg is on the stack then place it; otherwise it is already in the correct reg.
1202  if (dst.first()->is_stack()) {
1203    __ movptr(Address(rsp, reg2offset_out(dst.first())), rHandle);
1204  }
1205}
1206
1207// A float arg may have to do float reg int reg conversion
1208static void float_move(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
1209  assert(!src.second()->is_valid() && !dst.second()->is_valid(), "bad float_move");
1210
1211  // The calling convention assures us that each VMRegPair is either
1212  // all really one physical register or adjacent stack slots.
1213  // This greatly simplifies the cases here compared to sparc.
1214
1215  if (src.first()->is_stack()) {
1216    if (dst.first()->is_stack()) {
1217      __ movl(rax, Address(rbp, reg2offset_in(src.first())));
1218      __ movptr(Address(rsp, reg2offset_out(dst.first())), rax);
1219    } else {
1220      // stack to reg
1221      assert(dst.first()->is_XMMRegister(), "only expect xmm registers as parameters");
1222      __ movflt(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first())));
1223    }
1224  } else if (dst.first()->is_stack()) {
1225    // reg to stack
1226    assert(src.first()->is_XMMRegister(), "only expect xmm registers as parameters");
1227    __ movflt(Address(rsp, reg2offset_out(dst.first())), src.first()->as_XMMRegister());
1228  } else {
1229    // reg to reg
1230    // In theory these overlap but the ordering is such that this is likely a nop
1231    if ( src.first() != dst.first()) {
1232      __ movdbl(dst.first()->as_XMMRegister(),  src.first()->as_XMMRegister());
1233    }
1234  }
1235}
1236
1237// A long move
1238static void long_move(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
1239
1240  // The calling convention assures us that each VMRegPair is either
1241  // all really one physical register or adjacent stack slots.
1242  // This greatly simplifies the cases here compared to sparc.
1243
1244  if (src.is_single_phys_reg() ) {
1245    if (dst.is_single_phys_reg()) {
1246      if (dst.first() != src.first()) {
1247        __ mov(dst.first()->as_Register(), src.first()->as_Register());
1248      }
1249    } else {
1250      assert(dst.is_single_reg(), "not a stack pair");
1251      __ movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
1252    }
1253  } else if (dst.is_single_phys_reg()) {
1254    assert(src.is_single_reg(),  "not a stack pair");
1255    __ movq(dst.first()->as_Register(), Address(rbp, reg2offset_out(src.first())));
1256  } else {
1257    assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
1258    __ movq(rax, Address(rbp, reg2offset_in(src.first())));
1259    __ movq(Address(rsp, reg2offset_out(dst.first())), rax);
1260  }
1261}
1262
1263// A double move
1264static void double_move(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
1265
1266  // The calling convention assures us that each VMRegPair is either
1267  // all really one physical register or adjacent stack slots.
1268  // This greatly simplifies the cases here compared to sparc.
1269
1270  if (src.is_single_phys_reg() ) {
1271    if (dst.is_single_phys_reg()) {
1272      // In theory these overlap but the ordering is such that this is likely a nop
1273      if ( src.first() != dst.first()) {
1274        __ movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
1275      }
1276    } else {
1277      assert(dst.is_single_reg(), "not a stack pair");
1278      __ movdbl(Address(rsp, reg2offset_out(dst.first())), src.first()->as_XMMRegister());
1279    }
1280  } else if (dst.is_single_phys_reg()) {
1281    assert(src.is_single_reg(),  "not a stack pair");
1282    __ movdbl(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_out(src.first())));
1283  } else {
1284    assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
1285    __ movq(rax, Address(rbp, reg2offset_in(src.first())));
1286    __ movq(Address(rsp, reg2offset_out(dst.first())), rax);
1287  }
1288}
1289
1290
1291void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1292  // We always ignore the frame_slots arg and just use the space just below the frame pointer,
1293  // which by this time is free to use
1294  switch (ret_type) {
1295  case T_FLOAT:
1296    __ movflt(Address(rbp, -wordSize), xmm0);
1297    break;
1298  case T_DOUBLE:
1299    __ movdbl(Address(rbp, -wordSize), xmm0);
1300    break;
1301  case T_VOID:  break;
1302  default: {
1303    __ movptr(Address(rbp, -wordSize), rax);
1304    }
1305  }
1306}
1307
1308void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1309  // We always ignore the frame_slots arg and just use the space just below the frame pointer,
1310  // which by this time is free to use
1311  switch (ret_type) {
1312  case T_FLOAT:
1313    __ movflt(xmm0, Address(rbp, -wordSize));
1314    break;
1315  case T_DOUBLE:
1316    __ movdbl(xmm0, Address(rbp, -wordSize));
1317    break;
1318  case T_VOID:  break;
1319  default: {
1320    __ movptr(rax, Address(rbp, -wordSize));
1321    }
1322  }
1323}
1324
1325static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1326    for ( int i = first_arg ; i < arg_count ; i++ ) {
1327      if (args[i].first()->is_Register()) {
1328        __ push(args[i].first()->as_Register());
1329      } else if (args[i].first()->is_XMMRegister()) {
1330        __ subptr(rsp, 2*wordSize);
1331        __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1332      }
1333    }
1334}
1335
1336static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1337    for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1338      if (args[i].first()->is_Register()) {
1339        __ pop(args[i].first()->as_Register());
1340      } else if (args[i].first()->is_XMMRegister()) {
1341        __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1342        __ addptr(rsp, 2*wordSize);
1343      }
1344    }
1345}
1346
1347
1348static void save_or_restore_arguments(MacroAssembler* masm,
1349                                      const int stack_slots,
1350                                      const int total_in_args,
1351                                      const int arg_save_area,
1352                                      OopMap* map,
1353                                      VMRegPair* in_regs,
1354                                      BasicType* in_sig_bt) {
1355  // if map is non-NULL then the code should store the values,
1356  // otherwise it should load them.
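  // (check_needs_gc_for_critical_native below uses this in matched pairs:
  // first with a fresh OopMap to spill the register args and describe any
  // oops among them, then with a NULL map to reload the args after the
  // runtime call returns.)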
1357  int slot = arg_save_area;
1358  // Save down double word first
1359  for ( int i = 0; i < total_in_args; i++) {
1360    if (in_regs[i].first()->is_XMMRegister() && in_sig_bt[i] == T_DOUBLE) {
1361      int offset = slot * VMRegImpl::stack_slot_size;
1362      slot += VMRegImpl::slots_per_word;
1363      assert(slot <= stack_slots, "overflow");
1364      if (map != NULL) {
1365        __ movdbl(Address(rsp, offset), in_regs[i].first()->as_XMMRegister());
1366      } else {
1367        __ movdbl(in_regs[i].first()->as_XMMRegister(), Address(rsp, offset));
1368      }
1369    }
1370    if (in_regs[i].first()->is_Register() &&
1371        (in_sig_bt[i] == T_LONG || in_sig_bt[i] == T_ARRAY)) {
1372      int offset = slot * VMRegImpl::stack_slot_size;
1373      if (map != NULL) {
1374        __ movq(Address(rsp, offset), in_regs[i].first()->as_Register());
1375        if (in_sig_bt[i] == T_ARRAY) {
1376          map->set_oop(VMRegImpl::stack2reg(slot));
1377        }
1378      } else {
1379        __ movq(in_regs[i].first()->as_Register(), Address(rsp, offset));
1380      }
1381      slot += VMRegImpl::slots_per_word;
1382    }
1383  }
1384  // Save or restore single word registers
1385  for ( int i = 0; i < total_in_args; i++) {
1386    if (in_regs[i].first()->is_Register()) {
1387      int offset = slot * VMRegImpl::stack_slot_size;
1388      slot++;
1389      assert(slot <= stack_slots, "overflow");
1390
1391      // Value is in an input register; we must flush it to the stack
1392      const Register reg = in_regs[i].first()->as_Register();
1393      switch (in_sig_bt[i]) {
1394        case T_BOOLEAN:
1395        case T_CHAR:
1396        case T_BYTE:
1397        case T_SHORT:
1398        case T_INT:
1399          if (map != NULL) {
1400            __ movl(Address(rsp, offset), reg);
1401          } else {
1402            __ movl(reg, Address(rsp, offset));
1403          }
1404          break;
1405        case T_ARRAY:
1406        case T_LONG:
1407          // handled above
1408          break;
1409        case T_OBJECT:
1410        default: ShouldNotReachHere();
1411      }
1412    } else if (in_regs[i].first()->is_XMMRegister()) {
1413      if (in_sig_bt[i] == T_FLOAT) {
1414        int offset = slot * VMRegImpl::stack_slot_size;
1415        slot++;
1416        assert(slot <= stack_slots, "overflow");
1417        if (map != NULL) {
1418          __ movflt(Address(rsp, offset), in_regs[i].first()->as_XMMRegister());
1419        } else {
1420          __ movflt(in_regs[i].first()->as_XMMRegister(), Address(rsp, offset));
1421        }
1422      }
1423    } else if (in_regs[i].first()->is_stack()) {
1424      if (in_sig_bt[i] == T_ARRAY && map != NULL) {
1425        int offset_in_older_frame = in_regs[i].first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
1426        map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + stack_slots));
1427      }
1428    }
1429  }
1430}
1431
1432
1433// Check GCLocker::needs_gc and enter the runtime if it's true.  This
1434// keeps a new JNI critical region from starting until a GC has been
1435// forced.  Save down any oops in registers and describe them in an
1436// OopMap.
1437static void check_needs_gc_for_critical_native(MacroAssembler* masm,
1438                                               int stack_slots,
1439                                               int total_c_args,
1440                                               int total_in_args,
1441                                               int arg_save_area,
1442                                               OopMapSet* oop_maps,
1443                                               VMRegPair* in_regs,
1444                                               BasicType* in_sig_bt) {
1445  __ block_comment("check GCLocker::needs_gc");
1446  Label cont;
1447  __ cmp8(ExternalAddress((address)GCLocker::needs_gc_address()), false);
1448  __ jcc(Assembler::equal, cont);
1449
1450  // Save down any incoming oops and call into the runtime to halt for a GC
1451
1452  OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1453  save_or_restore_arguments(masm, stack_slots, total_in_args,
1454                            arg_save_area, map, in_regs, in_sig_bt);
1455
1456  address the_pc = __ pc();
1457  oop_maps->add_gc_map( __ offset(), map);
1458  __ set_last_Java_frame(rsp, noreg, the_pc);
1459
1460  __ block_comment("block_for_jni_critical");
1461  __ movptr(c_rarg0, r15_thread);
1462  __ mov(r12, rsp); // remember sp
1463  __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
1464  __ andptr(rsp, -16); // align stack as required by ABI
1465  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::block_for_jni_critical)));
1466  __ mov(rsp, r12); // restore sp
1467  __ reinit_heapbase();
1468
1469  __ reset_last_Java_frame(false);
1470
1471  save_or_restore_arguments(masm, stack_slots, total_in_args,
1472                            arg_save_area, NULL, in_regs, in_sig_bt);
1473  __ bind(cont);
1474#ifdef ASSERT
1475  if (StressCriticalJNINatives) {
1476    // Stress register saving
1477    OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1478    save_or_restore_arguments(masm, stack_slots, total_in_args,
1479                              arg_save_area, map, in_regs, in_sig_bt);
1480    // Destroy argument registers
1481    for (int i = 0; i < total_in_args - 1; i++) {
1482      if (in_regs[i].first()->is_Register()) {
1483        const Register reg = in_regs[i].first()->as_Register();
1484        __ xorptr(reg, reg);
1485      } else if (in_regs[i].first()->is_XMMRegister()) {
1486        __ xorpd(in_regs[i].first()->as_XMMRegister(), in_regs[i].first()->as_XMMRegister());
1487      } else if (in_regs[i].first()->is_FloatRegister()) {
1488        ShouldNotReachHere();
1489      } else if (in_regs[i].first()->is_stack()) {
1490        // Nothing to do
1491      } else {
1492        ShouldNotReachHere();
1493      }
1494      if (in_sig_bt[i] == T_LONG || in_sig_bt[i] == T_DOUBLE) {
1495        i++;
1496      }
1497    }
1498
1499    save_or_restore_arguments(masm, stack_slots, total_in_args,
1500                              arg_save_area, NULL, in_regs, in_sig_bt);
1501  }
1502#endif
1503}
1504
1505// Unpack an array argument into a pointer to the body and the length
1506// if the array is non-null, otherwise pass 0 for both.
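// For example, a non-null jintArray of length 3 is handed to the critical
// native as the pair (3, pointer-to-first-element); a NULL array is passed
// as (0, NULL).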
1507static void unpack_array_argument(MacroAssembler* masm, VMRegPair reg, BasicType in_elem_type, VMRegPair body_arg, VMRegPair length_arg) {
1508  Register tmp_reg = rax;
1509  assert(!body_arg.first()->is_Register() || body_arg.first()->as_Register() != tmp_reg,
1510         "possible collision");
1511  assert(!length_arg.first()->is_Register() || length_arg.first()->as_Register() != tmp_reg,
1512         "possible collision");
1513
1514  __ block_comment("unpack_array_argument {");
1515
1516  // Pass the length, ptr pair
1517  Label is_null, done;
1518  VMRegPair tmp;
1519  tmp.set_ptr(tmp_reg->as_VMReg());
1520  if (reg.first()->is_stack()) {
1521    // Load the arg up from the stack
1522    move_ptr(masm, reg, tmp);
1523    reg = tmp;
1524  }
1525  __ testptr(reg.first()->as_Register(), reg.first()->as_Register());
1526  __ jccb(Assembler::equal, is_null);
1527  __ lea(tmp_reg, Address(reg.first()->as_Register(), arrayOopDesc::base_offset_in_bytes(in_elem_type)));
1528  move_ptr(masm, tmp, body_arg);
1529  // load the length relative to the body.
1530  __ movl(tmp_reg, Address(tmp_reg, arrayOopDesc::length_offset_in_bytes() -
1531                           arrayOopDesc::base_offset_in_bytes(in_elem_type)));
1532  move32_64(masm, tmp, length_arg);
1533  __ jmpb(done);
1534  __ bind(is_null);
1535  // Pass zeros
1536  __ xorptr(tmp_reg, tmp_reg);
1537  move_ptr(masm, tmp, body_arg);
1538  move32_64(masm, tmp, length_arg);
1539  __ bind(done);
1540
1541  __ block_comment("} unpack_array_argument");
1542}
1543
1544
1545// Different signatures may require very different orders for the moves
1546// to avoid clobbering other arguments.  There's no simple way to
1547// order them safely.  Compute a safe order for issuing stores and
1548// break any cycles in those stores.  This code is fairly general but
1549// it's not necessary on the other platforms so we keep it in the
1550// platform dependent code instead of moving it into a shared file.
1551// (See bugs 7013347 & 7145024.)
1552// Note that this code is specific to LP64.
1553class ComputeMoveOrder: public StackObj {
1554  class MoveOperation: public ResourceObj {
1555    friend class ComputeMoveOrder;
1556   private:
1557    VMRegPair        _src;
1558    VMRegPair        _dst;
1559    int              _src_index;
1560    int              _dst_index;
1561    bool             _processed;
1562    MoveOperation*  _next;
1563    MoveOperation*  _prev;
1564
1565    static int get_id(VMRegPair r) {
1566      return r.first()->value();
1567    }
1568
1569   public:
1570    MoveOperation(int src_index, VMRegPair src, int dst_index, VMRegPair dst):
1571      _src(src)
1572    , _src_index(src_index)
1573    , _dst(dst)
1574    , _dst_index(dst_index)
1575    , _next(NULL)
1576    , _prev(NULL)
1577    , _processed(false) {
1578    }
1579
1580    VMRegPair src() const              { return _src; }
1581    int src_id() const                 { return get_id(src()); }
1582    int src_index() const              { return _src_index; }
1583    VMRegPair dst() const              { return _dst; }
1584    void set_dst(int i, VMRegPair dst) { _dst_index = i, _dst = dst; }
1585    int dst_index() const              { return _dst_index; }
1586    int dst_id() const                 { return get_id(dst()); }
1587    MoveOperation* next() const       { return _next; }
1588    MoveOperation* prev() const       { return _prev; }
1589    void set_processed()               { _processed = true; }
1590    bool is_processed() const          { return _processed; }
1591
1592    // insert
1593    void break_cycle(VMRegPair temp_register) {
1594      // create a new store following the last store
1595      // to move from the temp_register to the original
1596      MoveOperation* new_store = new MoveOperation(-1, temp_register, dst_index(), dst());
1597
1598      // break the cycle of links and insert new_store at the end
1599      // break the reverse link.
1600      MoveOperation* p = prev();
1601      assert(p->next() == this, "must be");
1602      _prev = NULL;
1603      p->_next = new_store;
1604      new_store->_prev = p;
1605
1606      // change the original store to save its value in the temp.
1607      set_dst(-1, temp_register);
1608    }
1609
1610    void link(GrowableArray<MoveOperation*>& killer) {
1611      // link this store in front of the store that it depends on
1612      MoveOperation* n = killer.at_grow(src_id(), NULL);
1613      if (n != NULL) {
1614        assert(_next == NULL && n->_prev == NULL, "shouldn't have been set yet");
1615        _next = n;
1616        n->_prev = this;
1617      }
1618    }
1619  };
1620
1621 private:
1622  GrowableArray<MoveOperation*> edges;
1623
1624 public:
1625  ComputeMoveOrder(int total_in_args, VMRegPair* in_regs, int total_c_args, VMRegPair* out_regs,
1626                    BasicType* in_sig_bt, GrowableArray<int>& arg_order, VMRegPair tmp_vmreg) {
1627    // Move operations where the dest is the stack can all be
1628    // scheduled first since they can't interfere with the other moves.
1629    for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
1630      if (in_sig_bt[i] == T_ARRAY) {
1631        c_arg--;
1632        if (out_regs[c_arg].first()->is_stack() &&
1633            out_regs[c_arg + 1].first()->is_stack()) {
1634          arg_order.push(i);
1635          arg_order.push(c_arg);
1636        } else {
1637          if (out_regs[c_arg].first()->is_stack() ||
1638              in_regs[i].first() == out_regs[c_arg].first()) {
1639            add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg + 1]);
1640          } else {
1641            add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
1642          }
1643        }
1644      } else if (in_sig_bt[i] == T_VOID) {
1645        arg_order.push(i);
1646        arg_order.push(c_arg);
1647      } else {
1648        if (out_regs[c_arg].first()->is_stack() ||
1649            in_regs[i].first() == out_regs[c_arg].first()) {
1650          arg_order.push(i);
1651          arg_order.push(c_arg);
1652        } else {
1653          add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
1654        }
1655      }
1656    }
1657    // Break any cycles in the register moves and emit them in the
1658    // proper order.
1659    GrowableArray<MoveOperation*>* stores = get_store_order(tmp_vmreg);
1660    for (int i = 0; i < stores->length(); i++) {
1661      arg_order.push(stores->at(i)->src_index());
1662      arg_order.push(stores->at(i)->dst_index());
1663    }
1664 }
1665
1666  // Collect all the move operations
1667  void add_edge(int src_index, VMRegPair src, int dst_index, VMRegPair dst) {
1668    if (src.first() == dst.first()) return;
1669    edges.append(new MoveOperation(src_index, src, dst_index, dst));
1670  }
1671
1672  // Walk the edges breaking cycles between moves.  The result list
1673  // can be walked in order to produce the proper set of loads
1674  GrowableArray<MoveOperation*>* get_store_order(VMRegPair temp_register) {
1675    // Record which moves kill which values
1676    GrowableArray<MoveOperation*> killer;
1677    for (int i = 0; i < edges.length(); i++) {
1678      MoveOperation* s = edges.at(i);
1679      assert(killer.at_grow(s->dst_id(), NULL) == NULL, "only one killer");
1680      killer.at_put_grow(s->dst_id(), s, NULL);
1681    }
1682    assert(killer.at_grow(MoveOperation::get_id(temp_register), NULL) == NULL,
1683           "make sure temp isn't in the registers that are killed");
1684
1685    // create links between loads and stores
1686    for (int i = 0; i < edges.length(); i++) {
1687      edges.at(i)->link(killer);
1688    }
1689
1690    // at this point, all the move operations are chained together
1691    // in a doubly linked list.  Processing it backwards finds
1692    // the beginning of the chain, forwards finds the end.  If there's
1693    // a cycle it can be broken at any point,  so pick an edge and walk
1694    // backward until the list ends or we end where we started.
1695    GrowableArray<MoveOperation*>* stores = new GrowableArray<MoveOperation*>();
1696    for (int e = 0; e < edges.length(); e++) {
1697      MoveOperation* s = edges.at(e);
1698      if (!s->is_processed()) {
1699        MoveOperation* start = s;
1700        // search for the beginning of the chain or cycle
1701        while (start->prev() != NULL && start->prev() != s) {
1702          start = start->prev();
1703        }
1704        if (start->prev() == s) {
1705          start->break_cycle(temp_register);
1706        }
1707        // walk the chain forward inserting to store list
1708        while (start != NULL) {
1709          stores->append(start);
1710          start->set_processed();
1711          start = start->next();
1712        }
1713      }
1714    }
1715    return stores;
1716  }
1717};
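// A minimal, self-contained sketch (not used by the VM) of the idea behind
// ComputeMoveOrder::break_cycle: when the desired moves form a cycle such as
// a->b and b->a, one store is redirected into a scratch value first and a
// final store from the scratch is appended, after which the stores can be
// issued in order.  The function below is invented for illustration only.
static inline bool illustrate_break_cycle() {
  long a = 10, b = 20;        // stand-ins for two registers holding live values
  // desired moves: a -> b and b -> a (a two-element cycle)
  long scratch = b;           // redirected store: save the value that would be clobbered
  b = a;                      // now safe: b's old value lives in scratch
  a = scratch;                // appended store drains the scratch back out
  return a == 20 && b == 10;  // both logical moves took effect
}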
1718
1719static void verify_oop_args(MacroAssembler* masm,
1720                            const methodHandle& method,
1721                            const BasicType* sig_bt,
1722                            const VMRegPair* regs) {
1723  Register temp_reg = rbx;  // not part of any compiled calling seq
1724  if (VerifyOops) {
1725    for (int i = 0; i < method->size_of_parameters(); i++) {
1726      if (sig_bt[i] == T_OBJECT ||
1727          sig_bt[i] == T_ARRAY) {
1728        VMReg r = regs[i].first();
1729        assert(r->is_valid(), "bad oop arg");
1730        if (r->is_stack()) {
1731          __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1732          __ verify_oop(temp_reg);
1733        } else {
1734          __ verify_oop(r->as_Register());
1735        }
1736      }
1737    }
1738  }
1739}
1740
1741static void gen_special_dispatch(MacroAssembler* masm,
1742                                 methodHandle method,
1743                                 const BasicType* sig_bt,
1744                                 const VMRegPair* regs) {
1745  verify_oop_args(masm, method, sig_bt, regs);
1746  vmIntrinsics::ID iid = method->intrinsic_id();
1747
1748  // Now write the args into the outgoing interpreter space
1749  bool     has_receiver   = false;
1750  Register receiver_reg   = noreg;
1751  int      member_arg_pos = -1;
1752  Register member_reg     = noreg;
1753  int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1754  if (ref_kind != 0) {
1755    member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1756    member_reg = rbx;  // known to be free at this point
1757    has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1758  } else if (iid == vmIntrinsics::_invokeBasic) {
1759    has_receiver = true;
1760  } else {
1761    fatal("unexpected intrinsic id %d", iid);
1762  }
1763
1764  if (member_reg != noreg) {
1765    // Load the member_arg into register, if necessary.
1766    SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1767    VMReg r = regs[member_arg_pos].first();
1768    if (r->is_stack()) {
1769      __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1770    } else {
1771      // no data motion is needed
1772      member_reg = r->as_Register();
1773    }
1774  }
1775
1776  if (has_receiver) {
1777    // Make sure the receiver is loaded into a register.
1778    assert(method->size_of_parameters() > 0, "oob");
1779    assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1780    VMReg r = regs[0].first();
1781    assert(r->is_valid(), "bad receiver arg");
1782    if (r->is_stack()) {
1783      // Porting note:  This assumes that compiled calling conventions always
1784      // pass the receiver oop in a register.  If this is not true on some
1785      // platform, pick a temp and load the receiver from stack.
1786      fatal("receiver always in a register");
1787      receiver_reg = j_rarg0;  // known to be free at this point
1788      __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1789    } else {
1790      // no data motion is needed
1791      receiver_reg = r->as_Register();
1792    }
1793  }
1794
1795  // Figure out which address we are really jumping to:
1796  MethodHandles::generate_method_handle_dispatch(masm, iid,
1797                                                 receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1798}
1799
1800// ---------------------------------------------------------------------------
1801// Generate a native wrapper for a given method.  The method takes arguments
1802// in the Java compiled code convention, marshals them to the native
1803// convention (handlizes oops, etc), transitions to native, makes the call,
1804// returns to java state (possibly blocking), unhandlizes any result and
1805// returns.
1806//
1807// Critical native functions are a shorthand for the use of
1808// GetPrimitiveArrayCritical and disallow the use of any other JNI
1809// functions.  The wrapper is expected to unpack the arguments before
1810// passing them to the callee and perform checks before and after the
1811// native call to ensure that the GCLocker
1812// lock_critical/unlock_critical semantics are followed.  Some other
1813// parts of JNI setup are skipped, like the tear down of the JNI handle
1814// block and the check for pending exceptions, since it's impossible for them
1815// to be thrown.
1816//
1817// They are roughly structured like this:
1818//    if (GCLocker::needs_gc())
1819//      SharedRuntime::block_for_jni_critical();
1820//    transition to thread_in_native
1821//    unpack array arguments and call native entry point
1822//    check for safepoint in progress
1823//    check if any thread suspend flags are set
1824//      call into JVM and possibly unlock the JNI critical
1825//      if a GC was suppressed while in the critical native.
1826//    transition back to thread_in_Java
1827//    return to caller
1828//
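// For illustration only (package and names invented): a Java method declared
//     static native int sum(byte[] data);
// is normally entered through a JNI function shaped roughly like
//     jint JNICALL Java_pkg_Cls_sum(JNIEnv* env, jclass cls, jbyteArray data);
// whereas its critical counterpart drops the JNIEnv/jclass arguments and
// receives each array as a (length, element pointer) pair, roughly
//     jint JNICALL JavaCritical_pkg_Cls_sum(jint len, jbyte* data);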
1829nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1830                                                const methodHandle& method,
1831                                                int compile_id,
1832                                                BasicType* in_sig_bt,
1833                                                VMRegPair* in_regs,
1834                                                BasicType ret_type) {
1835  if (method->is_method_handle_intrinsic()) {
1836    vmIntrinsics::ID iid = method->intrinsic_id();
1837    intptr_t start = (intptr_t)__ pc();
1838    int vep_offset = ((intptr_t)__ pc()) - start;
1839    gen_special_dispatch(masm,
1840                         method,
1841                         in_sig_bt,
1842                         in_regs);
1843    int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
1844    __ flush();
1845    int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
1846    return nmethod::new_native_nmethod(method,
1847                                       compile_id,
1848                                       masm->code(),
1849                                       vep_offset,
1850                                       frame_complete,
1851                                       stack_slots / VMRegImpl::slots_per_word,
1852                                       in_ByteSize(-1),
1853                                       in_ByteSize(-1),
1854                                       (OopMapSet*)NULL);
1855  }
1856  bool is_critical_native = true;
1857  address native_func = method->critical_native_function();
1858  if (native_func == NULL) {
1859    native_func = method->native_function();
1860    is_critical_native = false;
1861  }
1862  assert(native_func != NULL, "must have function");
1863
1864  // An OopMap for lock (and class if static)
1865  OopMapSet *oop_maps = new OopMapSet();
1866  intptr_t start = (intptr_t)__ pc();
1867
1868  // We have received a description of where all the java args are located
1869  // on entry to the wrapper. We need to convert these args to where
1870  // the jni function will expect them. To figure out where they go
1871  // we convert the java signature to a C signature by inserting
1872  // the hidden arguments as arg[0] and possibly arg[1] (static method)
1873
1874  const int total_in_args = method->size_of_parameters();
1875  int total_c_args = total_in_args;
1876  if (!is_critical_native) {
1877    total_c_args += 1;
1878    if (method->is_static()) {
1879      total_c_args++;
1880    }
1881  } else {
1882    for (int i = 0; i < total_in_args; i++) {
1883      if (in_sig_bt[i] == T_ARRAY) {
1884        total_c_args++;
1885      }
1886    }
1887  }
1888
1889  BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1890  VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1891  BasicType* in_elem_bt = NULL;
1892
1893  int argc = 0;
1894  if (!is_critical_native) {
1895    out_sig_bt[argc++] = T_ADDRESS;
1896    if (method->is_static()) {
1897      out_sig_bt[argc++] = T_OBJECT;
1898    }
1899
1900    for (int i = 0; i < total_in_args ; i++ ) {
1901      out_sig_bt[argc++] = in_sig_bt[i];
1902    }
1903  } else {
1904    Thread* THREAD = Thread::current();
1905    in_elem_bt = NEW_RESOURCE_ARRAY(BasicType, total_in_args);
1906    SignatureStream ss(method->signature());
1907    for (int i = 0; i < total_in_args ; i++ ) {
1908      if (in_sig_bt[i] == T_ARRAY) {
1909        // Arrays are passed as int, elem* pair
1910        out_sig_bt[argc++] = T_INT;
1911        out_sig_bt[argc++] = T_ADDRESS;
1912        Symbol* atype = ss.as_symbol(CHECK_NULL);
1913        const char* at = atype->as_C_string();
1914        if (strlen(at) == 2) {
1915          assert(at[0] == '[', "must be");
1916          switch (at[1]) {
1917            case 'B': in_elem_bt[i]  = T_BYTE; break;
1918            case 'C': in_elem_bt[i]  = T_CHAR; break;
1919            case 'D': in_elem_bt[i]  = T_DOUBLE; break;
1920            case 'F': in_elem_bt[i]  = T_FLOAT; break;
1921            case 'I': in_elem_bt[i]  = T_INT; break;
1922            case 'J': in_elem_bt[i]  = T_LONG; break;
1923            case 'S': in_elem_bt[i]  = T_SHORT; break;
1924            case 'Z': in_elem_bt[i]  = T_BOOLEAN; break;
1925            default: ShouldNotReachHere();
1926          }
1927        }
1928      } else {
1929        out_sig_bt[argc++] = in_sig_bt[i];
1930        in_elem_bt[i] = T_VOID;
1931      }
1932      if (in_sig_bt[i] != T_VOID) {
1933        assert(in_sig_bt[i] == ss.type(), "must match");
1934        ss.next();
1935      }
1936    }
1937  }
1938
1939  // Now figure out where the args must be stored and how much stack space
1940  // they require.
1941  int out_arg_slots;
1942  out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args);
1943
1944  // Compute framesize for the wrapper.  We need to handlize all oops in
1945  // incoming registers
1946
1947  // Calculate the total number of stack slots we will need.
1948
1949  // First count the abi requirement plus all of the outgoing args
1950  int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1951
1952  // Now the space for the inbound oop handle area
1953  int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
1954  if (is_critical_native) {
1955    // Critical natives may have to call out so they need a save area
1956    // for register arguments.
1957    int double_slots = 0;
1958    int single_slots = 0;
1959    for ( int i = 0; i < total_in_args; i++) {
1960      if (in_regs[i].first()->is_Register()) {
1961        const Register reg = in_regs[i].first()->as_Register();
1962        switch (in_sig_bt[i]) {
1963          case T_BOOLEAN:
1964          case T_BYTE:
1965          case T_SHORT:
1966          case T_CHAR:
1967          case T_INT:  single_slots++; break;
1968          case T_ARRAY:  // specific to LP64 (7145024)
1969          case T_LONG: double_slots++; break;
1970          default:  ShouldNotReachHere();
1971        }
1972      } else if (in_regs[i].first()->is_XMMRegister()) {
1973        switch (in_sig_bt[i]) {
1974          case T_FLOAT:  single_slots++; break;
1975          case T_DOUBLE: double_slots++; break;
1976          default:  ShouldNotReachHere();
1977        }
1978      } else if (in_regs[i].first()->is_FloatRegister()) {
1979        ShouldNotReachHere();
1980      }
1981    }
1982    total_save_slots = double_slots * 2 + single_slots;
1983    // align the save area
1984    if (double_slots != 0) {
1985      stack_slots = align_up(stack_slots, 2);
1986    }
1987  }
1988
1989  int oop_handle_offset = stack_slots;
1990  stack_slots += total_save_slots;
1991
1992  // Now any space we need for handlizing a klass if static method
1993
1994  int klass_slot_offset = 0;
1995  int klass_offset = -1;
1996  int lock_slot_offset = 0;
1997  bool is_static = false;
1998
1999  if (method->is_static()) {
2000    klass_slot_offset = stack_slots;
2001    stack_slots += VMRegImpl::slots_per_word;
2002    klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
2003    is_static = true;
2004  }
2005
2006  // Plus a lock if needed
2007
2008  if (method->is_synchronized()) {
2009    lock_slot_offset = stack_slots;
2010    stack_slots += VMRegImpl::slots_per_word;
2011  }
2012
2013  // Now a place (+2) to save return values or temp during shuffling
2014  // + 4 for return address (which we own) and saved rbp
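  // (6 slots of 4 bytes = 24 bytes: one 8-byte word for the spilled return
  // value at rbp - wordSize, plus one word each for the return address and
  // the saved rbp)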
2015  stack_slots += 6;
2016
2017  // Ok, the space we have allocated will look like:
2018  //
2019  //
2020  // FP-> |                     |
2021  //      |---------------------|
2022  //      | 2 slots for moves   |
2023  //      |---------------------|
2024  //      | lock box (if sync)  |
2025  //      |---------------------| <- lock_slot_offset
2026  //      | klass (if static)   |
2027  //      |---------------------| <- klass_slot_offset
2028  //      | oopHandle area      |
2029  //      |---------------------| <- oop_handle_offset (6 java arg registers)
2030  //      | outbound memory     |
2031  //      | based arguments     |
2032  //      |                     |
2033  //      |---------------------|
2034  //      |                     |
2035  // SP-> | out_preserved_slots |
2036  //
2037  //
2038
2039
2040  // Now compute the actual number of stack words we need, rounding to keep the
2041  // stack properly aligned.
2042  stack_slots = align_up(stack_slots, StackAlignmentInSlots);
2043
2044  int stack_size = stack_slots * VMRegImpl::stack_slot_size;
2045
2046  // First thing make an ic check to see if we should even be here
2047
2048  // We are free to use all registers as temps without saving them and
2049  // restoring them except rbp. rbp is the only callee save register
2050  // as far as the interpreter and the compiler(s) are concerned.
2051
2052
2053  const Register ic_reg = rax;
2054  const Register receiver = j_rarg0;
2055
2056  Label hit;
2057  Label exception_pending;
2058
2059  assert_different_registers(ic_reg, receiver, rscratch1);
2060  __ verify_oop(receiver);
2061  __ load_klass(rscratch1, receiver);
2062  __ cmpq(ic_reg, rscratch1);
2063  __ jcc(Assembler::equal, hit);
2064
2065  __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
2066
2067  // Verified entry point must be aligned
2068  __ align(8);
2069
2070  __ bind(hit);
2071
2072  int vep_offset = ((intptr_t)__ pc()) - start;
2073
2074#ifdef COMPILER1
2075  // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
2076  if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
2077    inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
2078  }
2079#endif // COMPILER1
2080
2081  // The instruction at the verified entry point must be 5 bytes or longer
2082  // because it can be patched on the fly by make_non_entrant. The stack bang
2083  // instruction fits that requirement.
2084
2085  // Generate stack overflow check
2086
2087  if (UseStackBanging) {
2088    __ bang_stack_with_offset((int)JavaThread::stack_shadow_zone_size());
2089  } else {
2090    // need a 5 byte instruction to allow MT safe patching to non-entrant
2091    __ fat_nop();
2092  }
2093
2094  // Generate a new frame for the wrapper.
2095  __ enter();
2096  // -2 because return address is already present and so is saved rbp
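  // (enter() has already pushed the saved rbp and the caller pushed the
  // return address, so those two words of stack_size are already in place)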
2097  __ subptr(rsp, stack_size - 2*wordSize);
2098
2099  // Frame is now completed as far as size and linkage.
2100  int frame_complete = ((intptr_t)__ pc()) - start;
2101
2102    if (UseRTMLocking) {
2103      // Abort RTM transaction before calling JNI
2104      // because critical section will be large and will be
2105      // aborted anyway. Also nmethod could be deoptimized.
2106      __ xabort(0);
2107    }
2108
2109#ifdef ASSERT
2110    {
2111      Label L;
2112      __ mov(rax, rsp);
2113      __ andptr(rax, -16); // must be 16 byte boundary (see amd64 ABI)
2114      __ cmpptr(rax, rsp);
2115      __ jcc(Assembler::equal, L);
2116      __ stop("improperly aligned stack");
2117      __ bind(L);
2118    }
2119#endif /* ASSERT */
2120
2121
2122  // We use r14 as the oop handle for the receiver/klass
2123  // It is callee save so it survives the call to native
2124
2125  const Register oop_handle_reg = r14;
2126
2127  if (is_critical_native) {
2128    check_needs_gc_for_critical_native(masm, stack_slots, total_c_args, total_in_args,
2129                                       oop_handle_offset, oop_maps, in_regs, in_sig_bt);
2130  }
2131
2132  //
2133  // We immediately shuffle the arguments so that for any vm call we have to
2134  // make from here on out (sync slow path, jvmti, etc.) we will have
2135  // captured the oops from our caller and have a valid oopMap for
2136  // them.
2137
2138  // -----------------
2139  // The Grand Shuffle
2140
2141  // The Java calling convention is either equal (linux) or denser (win64) than the
2142  // c calling convention. However, because of the jni_env argument the c calling
2143  // convention always has at least one more (and two for static) arguments than Java.
2144  // Therefore if we move the args from java -> c backwards then we will never have
2145  // a register->register conflict and we don't have to build a dependency graph
2146  // and figure out how to break any cycles.
2147  //
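  // For example, on Linux the first Java integer args arrive in
  // rsi, rdx, rcx, r8, r9 (j_rarg0..j_rarg4) while the C args go in
  // rdi, rsi, rdx, rcx, r8, r9 (c_rarg0..c_rarg5); with the JNIEnv* in
  // c_rarg0, Java arg k of a non-static method simply shifts to C arg k+1,
  // so walking the args from last to first never reads a register that has
  // already been overwritten.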
2148
2149  // Record esp-based slot for receiver on stack for non-static methods
2150  int receiver_offset = -1;
2151
2152  // This is a trick. We double the stack slots so we can claim
2153  // the oops in the caller's frame. Since we are sure to have
2154  // more args than the caller, doubling is enough to make
2155  // sure we can capture all the incoming oop args from the
2156  // caller.
2157  //
2158  OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2159
2160  // Mark location of rbp (someday)
2161  // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
2162
2163  // Use eax, ebx as temporaries during any memory-memory moves we have to do
2164  // All inbound args are referenced based on rbp and all outbound args via rsp.
2165
2166
2167#ifdef ASSERT
2168  bool reg_destroyed[RegisterImpl::number_of_registers];
2169  bool freg_destroyed[XMMRegisterImpl::number_of_registers];
2170  for ( int r = 0 ; r < RegisterImpl::number_of_registers ; r++ ) {
2171    reg_destroyed[r] = false;
2172  }
2173  for ( int f = 0 ; f < XMMRegisterImpl::number_of_registers ; f++ ) {
2174    freg_destroyed[f] = false;
2175  }
2176
2177#endif /* ASSERT */
2178
2179  // This may iterate in two different directions depending on the
2180  // kind of native it is.  The reason is that for regular JNI natives
2181  // the incoming and outgoing registers are offset upwards and for
2182  // critical natives they are offset down.
2183  GrowableArray<int> arg_order(2 * total_in_args);
2184  VMRegPair tmp_vmreg;
2185  tmp_vmreg.set1(rbx->as_VMReg());
2186
2187  if (!is_critical_native) {
2188    for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2189      arg_order.push(i);
2190      arg_order.push(c_arg);
2191    }
2192  } else {
2193    // Compute a valid move order, using tmp_vmreg to break any cycles
2194    ComputeMoveOrder cmo(total_in_args, in_regs, total_c_args, out_regs, in_sig_bt, arg_order, tmp_vmreg);
2195  }
2196
2197  int temploc = -1;
2198  for (int ai = 0; ai < arg_order.length(); ai += 2) {
2199    int i = arg_order.at(ai);
2200    int c_arg = arg_order.at(ai + 1);
2201    __ block_comment(err_msg("move %d -> %d", i, c_arg));
2202    if (c_arg == -1) {
2203      assert(is_critical_native, "should only be required for critical natives");
2204      // This arg needs to be moved to a temporary
2205      __ mov(tmp_vmreg.first()->as_Register(), in_regs[i].first()->as_Register());
2206      in_regs[i] = tmp_vmreg;
2207      temploc = i;
2208      continue;
2209    } else if (i == -1) {
2210      assert(is_critical_native, "should only be required for critical natives");
2211      // Read from the temporary location
2212      assert(temploc != -1, "must be valid");
2213      i = temploc;
2214      temploc = -1;
2215    }
2216#ifdef ASSERT
2217    if (in_regs[i].first()->is_Register()) {
2218      assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2219    } else if (in_regs[i].first()->is_XMMRegister()) {
2220      assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2221    }
2222    if (out_regs[c_arg].first()->is_Register()) {
2223      reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2224    } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2225      freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2226    }
2227#endif /* ASSERT */
2228    switch (in_sig_bt[i]) {
2229      case T_ARRAY:
2230        if (is_critical_native) {
2231          unpack_array_argument(masm, in_regs[i], in_elem_bt[i], out_regs[c_arg + 1], out_regs[c_arg]);
2232          c_arg++;
2233#ifdef ASSERT
2234          if (out_regs[c_arg].first()->is_Register()) {
2235            reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2236          } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2237            freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2238          }
2239#endif
2240          break;
2241        }
2242      case T_OBJECT:
2243        assert(!is_critical_native, "no oop arguments");
2244        object_move(masm, map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2245                    ((i == 0) && (!is_static)),
2246                    &receiver_offset);
2247        break;
2248      case T_VOID:
2249        break;
2250
2251      case T_FLOAT:
2252        float_move(masm, in_regs[i], out_regs[c_arg]);
2253          break;
2254
2255      case T_DOUBLE:
2256        assert( i + 1 < total_in_args &&
2257                in_sig_bt[i + 1] == T_VOID &&
2258                out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2259        double_move(masm, in_regs[i], out_regs[c_arg]);
2260        break;
2261
2262      case T_LONG :
2263        long_move(masm, in_regs[i], out_regs[c_arg]);
2264        break;
2265
2266      case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2267
2268      default:
2269        move32_64(masm, in_regs[i], out_regs[c_arg]);
2270    }
2271  }
2272
2273  int c_arg;
2274
2275  // Pre-load a static method's oop into r14.  Used both by locking code and
2276  // the normal JNI call code.
2277  if (!is_critical_native) {
2278    // point c_arg at the first arg that is already loaded in case we
2279    // need to spill before we call out
2280    c_arg = total_c_args - total_in_args;
2281
2282    if (method->is_static()) {
2283
2284      //  load oop into a register
2285      __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2286
2287      // Now handlize the static class mirror; it's known not-null.
2288      __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2289      map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2290
2291      // Now get the handle
2292      __ lea(oop_handle_reg, Address(rsp, klass_offset));
2293      // store the klass handle as second argument
2294      __ movptr(c_rarg1, oop_handle_reg);
2295      // and protect the arg if we must spill
2296      c_arg--;
2297    }
2298  } else {
2299    // For JNI critical methods we need to save all registers in save_args.
2300    c_arg = 0;
2301  }
2302
2303  // Change state to native (we save the return address in the thread, since it might not
2304  // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2305  // points into the right code segment. It does not have to be the correct return pc.
2306  // We use the same pc/oopMap repeatedly when we call out
2307
2308  intptr_t the_pc = (intptr_t) __ pc();
2309  oop_maps->add_gc_map(the_pc - start, map);
2310
2311  __ set_last_Java_frame(rsp, noreg, (address)the_pc);
2312
2313
2314  // We have all of the arguments set up at this point. We must not touch any register
2315  // argument registers at this point (if we had to save/restore them, no oopMap would describe the oops they hold).
2316
2317  {
2318    SkipIfEqual skip(masm, &DTraceMethodProbes, false);
2319    // protect the args we've loaded
2320    save_args(masm, total_c_args, c_arg, out_regs);
2321    __ mov_metadata(c_rarg1, method());
2322    __ call_VM_leaf(
2323      CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2324      r15_thread, c_rarg1);
2325    restore_args(masm, total_c_args, c_arg, out_regs);
2326  }
2327
2328  // RedefineClasses() tracing support for obsolete method entry
2329  if (log_is_enabled(Trace, redefine, class, obsolete)) {
2330    // protect the args we've loaded
2331    save_args(masm, total_c_args, c_arg, out_regs);
2332    __ mov_metadata(c_rarg1, method());
2333    __ call_VM_leaf(
2334      CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2335      r15_thread, c_rarg1);
2336    restore_args(masm, total_c_args, c_arg, out_regs);
2337  }
2338
2339  // Lock a synchronized method
2340
2341  // Register definitions used by locking and unlocking
2342
2343  const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2344  const Register obj_reg  = rbx;  // Will contain the oop
2345  const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2346  const Register old_hdr  = r13;  // value of old header at unlock time
2347
2348  Label slow_path_lock;
2349  Label lock_done;
2350
2351  if (method->is_synchronized()) {
2352    assert(!is_critical_native, "unhandled");
2353
2354
2355    const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2356
2357    // Get the handle (the 2nd argument)
2358    __ mov(oop_handle_reg, c_rarg1);
2359
2360    // Get address of the box
2361
2362    __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2363
2364    // Load the oop from the handle
2365    __ movptr(obj_reg, Address(oop_handle_reg, 0));
2366
2367    if (UseBiasedLocking) {
2368      __ biased_locking_enter(lock_reg, obj_reg, swap_reg, rscratch1, false, lock_done, &slow_path_lock);
2369    }
2370
2371    // Load immediate 1 into swap_reg %rax
2372    __ movl(swap_reg, 1);
2373
2374    // Load (object->mark() | 1) into swap_reg %rax
2375    __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2376
2377    // Save (object->mark() | 1) into BasicLock's displaced header
2378    __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2379
2380    if (os::is_MP()) {
2381      __ lock();
2382    }
2383
2384    // src -> dest iff dest == rax else rax <- dest
2385    __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2386    __ jcc(Assembler::equal, lock_done);
2387
2388    // Hmm should this move to the slow path code area???
2389
2390    // Test if the oopMark is an obvious stack pointer, i.e.,
2391    //  1) (mark & 3) == 0, and
2392    //  2) rsp <= mark < rsp + os::pagesize()
2393    // These 3 tests can be done by evaluating the following
2394    // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2395    // assuming both stack pointer and pagesize have their
2396    // least significant 2 bits clear.
2397    // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
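    // Worked example (assuming a 4K page): 3 - 4096 == -4093, a mask that
    // keeps the low two bits plus every bit from bit 12 upward.  If rax holds
    // a stack lock address slightly above rsp, say rsp + 0x40, then
    // (mark - rsp) & mask == 0, i.e. a recursive lock.  An unlocked mark word
    // (low bits 0b01) or an address not within a page above rsp leaves some
    // masked bit set and we fall into the slow path below.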
2398
2399    __ subptr(swap_reg, rsp);
2400    __ andptr(swap_reg, 3 - os::vm_page_size());
2401
2402    // Save the test result, for recursive case, the result is zero
2403    __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2404    __ jcc(Assembler::notEqual, slow_path_lock);
2405
2406    // Slow path will re-enter here
2407
2408    __ bind(lock_done);
2409  }
2410
2411
2412  // Finally just about ready to make the JNI call
2413
2414
2415  // get JNIEnv* which is first argument to native
2416  if (!is_critical_native) {
2417    __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2418  }
2419
2420  // Now set thread in native
2421  __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2422
2423  __ call(RuntimeAddress(native_func));
2424
2425  // Verify or restore cpu control state after JNI call
2426  __ restore_cpu_control_state_after_jni();
2427
2428  // Unpack native results.
2429  switch (ret_type) {
2430  case T_BOOLEAN: __ c2bool(rax);            break;
2431  case T_CHAR   : __ movzwl(rax, rax);      break;
2432  case T_BYTE   : __ sign_extend_byte (rax); break;
2433  case T_SHORT  : __ sign_extend_short(rax); break;
2434  case T_INT    : /* nothing to do */        break;
2435  case T_DOUBLE :
2436  case T_FLOAT  :
2437    // Result is in xmm0 we'll save as needed
2438    break;
2439  case T_ARRAY:                 // Really a handle
2440  case T_OBJECT:                // Really a handle
2441      break; // can't de-handlize until after safepoint check
2442  case T_VOID: break;
2443  case T_LONG: break;
2444  default       : ShouldNotReachHere();
2445  }
2446
2447  // Switch thread to "native transition" state before reading the synchronization state.
2448  // This additional state is necessary because reading and testing the synchronization
2449  // state is not atomic w.r.t. GC, as this scenario demonstrates:
2450  //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2451  //     VM thread changes sync state to synchronizing and suspends threads for GC.
2452  //     Thread A is resumed to finish this native method, but doesn't block here since it
2453  //     didn't see any synchronization in progress, and escapes.
2454  __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2455
2456  if (os::is_MP()) {
2457    if (UseMembar) {
2458      // Force this write out before the read below
2459      __ membar(Assembler::Membar_mask_bits(
2460           Assembler::LoadLoad | Assembler::LoadStore |
2461           Assembler::StoreLoad | Assembler::StoreStore));
2462    } else {
2463      // Write serialization page so VM thread can do a pseudo remote membar.
2464      // We use the current thread pointer to calculate a thread specific
2465      // offset to write to within the page. This minimizes bus traffic
2466      // due to cache line collision.
2467      __ serialize_memory(r15_thread, rcx);
2468    }
2469  }
2470
2471  Label after_transition;
2472
2473  // check for safepoint operation in progress and/or pending suspend requests
2474  {
2475    Label Continue;
2476
2477    __ cmp32(ExternalAddress((address)SafepointSynchronize::address_of_state()),
2478             SafepointSynchronize::_not_synchronized);
2479
2480    Label L;
2481    __ jcc(Assembler::notEqual, L);
2482    __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2483    __ jcc(Assembler::equal, Continue);
2484    __ bind(L);
2485
2486    // Don't use call_VM as it will see a possible pending exception and forward it
2487    // and never return here preventing us from clearing _last_native_pc down below.
2488    // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are
2489    // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2490    // by hand.
2491    //
2492    __ vzeroupper();
2493    save_native_result(masm, ret_type, stack_slots);
2494    __ mov(c_rarg0, r15_thread);
2495    __ mov(r12, rsp); // remember sp
2496    __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2497    __ andptr(rsp, -16); // align stack as required by ABI
2498    if (!is_critical_native) {
2499      __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2500    } else {
2501      __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans_and_transition)));
2502    }
2503    __ mov(rsp, r12); // restore sp
2504    __ reinit_heapbase();
2505    // Restore any method result value
2506    restore_native_result(masm, ret_type, stack_slots);
2507
2508    if (is_critical_native) {
2509      // The call above performed the transition to thread_in_Java so
2510      // skip the transition logic below.
2511      __ jmpb(after_transition);
2512    }
2513
2514    __ bind(Continue);
2515  }
2516
2517  // change thread state
2518  __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2519  __ bind(after_transition);
2520
2521  Label reguard;
2522  Label reguard_done;
2523  __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), JavaThread::stack_guard_yellow_reserved_disabled);
2524  __ jcc(Assembler::equal, reguard);
2525  __ bind(reguard_done);
2526
2527  // native result if any is live
2528
2529  // Unlock
2530  Label unlock_done;
2531  Label slow_path_unlock;
2532  if (method->is_synchronized()) {
2533
2534    // Get locked oop from the handle we passed to jni
2535    __ movptr(obj_reg, Address(oop_handle_reg, 0));
2536
2537    Label done;
2538
2539    if (UseBiasedLocking) {
2540      __ biased_locking_exit(obj_reg, old_hdr, done);
2541    }
2542
2543    // Simple recursive lock?
2544
2545    __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), (int32_t)NULL_WORD);
2546    __ jcc(Assembler::equal, done);
2547
2548    // Must save rax if it is live now because cmpxchg must use it
2549    if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2550      save_native_result(masm, ret_type, stack_slots);
2551    }
2552
2553
2554    // get address of the stack lock
2555    __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2556    //  get old displaced header
2557    __ movptr(old_hdr, Address(rax, 0));
2558
2559    // Atomic swap old header if oop still contains the stack lock
2560    if (os::is_MP()) {
2561      __ lock();
2562    }
2563    __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2564    __ jcc(Assembler::notEqual, slow_path_unlock);
2565
2566    // slow path re-enters here
2567    __ bind(unlock_done);
2568    if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2569      restore_native_result(masm, ret_type, stack_slots);
2570    }
2571
2572    __ bind(done);
2573
2574  }
2575  {
2576    SkipIfEqual skip(masm, &DTraceMethodProbes, false);
2577    save_native_result(masm, ret_type, stack_slots);
2578    __ mov_metadata(c_rarg1, method());
2579    __ call_VM_leaf(
2580         CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2581         r15_thread, c_rarg1);
2582    restore_native_result(masm, ret_type, stack_slots);
2583  }
2584
2585  __ reset_last_Java_frame(false);
2586
2587  // Unbox oop result, e.g. JNIHandles::resolve value.
2588  if (ret_type == T_OBJECT || ret_type == T_ARRAY) {
2589    __ resolve_jobject(rax /* value */,
2590                       r15_thread /* thread */,
2591                       rcx /* tmp */);
2592  }
2593
2594  if (CheckJNICalls) {
2595    // clear_pending_jni_exception_check
2596    __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2597  }
2598
2599  if (!is_critical_native) {
2600    // reset handle block
2601    __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2602    __ movl(Address(rcx, JNIHandleBlock::top_offset_in_bytes()), (int32_t)NULL_WORD);
2603  }
2604
2605  // pop our frame
2606
2607  __ leave();
2608
2609  if (!is_critical_native) {
2610    // Any exception pending?
2611    __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2612    __ jcc(Assembler::notEqual, exception_pending);
2613  }
2614
2615  // Return
2616
2617  __ ret(0);
2618
2619  // Unexpected paths are out of line and go here
2620
2621  if (!is_critical_native) {
2622    // forward the exception
2623    __ bind(exception_pending);
2624
2625    // and forward the exception
2626    __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2627  }
2628
2629  // Slow path locking & unlocking
2630  if (method->is_synchronized()) {
2631
2632    // BEGIN Slow path lock
2633    __ bind(slow_path_lock);
2634
2635    // has last_Java_frame setup. No exceptions, so do a vanilla call, not call_VM
2636    // args are (oop obj, BasicLock* lock, JavaThread* thread)
2637
2638    // protect the args we've loaded
2639    save_args(masm, total_c_args, c_arg, out_regs);
2640
2641    __ mov(c_rarg0, obj_reg);
2642    __ mov(c_rarg1, lock_reg);
2643    __ mov(c_rarg2, r15_thread);
2644
2645    // Not a leaf but we have last_Java_frame setup as we want
2646    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2647    restore_args(masm, total_c_args, c_arg, out_regs);
2648
2649#ifdef ASSERT
2650    { Label L;
2651    __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2652    __ jcc(Assembler::equal, L);
2653    __ stop("no pending exception allowed on exit from monitorenter");
2654    __ bind(L);
2655    }
2656#endif
2657    __ jmp(lock_done);
2658
2659    // END Slow path lock
2660
2661    // BEGIN Slow path unlock
2662    __ bind(slow_path_unlock);
2663
2664    // If we haven't already saved the native result we must save it now as xmm registers
2665    // are still exposed.
2666    __ vzeroupper();
2667    if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2668      save_native_result(masm, ret_type, stack_slots);
2669    }
2670
2671    __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2672
2673    __ mov(c_rarg0, obj_reg);
2674    __ mov(c_rarg2, r15_thread);
2675    __ mov(r12, rsp); // remember sp
2676    __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2677    __ andptr(rsp, -16); // align stack as required by ABI
2678
2679    // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2680    // NOTE that obj_reg == rbx currently
2681    __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2682    __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2683
2684    // args are (oop obj, BasicLock* lock, JavaThread* thread)
2685    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2686    __ mov(rsp, r12); // restore sp
2687    __ reinit_heapbase();
2688#ifdef ASSERT
2689    {
2690      Label L;
2691      __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int)NULL_WORD);
2692      __ jcc(Assembler::equal, L);
2693      __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2694      __ bind(L);
2695    }
2696#endif /* ASSERT */
2697
2698    __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2699
2700    if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2701      restore_native_result(masm, ret_type, stack_slots);
2702    }
2703    __ jmp(unlock_done);
2704
2705    // END Slow path unlock
2706
2707  } // synchronized
2708
2709  // SLOW PATH Reguard the stack if needed
2710
2711  __ bind(reguard);
2712  __ vzeroupper();
2713  save_native_result(masm, ret_type, stack_slots);
2714  __ mov(r12, rsp); // remember sp
2715  __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2716  __ andptr(rsp, -16); // align stack as required by ABI
2717  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2718  __ mov(rsp, r12); // restore sp
2719  __ reinit_heapbase();
2720  restore_native_result(masm, ret_type, stack_slots);
2721  // and continue
2722  __ jmp(reguard_done);
2723
2724
2725
2726  __ flush();
2727
2728  nmethod *nm = nmethod::new_native_nmethod(method,
2729                                            compile_id,
2730                                            masm->code(),
2731                                            vep_offset,
2732                                            frame_complete,
2733                                            stack_slots / VMRegImpl::slots_per_word,
2734                                            (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2735                                            in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2736                                            oop_maps);
2737
2738  if (is_critical_native) {
2739    nm->set_lazy_critical_native(true);
2740  }
2741
2742  return nm;
2743
2744}
2745
2746// this function returns the adjustment size (in number of words) to a c2i adapter
2747// activation for use during deoptimization
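// For example, a callee with 2 parameters and 5 locals needs an extra
// (5 - 2) * Interpreter::stackElementWords words in its activation.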
2748int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2749  return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2750}
2751
2752
2753uint SharedRuntime::out_preserve_stack_slots() {
2754  return 0;
2755}
2756
2757//------------------------------generate_deopt_blob----------------------------
2758void SharedRuntime::generate_deopt_blob() {
2759  // Allocate space for the code
2760  ResourceMark rm;
2761  // Setup code generation tools
2762  int pad = 0;
2763#if INCLUDE_JVMCI
2764  if (EnableJVMCI || UseAOT) {
2765    pad += 512; // Increase the buffer size when compiling for JVMCI
2766  }
2767#endif
2768  CodeBuffer buffer("deopt_blob", 2048+pad, 1024);
2769  MacroAssembler* masm = new MacroAssembler(&buffer);
2770  int frame_size_in_words;
2771  OopMap* map = NULL;
2772  OopMapSet *oop_maps = new OopMapSet();
2773
2774  // -------------
2775  // This code enters when returning to a de-optimized nmethod.  A return
2776  // address has been pushed on the the stack, and return values are in
2777  // registers.
2778  // If we are doing a normal deopt then we were called from the patched
2779  // nmethod from the point we returned to the nmethod. So the return
2780  // address on the stack is wrong by NativeCall::instruction_size
2781  // We will adjust the value so it looks like we have the original return
2782  // address on the stack (like when we eagerly deoptimized).
2783  // In the case of an exception pending when deoptimizing, we enter
2784  // with a return address on the stack that points after the call we patched
2785  // into the exception handler. We have the following register state from,
2786  // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2787  //    rax: exception oop
2788  //    rbx: exception handler
2789  //    rdx: throwing pc
2790  // So in this case we simply jam rdx into the useless return address and
2791  // the stack looks just like we want.
2792  //
2793  // At this point we need to de-opt.  We save the argument return
2794  // registers.  We call the first C routine, fetch_unroll_info().  This
2795  // routine captures the return values and returns a structure which
2796  // describes the current frame size and the sizes of all replacement frames.
2797  // The current frame is compiled code and may contain many inlined
2798  // functions, each with their own JVM state.  We pop the current frame, then
2799  // push all the new frames.  Then we call the C routine unpack_frames() to
2800  // populate these frames.  Finally unpack_frames() returns us the new target
2801  // address.  Notice that callee-save registers are BLOWN here; they have
2802  // already been captured in the vframeArray at the time the return PC was
2803  // patched.
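  // In outline (an informal sketch of the code generated below, not extra
  // behavior):
  //   entry (normal deopt / reexecute / exception):
  //     save all live registers; set r14 = Unpack_{deopt,reexecute,exception}
  //   common path (cont):
  //     info = Deoptimization::fetch_unroll_info(thread, exec_mode)
  //     pop the deoptimized frame using info's size_of_deoptimized_frame
  //     push one skeletal interpreter frame per replacement frame
  //     Deoptimization::unpack_frames(thread, exec_mode)  // fill in the frames
  //     restore the return values and return into the interpreter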
2804  address start = __ pc();
2805  Label cont;
2806
2807  // Prolog for non exception case!
2808
2809  // Save everything in sight.
2810  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
2811
2812  // Normal deoptimization.  Save exec mode for unpack_frames.
2813  __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2814  __ jmp(cont);
2815
2816  int reexecute_offset = __ pc() - start;
2817#if INCLUDE_JVMCI && !defined(COMPILER1)
2818  if (EnableJVMCI && UseJVMCICompiler) {
2819    // JVMCI does not use this kind of deoptimization
2820    __ should_not_reach_here();
2821  }
2822#endif
2823
2824  // Reexecute case
2825  // The return address is the pc that describes what bci to re-execute at
2826
2827  // No need to update map as each call to save_live_registers will produce identical oopmap
2828  (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
2829
2830  __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2831  __ jmp(cont);
2832
2833#if INCLUDE_JVMCI
2834  Label after_fetch_unroll_info_call;
2835  int implicit_exception_uncommon_trap_offset = 0;
2836  int uncommon_trap_offset = 0;
2837
2838  if (EnableJVMCI || UseAOT) {
2839    implicit_exception_uncommon_trap_offset = __ pc() - start;
2840
2841    __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2842    __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), (int32_t)NULL_WORD);
2843
2844    uncommon_trap_offset = __ pc() - start;
2845
2846    // Save everything in sight.
2847    RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
2848    // fetch_unroll_info needs to call last_java_frame()
2849    __ set_last_Java_frame(noreg, noreg, NULL);
2850
2851    __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2852    __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2853
2854    __ movl(r14, (int32_t)Deoptimization::Unpack_reexecute);
2855    __ mov(c_rarg0, r15_thread);
2856    __ movl(c_rarg2, r14); // exec mode
2857    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2858    oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2859
2860    __ reset_last_Java_frame(false);
2861
2862    __ jmp(after_fetch_unroll_info_call);
2863  } // EnableJVMCI
2864#endif // INCLUDE_JVMCI
2865
2866  int exception_offset = __ pc() - start;
2867
2868  // Prolog for exception case
2869
2870  // all registers are dead at this entry point, except for rax, and
2871  // rdx which contain the exception oop and exception pc
2872  // respectively.  Set them in TLS and fall thru to the
2873  // unpack_with_exception_in_tls entry point.
2874
2875  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2876  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2877
2878  int exception_in_tls_offset = __ pc() - start;
2879
2880  // new implementation because exception oop is now passed in JavaThread
2881
2882  // Prolog for exception case
2883  // All registers must be preserved because they might be used by LinearScan
2884  // Exceptiop oop and throwing PC are passed in JavaThread
2885  // tos: stack at point of call to method that threw the exception (i.e. only
2886  // args are on the stack, no return address)
2887
2888  // make room on stack for the return address
2889  // It will be patched later with the throwing pc. The correct value is not
2890  // available now because loading it from memory would destroy registers.
2891  __ push(0);
2892
2893  // Save everything in sight.
2894  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
2895
2896  // Now it is safe to overwrite any register
2897
2898  // Deopt during an exception.  Save exec mode for unpack_frames.
2899  __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2900
2901  // load throwing pc from JavaThread and patch it as the return address
2902  // of the current frame. Then clear the field in JavaThread
2903
2904  __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2905  __ movptr(Address(rbp, wordSize), rdx);
2906  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2907
2908#ifdef ASSERT
2909  // verify that there is really an exception oop in JavaThread
2910  __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2911  __ verify_oop(rax);
2912
2913  // verify that there is no pending exception
2914  Label no_pending_exception;
2915  __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2916  __ testptr(rax, rax);
2917  __ jcc(Assembler::zero, no_pending_exception);
2918  __ stop("must not have pending exception here");
2919  __ bind(no_pending_exception);
2920#endif
2921
2922  __ bind(cont);
2923
2924  // Call C code.  Need thread and this frame, but NOT official VM entry
2925  // crud.  We cannot block on this call, no GC can happen.
2926  //
2927  // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2928
2929  // fetch_unroll_info needs to call last_java_frame().
2930
2931  __ set_last_Java_frame(noreg, noreg, NULL);
2932#ifdef ASSERT
2933  { Label L;
2934    __ cmpptr(Address(r15_thread,
2935                    JavaThread::last_Java_fp_offset()),
2936            (int32_t)0);
2937    __ jcc(Assembler::equal, L);
2938    __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2939    __ bind(L);
2940  }
2941#endif // ASSERT
2942  __ mov(c_rarg0, r15_thread);
2943  __ movl(c_rarg1, r14); // exec_mode
2944  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2945
2946  // Need to have an oopmap that tells fetch_unroll_info where to
2947  // find any register it might need.
2948  oop_maps->add_gc_map(__ pc() - start, map);
2949
2950  __ reset_last_Java_frame(false);
2951
2952#if INCLUDE_JVMCI
2953  if (EnableJVMCI || UseAOT) {
2954    __ bind(after_fetch_unroll_info_call);
2955  }
2956#endif
2957
2958  // Load UnrollBlock* into rdi
2959  __ mov(rdi, rax);
2960
2961  __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()));
2962  Label noException;
2963  __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2964  __ jcc(Assembler::notEqual, noException);
2965  __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2966  // QQQ this is useless; it was NULL above
2967  __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2968  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int32_t)NULL_WORD);
2969  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2970
2971  __ verify_oop(rax);
2972
2973  // Overwrite the result registers with the exception results.
2974  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2975  // I think this is useless
2976  __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2977
2978  __ bind(noException);
2979
2980  // Only register save data is on the stack.
2981  // Now restore the result registers.  Everything else is either dead
2982  // or captured in the vframeArray.
2983  RegisterSaver::restore_result_registers(masm);
2984
2985  // All of the register save area has been popped off the stack. Only the
2986  // return address remains.
2987
2988  // Pop all the frames we must move/replace.
2989  //
2990  // Frame picture (youngest to oldest)
2991  // 1: self-frame (no frame link)
2992  // 2: deopting frame  (no frame link)
2993  // 3: caller of deopting frame (could be compiled/interpreted).
2994  //
2995  // Note: by leaving the return address of self-frame on the stack
2996  // and using the size of frame 2 to adjust the stack
2997  // when we are done the return to frame 3 will still be on the stack.
2998
2999  // Pop deoptimized frame
3000  __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes()));
3001  __ addptr(rsp, rcx);
3002
3003  // rsp should be pointing at the return address to the caller (3)
3004
3005  // Pick up the initial fp we should save
3006  // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
3007  __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
3008
3009#ifdef ASSERT
3010  // Compilers generate code that bangs the stack by as much as the
3011  // interpreter would need. So this stack banging should never
3012  // trigger a fault. Verify that it does not on non-product builds.
3013  if (UseStackBanging) {
3014    __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
3015    __ bang_stack_size(rbx, rcx);
3016  }
3017#endif
3018
3019  // Load address of array of frame pcs into rcx
3020  __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
3021
3022  // Trash the old pc
3023  __ addptr(rsp, wordSize);
3024
3025  // Load address of array of frame sizes into rsi
3026  __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
3027
3028  // Load counter into rdx
3029  __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes()));
3030
3031  // Now adjust the caller's stack to make up for the extra locals
3032  // but record the original sp so that we can save it in the skeletal interpreter
3033  // frame; the stack walking of interpreter_sender will then get the unextended sp
3034  // value and not the "real" sp value.
3035
3036  const Register sender_sp = r8;
3037
3038  __ mov(sender_sp, rsp);
3039  __ movl(rbx, Address(rdi,
3040                       Deoptimization::UnrollBlock::
3041                       caller_adjustment_offset_in_bytes()));
3042  __ subptr(rsp, rbx);
3043
3044  // Push interpreter frames in a loop
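  // Each iteration below builds one skeletal interpreter frame (roughly):
  //   push frame_pcs[k]            (becomes the new frame's return address)
  //   enter()                      (push old rbp, rbp = rsp)
  //   rsp -= frame_sizes[k] - 2 words   (pc and rbp were pushed by hand)
  //   interpreter_frame_last_sp   = NULL (corrected by layout_activation_impl)
  //   interpreter_frame_sender_sp = previous frame's sp (makes it walkable)
  // The frame contents themselves are filled in later by unpack_frames().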
3045  Label loop;
3046  __ bind(loop);
3047  __ movptr(rbx, Address(rsi, 0));      // Load frame size
3048  __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
3049  __ pushptr(Address(rcx, 0));          // Save return address
3050  __ enter();                           // Save old & set new ebp
3051  __ subptr(rsp, rbx);                  // Prolog
3052  // This value is corrected by layout_activation_impl
3053  __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
3054  __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
3055  __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
3056  __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
3057  __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
3058  __ decrementl(rdx);                   // Decrement counter
3059  __ jcc(Assembler::notZero, loop);
3060  __ pushptr(Address(rcx, 0));          // Save final return address
3061
3062  // Re-push self-frame
3063  __ enter();                           // Save old & set new ebp
3064
3065  // Allocate a full sized register save area.
3066  // Return address and rbp are in place, so we allocate two less words.
3067  __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
3068
3069  // Restore frame locals after moving the frame
3070  __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
3071  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3072
3073  // Call C code.  Need thread but NOT official VM entry
3074  // crud.  We cannot block on this call, no GC can happen.  Call should
3075  // restore return values to their stack-slots with the new SP.
3076  //
3077  // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
3078
3079  // Use rbp because the frames look interpreted now
3080  // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3081  // Don't need the precise return PC here, just precise enough to point into this code blob.
3082  address the_pc = __ pc();
3083  __ set_last_Java_frame(noreg, rbp, the_pc);
3084
3085  __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
3086  __ mov(c_rarg0, r15_thread);
3087  __ movl(c_rarg1, r14); // second arg: exec_mode
3088  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3089  // Revert SP alignment after call since we're going to do some SP relative addressing below
3090  __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
3091
3092  // Set an oopmap for the call site
3093  // Use the same PC we used for the last java frame
3094  oop_maps->add_gc_map(the_pc - start,
3095                       new OopMap( frame_size_in_words, 0 ));
3096
3097  // Clear fp AND pc
3098  __ reset_last_Java_frame(true);
3099
3100  // Collect return values
3101  __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
3102  __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
3103  // I think this is useless (throwing pc?)
3104  __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
3105
3106  // Pop self-frame.
3107  __ leave();                           // Epilog
3108
3109  // Jump to interpreter
3110  __ ret(0);
3111
3112  // Make sure all code is generated
3113  masm->flush();
3114
3115  _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
3116  _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
3117#if INCLUDE_JVMCI
3118  if (EnableJVMCI || UseAOT) {
3119    _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
3120    _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
3121  }
3122#endif
3123}
3124
3125#ifdef COMPILER2
3126//------------------------------generate_uncommon_trap_blob--------------------
3127void SharedRuntime::generate_uncommon_trap_blob() {
3128  // Allocate space for the code
3129  ResourceMark rm;
3130  // Setup code generation tools
3131  CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
3132  MacroAssembler* masm = new MacroAssembler(&buffer);
3133
3134  assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3135
3136  address start = __ pc();
3137
3138  if (UseRTMLocking) {
3139    // Abort RTM transaction before possible nmethod deoptimization.
3140    __ xabort(0);
3141  }
3142
3143  // Push self-frame.  We get here with a return address on the
3144  // stack, so rsp is 8-byte aligned until we allocate our frame.
3145  __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog!
3146
3147  // No callee saved registers. rbp is assumed implicitly saved
3148  __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3149
3150  // compiler left unloaded_class_index in j_rarg0 move to where the
3151  // runtime expects it.
3152  __ movl(c_rarg1, j_rarg0);
3153
3154  __ set_last_Java_frame(noreg, noreg, NULL);
3155
3156  // Call C code.  Need thread but NOT official VM entry
3157  // crud.  We cannot block on this call, no GC can happen.  Call should
3158  // capture callee-saved registers as well as return values.
3159  // Thread is in rdi already.
3160  //
3161  // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index);
3162
3163  __ mov(c_rarg0, r15_thread);
3164  __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
3165  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
3166
3167  // Set an oopmap for the call site
3168  OopMapSet* oop_maps = new OopMapSet();
3169  OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
3170
3171  // location of rbp is known implicitly by the frame sender code
3172
3173  oop_maps->add_gc_map(__ pc() - start, map);
3174
3175  __ reset_last_Java_frame(false);
3176
3177  // Load UnrollBlock* into rdi
3178  __ mov(rdi, rax);
3179
3180#ifdef ASSERT
3181  { Label L;
3182    __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()),
3183            (int32_t)Deoptimization::Unpack_uncommon_trap);
3184    __ jcc(Assembler::equal, L);
3185    __ stop("SharedRuntime::generate_deopt_blob: expected Unpack_uncommon_trap");
3186    __ bind(L);
3187  }
3188#endif
3189
3190  // Pop all the frames we must move/replace.
3191  //
3192  // Frame picture (youngest to oldest)
3193  // 1: self-frame (no frame link)
3194  // 2: deopting frame  (no frame link)
3195  // 3: caller of deopting frame (could be compiled/interpreted).
3196
3197  // Pop self-frame.  We have no frame, and must rely only on rax and rsp.
3198  __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
3199
3200  // Pop deoptimized frame (int)
3201  __ movl(rcx, Address(rdi,
3202                       Deoptimization::UnrollBlock::
3203                       size_of_deoptimized_frame_offset_in_bytes()));
3204  __ addptr(rsp, rcx);
3205
3206  // rsp should be pointing at the return address to the caller (3)
3207
3208  // Pick up the initial fp we should save
3209  // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
3210  __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
3211
3212#ifdef ASSERT
3213  // Compilers generate code that bangs the stack by as much as the
3214  // interpreter would need. So this stack banging should never
3215  // trigger a fault. Verify that it does not on non-product builds.
3216  if (UseStackBanging) {
3217    __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
3218    __ bang_stack_size(rbx, rcx);
3219  }
3220#endif
3221
3222  // Load address of array of frame pcs into rcx (address*)
3223  __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
3224
3225  // Trash the return pc
3226  __ addptr(rsp, wordSize);
3227
3228  // Load address of array of frame sizes into rsi (intptr_t*)
3229  __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
3230
3231  // Counter
3232  __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes())); // (int)
3233
3234  // Now adjust the caller's stack to make up for the extra locals but
3235  // record the original sp so that we can save it in the skeletal
3236  // interpreter frame; the stack walking of interpreter_sender
3237  // will then get the unextended sp value and not the "real" sp value.
3238
3239  const Register sender_sp = r8;
3240
3241  __ mov(sender_sp, rsp);
3242  __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::caller_adjustment_offset_in_bytes())); // (int)
3243  __ subptr(rsp, rbx);
3244
3245  // Push interpreter frames in a loop
3246  Label loop;
3247  __ bind(loop);
3248  __ movptr(rbx, Address(rsi, 0)); // Load frame size
3249  __ subptr(rbx, 2 * wordSize);    // We'll push pc and rbp by hand
3250  __ pushptr(Address(rcx, 0));     // Save return address
3251  __ enter();                      // Save old & set new rbp
3252  __ subptr(rsp, rbx);             // Prolog
3253  __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
3254            sender_sp);            // Make it walkable
3255  // This value is corrected by layout_activation_impl
3256  __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
3257  __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
3258  __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
3259  __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
3260  __ decrementl(rdx);              // Decrement counter
3261  __ jcc(Assembler::notZero, loop);
3262  __ pushptr(Address(rcx, 0));     // Save final return address
3263
3264  // Re-push self-frame
3265  __ enter();                 // Save old & set new rbp
3266  __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
3267                              // Prolog
3268
3269  // Use rbp because the frames look interpreted now
3270  // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3271  // Don't need the precise return PC here, just precise enough to point into this code blob.
3272  address the_pc = __ pc();
3273  __ set_last_Java_frame(noreg, rbp, the_pc);
3274
3275  // Call C code.  Need thread but NOT official VM entry
3276  // crud.  We cannot block on this call, no GC can happen.  Call should
3277  // restore return values to their stack-slots with the new SP.
3278  // Thread is in rdi already.
3279  //
3280  // BasicType unpack_frames(JavaThread* thread, int exec_mode);
3281
3282  __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
3283  __ mov(c_rarg0, r15_thread);
3284  __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
3285  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3286
3287  // Set an oopmap for the call site
3288  // Use the same PC we used for the last java frame
3289  oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3290
3291  // Clear fp AND pc
3292  __ reset_last_Java_frame(true);
3293
3294  // Pop self-frame.
3295  __ leave();                 // Epilog
3296
3297  // Jump to interpreter
3298  __ ret(0);
3299
3300  // Make sure all code is generated
3301  masm->flush();
3302
3303  _uncommon_trap_blob =  UncommonTrapBlob::create(&buffer, oop_maps,
3304                                                 SimpleRuntimeFrame::framesize >> 1);
3305}
3306#endif // COMPILER2
3307
3308
3309//------------------------------generate_handler_blob------
3310//
3311// Generate a special Compile2Runtime blob that saves all registers,
3312// and sets up an oopmap.
3313//
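// In outline, the blob generated below: optionally pushes a placeholder
// return address (later overwritten with the saved exception pc), saves all
// live registers (including vectors for POLL_AT_VECTOR_LOOP), calls call_ptr
// with the current thread, and then either forwards a pending exception or
// restores the registers and returns.  (Informal summary of the code that
// follows.)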
3314SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
3315  assert(StubRoutines::forward_exception_entry() != NULL,
3316         "must be generated before");
3317
3318  ResourceMark rm;
3319  OopMapSet *oop_maps = new OopMapSet();
3320  OopMap* map;
3321
3322  // Allocate space for the code.  Setup code generation tools.
3323  CodeBuffer buffer("handler_blob", 2048, 1024);
3324  MacroAssembler* masm = new MacroAssembler(&buffer);
3325
3326  address start   = __ pc();
3327  address call_pc = NULL;
3328  int frame_size_in_words;
3329  bool cause_return = (poll_type == POLL_AT_RETURN);
3330  bool save_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
3331
3332  if (UseRTMLocking) {
3333    // Abort RTM transaction before calling runtime
3334    // because critical section will be large and will be
3335    // aborted anyway. Also nmethod could be deoptimized.
3336    __ xabort(0);
3337  }
3338
3339  // Make room for return address (or push it again)
3340  if (!cause_return) {
3341    __ push(rbx);
3342  }
3343
3344  // Save registers, fpu state, and flags
3345  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_vectors);
3346
3347  // The following is basically a call_VM.  However, we need the precise
3348  // address of the call in order to generate an oopmap. Hence, we do all the
3349  // work outselves.
3350
3351  __ set_last_Java_frame(noreg, noreg, NULL);
3352
3353  // The return address must always be correct so that the frame constructor never
3354  // sees an invalid pc.
3355
3356  if (!cause_return) {
3357    // overwrite the dummy value we pushed on entry
3358    __ movptr(c_rarg0, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3359    __ movptr(Address(rbp, wordSize), c_rarg0);
3360  }
3361
3362  // Do the call
3363  __ mov(c_rarg0, r15_thread);
3364  __ call(RuntimeAddress(call_ptr));
3365
3366  // Set an oopmap for the call site.  This oopmap will map all
3367  // oop-registers and debug-info registers as callee-saved.  This
3368  // will allow deoptimization at this safepoint to find all possible
3369  // debug-info recordings, as well as let GC find all oops.
3370
3371  oop_maps->add_gc_map( __ pc() - start, map);
3372
3373  Label noException;
3374
3375  __ reset_last_Java_frame(false);
3376
3377  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3378  __ jcc(Assembler::equal, noException);
3379
3380  // Exception pending
3381
3382  RegisterSaver::restore_live_registers(masm, save_vectors);
3383
3384  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3385
3386  // No exception case
3387  __ bind(noException);
3388
3389  // Normal exit, restore registers and exit.
3390  RegisterSaver::restore_live_registers(masm, save_vectors);
3391
3392  __ ret(0);
3393
3394  // Make sure all code is generated
3395  masm->flush();
3396
3397  // Fill-out other meta info
3398  return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3399}
3400
3401//
3402// generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3403//
3404// Generate a stub that calls into vm to find out the proper destination
3405// of a java call. All the argument registers are live at this point
3406// but since this is generic code we don't know what they are and the caller
3407// must do any gc of the args.
3408//
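// In outline, the stub generated below saves all live registers, calls
// `destination` with the current thread, stores the resolved Method* (from
// vm_result_2) and the target entry point (rax) back into the register save
// area, restores the registers and jumps to the target.  If an exception is
// pending after the call, it is forwarded instead.  (Informal summary of the
// code that follows.)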
3409RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
3410  assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
3411
3412  // allocate space for the code
3413  ResourceMark rm;
3414
3415  CodeBuffer buffer(name, 1000, 512);
3416  MacroAssembler* masm                = new MacroAssembler(&buffer);
3417
3418  int frame_size_in_words;
3419
3420  OopMapSet *oop_maps = new OopMapSet();
3421  OopMap* map = NULL;
3422
3423  int start = __ offset();
3424
3425  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
3426
3427  int frame_complete = __ offset();
3428
3429  __ set_last_Java_frame(noreg, noreg, NULL);
3430
3431  __ mov(c_rarg0, r15_thread);
3432
3433  __ call(RuntimeAddress(destination));
3434
3435
3436  // Set an oopmap for the call site.
3437  // We need this not only for callee-saved registers, but also for volatile
3438  // registers that the compiler might be keeping live across a safepoint.
3439
3440  oop_maps->add_gc_map( __ offset() - start, map);
3441
3442  // rax contains the address we are going to jump to assuming no exception got installed
3443
3444  // clear last_Java_sp
3445  __ reset_last_Java_frame(false);
3446  // check for pending exceptions
3447  Label pending;
3448  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3449  __ jcc(Assembler::notEqual, pending);
3450
3451  // get the returned Method*
3452  __ get_vm_result_2(rbx, r15_thread);
3453  __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3454
3455  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3456
3457  RegisterSaver::restore_live_registers(masm);
3458
3459  // We are back to the original state on entry and ready to go.
3460
3461  __ jmp(rax);
3462
3463  // Pending exception after the safepoint
3464
3465  __ bind(pending);
3466
3467  RegisterSaver::restore_live_registers(masm);
3468
3469  // exception pending => remove activation and forward to exception handler
3470
3471  __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), (int)NULL_WORD);
3472
3473  __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3474  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3475
3476  // -------------
3477  // make sure all code is generated
3478  masm->flush();
3479
3480  // return the blob
3481  // (frame size is in words, as frame_size_in_words)
3482  return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3483}
3484
3485
3486//------------------------------Montgomery multiplication------------------------
3487//
3488
3489#ifndef _WINDOWS
3490
3491#define ASM_SUBTRACT
3492
3493#ifdef ASM_SUBTRACT
3494// Subtract 0:b from carry:a.  Return carry.
3495static unsigned long
3496sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) {
3497  long i = 0, cnt = len;
3498  unsigned long tmp;
3499  asm volatile("clc; "
3500               "0: ; "
3501               "mov (%[b], %[i], 8), %[tmp]; "
3502               "sbb %[tmp], (%[a], %[i], 8); "
3503               "inc %[i]; dec %[cnt]; "
3504               "jne 0b; "
3505               "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3506               : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3507               : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3508               : "memory");
3509  return tmp;
3510}
3511#else // ASM_SUBTRACT
3512typedef int __attribute__((mode(TI))) int128;
3513
3514// Subtract 0:b from carry:a.  Return carry.
3515static unsigned long
3516sub(unsigned long a[], unsigned long b[], unsigned long carry, int len) {
3517  int128 tmp = 0;
3518  int i;
3519  for (i = 0; i < len; i++) {
3520    tmp += a[i];
3521    tmp -= b[i];
3522    a[i] = tmp;
3523    tmp >>= 64;
3524    assert(-1 <= tmp && tmp <= 0, "invariant");
3525  }
3526  return tmp + carry;
3527}
3528#endif // ! ASM_SUBTRACT
3529
3530// Multiply (unsigned) Long A by Long B, accumulating the double-
3531// length result into the accumulator formed of T0, T1, and T2.
3532#define MACC(A, B, T0, T1, T2)                                  \
3533do {                                                            \
3534  unsigned long hi, lo;                                         \
3535  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3536           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3537           : "r"(A), "a"(B) : "cc");                            \
3538 } while(0)
3539
3540// As above, but add twice the double-length result into the
3541// accumulator.
3542#define MACC2(A, B, T0, T1, T2)                                 \
3543do {                                                            \
3544  unsigned long hi, lo;                                         \
3545  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3546           "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3547           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3548           : "r"(A), "a"(B) : "cc");                            \
3549 } while(0)
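// Read as C (illustrative only, not a drop-in replacement for the asm):
//   MACC(A, B, T0, T1, T2):  (T2:T1:T0) +=     (unsigned __int128)A * B
//   MACC2(A, B, T0, T1, T2): (T2:T1:T0) += 2 * (unsigned __int128)A * B
// where T0 is the least-significant word of the triple-precision accumulator.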
3550
3551// Fast Montgomery multiplication.  The derivation of the algorithm is
3552// in  A Cryptographic Library for the Motorola DSP56000,
3553// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
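// With R = 2^(64*len) and inv == -n[0]^-1 (mod 2^64), the routine below
// computes m such that m == a * b * R^-1 (mod n), retiring one 64-bit word
// of the product per outer iteration.  (Informal statement of the standard
// algorithm; see the paper cited above.)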
3554
3555static void __attribute__((noinline))
3556montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[],
3557                    unsigned long m[], unsigned long inv, int len) {
3558  unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3559  int i;
3560
3561  assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
3562
3563  for (i = 0; i < len; i++) {
3564    int j;
3565    for (j = 0; j < i; j++) {
3566      MACC(a[j], b[i-j], t0, t1, t2);
3567      MACC(m[j], n[i-j], t0, t1, t2);
3568    }
3569    MACC(a[i], b[0], t0, t1, t2);
3570    m[i] = t0 * inv;
3571    MACC(m[i], n[0], t0, t1, t2);
3572
3573    assert(t0 == 0, "broken Montgomery multiply");
3574
3575    t0 = t1; t1 = t2; t2 = 0;
3576  }
3577
3578  for (i = len; i < 2*len; i++) {
3579    int j;
3580    for (j = i-len+1; j < len; j++) {
3581      MACC(a[j], b[i-j], t0, t1, t2);
3582      MACC(m[j], n[i-j], t0, t1, t2);
3583    }
3584    m[i-len] = t0;
3585    t0 = t1; t1 = t2; t2 = 0;
3586  }
3587
3588  while (t0)
3589    t0 = sub(m, n, t0, len);
3590}
3591
3592// Fast Montgomery squaring.  This uses asymptotically 25% fewer
3593// multiplies so it should be up to 25% faster than Montgomery
3594// multiplication.  However, its loop control is more complex and it
3595// may actually run slower on some machines.
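// Sketch of the saving: in the convolution sum for word i, the cross terms
// a[j]*a[i-j] and a[i-j]*a[j] are equal, so MACC2 accumulates each such pair
// once (plus a single a[j]*a[j] term when i is even), roughly halving the
// number of multiplies that involve a[].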
3596
3597static void __attribute__((noinline))
3598montgomery_square(unsigned long a[], unsigned long n[],
3599                  unsigned long m[], unsigned long inv, int len) {
3600  unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3601  int i;
3602
3603  assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
3604
3605  for (i = 0; i < len; i++) {
3606    int j;
3607    int end = (i+1)/2;
3608    for (j = 0; j < end; j++) {
3609      MACC2(a[j], a[i-j], t0, t1, t2);
3610      MACC(m[j], n[i-j], t0, t1, t2);
3611    }
3612    if ((i & 1) == 0) {
3613      MACC(a[j], a[j], t0, t1, t2);
3614    }
3615    for (; j < i; j++) {
3616      MACC(m[j], n[i-j], t0, t1, t2);
3617    }
3618    m[i] = t0 * inv;
3619    MACC(m[i], n[0], t0, t1, t2);
3620
3621    assert(t0 == 0, "broken Montgomery square");
3622
3623    t0 = t1; t1 = t2; t2 = 0;
3624  }
3625
3626  for (i = len; i < 2*len; i++) {
3627    int start = i-len+1;
3628    int end = start + (len - start)/2;
3629    int j;
3630    for (j = start; j < end; j++) {
3631      MACC2(a[j], a[i-j], t0, t1, t2);
3632      MACC(m[j], n[i-j], t0, t1, t2);
3633    }
3634    if ((i & 1) == 0) {
3635      MACC(a[j], a[j], t0, t1, t2);
3636    }
3637    for (; j < len; j++) {
3638      MACC(m[j], n[i-j], t0, t1, t2);
3639    }
3640    m[i-len] = t0;
3641    t0 = t1; t1 = t2; t2 = 0;
3642  }
3643
3644  while (t0)
3645    t0 = sub(m, n, t0, len);
3646}
3647
3648// Swap words in a longword.
3649static unsigned long swap(unsigned long x) {
3650  return (x << 32) | (x >> 32);
3651}
3652
3653// Copy len longwords from s to d, word-swapping as we go.  The
3654// destination array is reversed.
3655static void reverse_words(unsigned long *s, unsigned long *d, int len) {
3656  d += len;
3657  while(len-- > 0) {
3658    d--;
3659    *d = swap(*s);
3660    s++;
3661  }
3662}
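// Illustrative example (hypothetical values): with len == 2 and
//   s = { 0x1111111122222222, 0x3333333344444444 }
// reverse_words(s, d, 2) produces
//   d = { 0x4444444433333333, 0x2222222211111111 }
// i.e. d[k] == swap(s[len-1-k]).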
3663
3664// The threshold at which squaring is advantageous was determined
3665// experimentally on an i7-3930K (Sandy Bridge-E) CPU @ 3.5GHz.
3666#define MONTGOMERY_SQUARING_THRESHOLD 64
3667
3668void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3669                                        jint len, jlong inv,
3670                                        jint *m_ints) {
3671  assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3672  int longwords = len/2;
3673
3674  // Make very sure we don't use so much space that the stack might
3675  // overflow.  512 jints corresponds to a 16384-bit integer and
3676  // will use a total of 8k bytes of stack space here.
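  // (For example, len == 512 jints gives longwords == 256, and
  //  256 longwords * 8 bytes * 4 arrays == 8192 bytes.)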
3677  int total_allocation = longwords * sizeof (unsigned long) * 4;
3678  guarantee(total_allocation <= 8192, "must be");
3679  unsigned long *scratch = (unsigned long *)alloca(total_allocation);
3680
3681  // Local scratch arrays
3682  unsigned long
3683    *a = scratch + 0 * longwords,
3684    *b = scratch + 1 * longwords,
3685    *n = scratch + 2 * longwords,
3686    *m = scratch + 3 * longwords;
3687
3688  reverse_words((unsigned long *)a_ints, a, longwords);
3689  reverse_words((unsigned long *)b_ints, b, longwords);
3690  reverse_words((unsigned long *)n_ints, n, longwords);
3691
3692  ::montgomery_multiply(a, b, n, m, (unsigned long)inv, longwords);
3693
3694  reverse_words(m, (unsigned long *)m_ints, longwords);
3695}
3696
3697void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3698                                      jint len, jlong inv,
3699                                      jint *m_ints) {
3700  assert(len % 2 == 0, "array length in montgomery_square must be even");
3701  int longwords = len/2;
3702
3703  // Make very sure we don't use so much space that the stack might
3704  // overflow.  512 jints corresponds to a 16384-bit integer and
3705  // will use a total of 6k bytes of stack space here.
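  // (For example, len == 512 jints gives longwords == 256, and
  //  256 longwords * 8 bytes * 3 arrays == 6144 bytes.)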
3706  int total_allocation = longwords * sizeof (unsigned long) * 3;
3707  guarantee(total_allocation <= 8192, "must be");
3708  unsigned long *scratch = (unsigned long *)alloca(total_allocation);
3709
3710  // Local scratch arrays
3711  unsigned long
3712    *a = scratch + 0 * longwords,
3713    *n = scratch + 1 * longwords,
3714    *m = scratch + 2 * longwords;
3715
3716  reverse_words((unsigned long *)a_ints, a, longwords);
3717  reverse_words((unsigned long *)n_ints, n, longwords);
3718
3719  if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3720    ::montgomery_square(a, n, m, (unsigned long)inv, longwords);
3721  } else {
3722    ::montgomery_multiply(a, a, n, m, (unsigned long)inv, longwords);
3723  }
3724
3725  reverse_words(m, (unsigned long *)m_ints, longwords);
3726}
3727
3728#endif // !_WINDOWS
3729
3730#ifdef COMPILER2
3731// This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3732//
3733//------------------------------generate_exception_blob---------------------------
3734// creates exception blob at the end
3735// Using exception blob, this code is jumped from a compiled method.
3736// (see emit_exception_handler in x86_64.ad file)
3737//
3738// Given an exception pc at a call we call into the runtime for the
3739// handler in this method. This handler might merely restore state
3740// (i.e. callee save registers) unwind the frame and jump to the
3741// exception handler for the nmethod if there is no Java level handler
3742// for the nmethod.
3743//
3744// This code is entered with a jmp.
3745//
3746// Arguments:
3747//   rax: exception oop
3748//   rdx: exception pc
3749//
3750// Results:
3751//   rax: exception oop
3752//   rdx: exception pc in caller or ???
3753//   destination: exception handler of caller
3754//
3755// Note: the exception pc MUST be at a call (precise debug information)
3756//       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
3757//
3758
3759void OptoRuntime::generate_exception_blob() {
3760  assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
3761  assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
3762  assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
3763
3764  assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3765
3766  // Allocate space for the code
3767  ResourceMark rm;
3768  // Setup code generation tools
3769  CodeBuffer buffer("exception_blob", 2048, 1024);
3770  MacroAssembler* masm = new MacroAssembler(&buffer);
3771
3772
3773  address start = __ pc();
3774
3775  // Exception pc is 'return address' for stack walker
3776  __ push(rdx);
3777  __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3778
3779  // Save callee-saved registers.  See x86_64.ad.
3780
3781  // rbp is an implicitly saved callee saved register (i.e., the calling
3782  // convention will save/restore it in the prolog/epilog). Other than that
3783  // there are no callee save registers now that adapter frames are gone.
3784
3785  __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3786
3787  // Store exception in Thread object. We cannot pass any arguments to the
3788  // handle_exception call, since we do not want to make any assumption
3789  // about the size of the frame in which the exception happened.
3790  // c_rarg0 is either rdi (Linux) or rcx (Windows).
3791  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()),rax);
3792  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3793
3794  // This call does all the hard work.  It checks if an exception handler
3795  // exists in the method.
3796  // If so, it returns the handler address.
3797  // If not, it prepares for stack-unwinding, restoring the callee-save
3798  // registers of the frame being removed.
3799  //
3800  // address OptoRuntime::handle_exception_C(JavaThread* thread)
3801
3802  // At a method handle call, the stack may not be properly aligned
3803  // when returning with an exception.
3804  address the_pc = __ pc();
3805  __ set_last_Java_frame(noreg, noreg, the_pc);
3806  __ mov(c_rarg0, r15_thread);
3807  __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3808  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
3809
3810  // Set an oopmap for the call site.  This oopmap will only be used if we
3811  // are unwinding the stack.  Hence, all locations will be dead.
3812  // Callee-saved registers will be the same as the frame above (i.e.,
3813  // handle_exception_stub), since they were restored when we got the
3814  // exception.
3815
3816  OopMapSet* oop_maps = new OopMapSet();
3817
3818  oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3819
3820  __ reset_last_Java_frame(false);
3821
3822  // Restore callee-saved registers
3823
3824  // rbp is an implicitly saved callee-saved register (i.e., the calling
3825  // convention will save/restore it in the prolog/epilog). Other than that
3826  // there are no callee save registers now that adapter frames are gone.
3827
3828  __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
3829
3830  __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
3831  __ pop(rdx);                  // No need for exception pc anymore
3832
3833  // rax: exception handler
3834
3835  // We have a handler in rax (could be deopt blob).
3836  __ mov(r8, rax);
3837
3838  // Get the exception oop
3839  __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3840  // Get the exception pc in case we are deoptimized
3841  __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3842#ifdef ASSERT
3843  __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), (int)NULL_WORD);
3844  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int)NULL_WORD);
3845#endif
3846  // Clear the exception oop so GC no longer processes it as a root.
3847  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int)NULL_WORD);
3848
3849  // rax: exception oop
3850  // r8:  exception handler
3851  // rdx: exception pc
3852  // Jump to handler
3853
3854  __ jmp(r8);
3855
3856  // Make sure all code is generated
3857  masm->flush();
3858
3859  // Set exception blob
3860  _exception_blob =  ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
3861}
3862#endif // COMPILER2
3863