/*
 * Copyright (c) 2003, 2017, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#ifndef _WINDOWS
#include "alloca.h"
#endif
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "code/debugInfoRec.hpp"
#include "code/icBuffer.hpp"
#include "code/vtableStubs.hpp"
#include "interpreter/interpreter.hpp"
#include "logging/log.hpp"
#include "memory/resourceArea.hpp"
#include "oops/compiledICHolder.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/vframeArray.hpp"
#include "utilities/align.hpp"
#include "vm_version_x86.hpp"
#include "vmreg_x86.inline.hpp"
#ifdef COMPILER1
#include "c1/c1_Runtime1.hpp"
#endif
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_JVMCI
#include "jvmci/jvmciJavaClasses.hpp"
#endif

#define __ masm->

const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;

class SimpleRuntimeFrame {

  public:

  // Most of the runtime stubs have this simple frame layout.
  // This class exists to make the layout shared in one place.
  // Offsets are for compiler stack slots, which are jints.
  enum layout {
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
    rbp_off2,
    return_off, return_off2,
    framesize
  };
};

class RegisterSaver {
  // Capture info about frame layout.  Layout offsets are in jint
  // units because compiler frame slots are jints.
#define XSAVE_AREA_BEGIN 160
#define XSAVE_AREA_YMM_BEGIN 576
#define XSAVE_AREA_ZMM_BEGIN 1152
#define XSAVE_AREA_UPPERBANK 1664
#define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
#define DEF_YMM_OFFS(regnum) ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
#define DEF_ZMM_OFFS(regnum) zmm ## regnum ## _off = zmm_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
  enum layout {
    fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
    xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,            // offset in fxsave save area
    DEF_XMM_OFFS(0),
    DEF_XMM_OFFS(1),
    // 2..15 are implied in range usage
    ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_YMM_OFFS(0),
    DEF_YMM_OFFS(1),
    // 2..15 are implied in range usage
    zmm_high = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    zmm_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_OFFS(16),
    DEF_ZMM_OFFS(17),
    // 18..31 are implied in range usage
    fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
    fpu_stateH_end,
    r15_off, r15H_off,
    r14_off, r14H_off,
    r13_off, r13H_off,
    r12_off, r12H_off,
    r11_off, r11H_off,
    r10_off, r10H_off,
    r9_off,  r9H_off,
    r8_off,  r8H_off,
    rdi_off, rdiH_off,
    rsi_off, rsiH_off,
    ignore_off, ignoreH_off,  // extra copy of rbp
    rsp_off, rspH_off,
    rbx_off, rbxH_off,
    rdx_off, rdxH_off,
    rcx_off, rcxH_off,
    rax_off, raxH_off,
    // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
    align_off, alignH_off,
    flags_off, flagsH_off,
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off, rbpH_off,        // copy of rbp we will restore
    return_off, returnH_off,  // slot for return address
    reg_save_size             // size in compiler stack slots
  };
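
  // Rough picture of the frame built by save_live_registers() (a sketch for
  // orientation only; the enum above is authoritative).  Low addresses (rsp)
  // at the top:
  //
  //   [ argument register save area ]  <- rsp
  //   [ xsave/fxsave state          ]  fpu_state_off .. fpu_stateH_end
  //   [ r15 .. rax, align, flags    ]  pushed by push_CPU_state()
  //   [ saved rbp                   ]  rbp_off (pushed by enter())
  //   [ return address              ]  return_off (pushed by the caller)
  //
  // All offsets are 4-byte stack slots relative to rsp once the frame is
  // fully set up.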

 public:
  static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors = false);
  static void restore_live_registers(MacroAssembler* masm, bool restore_vectors = false);

  // Offsets into the register save area
  // Used by deoptimization when it is managing result register
  // values on its own

  static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
  static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
  static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
  static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
  static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }

  // During deoptimization only the result registers need to be restored,
  // all the other values have already been extracted.
  static void restore_result_registers(MacroAssembler* masm);
};
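
// Typical usage pattern (a sketch; see the safepoint and exception blobs later
// in this file for the real call sites):
//
//   int frame_size_in_words;
//   OopMap* map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
//   // ... emit the call into the VM ...
//   RegisterSaver::restore_live_registers(masm);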

OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) {
  int off = 0;
  int num_xmm_regs = XMMRegisterImpl::number_of_registers;
  if (UseAVX < 3) {
    num_xmm_regs = num_xmm_regs/2;
  }
#if defined(COMPILER2) || INCLUDE_JVMCI
  if (save_vectors) {
    assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
    assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
  }
#else
  assert(!save_vectors, "vectors are generated only by C2 and JVMCI");
#endif

  // Always make the frame size 16-byte aligned, both vector and non vector stacks are always allocated
  int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
  // OopMap frame size is in compiler stack slots (jint's) not bytes or words
  int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
  // CodeBlob frame size is in words.
  int frame_size_in_words = frame_size_in_bytes / wordSize;
  *total_frame_words = frame_size_in_words;

  // Save registers, fpu state, and flags.
  // We assume the caller has already pushed the return address onto the
  // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return address like a normal enter.

  __ enter();          // rsp becomes 16-byte aligned here
  __ push_CPU_state(); // Push a multiple of 16 bytes

  // push_CPU_state handles this on EVEX enabled targets
  if (save_vectors) {
    // Save upper half of YMM registers(0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers(0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
      }
      // Save full ZMM registers(16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Save upper bank of ZMM registers(16..31) for double/float usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ movsd(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n));
      }
    }
  }
  __ vzeroupper();
  if (frame::arg_reg_save_area_bytes != 0) {
    // Allocate argument register save area
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }

  // Set an oopmap for the call site.  This oopmap will map all
  // oop-registers and debug-info registers as callee-saved.  This
  // will allow deoptimization at this safepoint to find all possible
  // debug-info recordings, as well as let GC find all oops.

  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = new OopMap(frame_size_in_slots, 0);

#define STACK_OFFSET(x) VMRegImpl::stack2reg((x))

  map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
  // rbp location is known implicitly by the frame sender code, needs no oopmap,
  // and the location where rbp was saved is ignored
  map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
  // on EVEX enabled targets it is included in the xsave area
  off = xmm0_off;
  int delta = xmm1_off - off;
  for (int n = 0; n < 16; n++) {
    XMMRegister xmm_name = as_XMMRegister(n);
    map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
    off += delta;
  }
  if (UseAVX > 2) {
    // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
    off = zmm16_off;
    delta = zmm17_off - off;
    for (int n = 16; n < num_xmm_regs; n++) {
      XMMRegister zmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
      off += delta;
    }
  }

#if defined(COMPILER2) || INCLUDE_JVMCI
  if (save_vectors) {
    off = ymm0_off;
    int delta = ymm1_off - off;
    for (int n = 0; n < 16; n++) {
      XMMRegister ymm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
      off += delta;
    }
  }
#endif // COMPILER2 || INCLUDE_JVMCI

  // %%% These should all be a waste but we'll keep things as they were for now
  if (true) {
    map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
    // rbp location is known implicitly by the frame sender code, needs no oopmap
    map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
    // on EVEX enabled targets it is included in the xsave area
    off = xmm0H_off;
    delta = xmm1H_off - off;
    for (int n = 0; n < 16; n++) {
      XMMRegister xmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
      off += delta;
    }
    if (UseAVX > 2) {
      // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
      off = zmm16H_off;
      delta = zmm17H_off - off;
      for (int n = 16; n < num_xmm_regs; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
        off += delta;
      }
    }
  }

  return map;
}

void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) {
  int num_xmm_regs = XMMRegisterImpl::number_of_registers;
  if (UseAVX < 3) {
    num_xmm_regs = num_xmm_regs/2;
  }
  if (frame::arg_reg_save_area_bytes != 0) {
    // Pop arg register save area
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

#if defined(COMPILER2) || INCLUDE_JVMCI
  if (restore_vectors) {
    assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
    assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
  }
#else
  assert(!restore_vectors, "vectors are generated only by C2");
#endif

  __ vzeroupper();

  // On EVEX enabled targets everything is handled in pop fpu state
  if (restore_vectors) {
    // Restore upper half of YMM registers (0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
    }
    if (VM_Version::supports_evex()) {
      // Restore upper half of ZMM registers (0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
      }
      // Restore full ZMM registers(16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      int vector_len = Assembler::AVX_512bit;
      int off = 0;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Restore upper bank of ZMM registers(16..31) for double/float usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      int off = 0;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ movsd(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)));
      }
    }
  }

  // Recover CPU state
  __ pop_CPU_state();
  // Get the rbp described implicitly by the calling convention (no oopMap)
  __ pop(rbp);
}

void RegisterSaver::restore_result_registers(MacroAssembler* masm) {

  // Just restore result registers. Only used by deoptimization. By
  // now any callee save register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration, so only result registers need to be restored here.

  // Restore fp result register
  __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
  // Restore integer result register
  __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
  __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));

  // Pop all of the register save area off the stack except the return address
  __ addptr(rsp, return_offset_in_bytes());
}

// Is the vector's size (in bytes) bigger than the size saved by default?
// 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
bool SharedRuntime::is_wide_vector(int size) {
  return size > 16;
}

size_t SharedRuntime::trampoline_size() {
  return 16;
}

void SharedRuntime::generate_trampoline(MacroAssembler *masm, address destination) {
  __ jump(RuntimeAddress(destination));
}

// The java_calling_convention describes stack locations as ideal slots on
// a frame with no abi restrictions. Since we must observe abi restrictions
// (like the placement of the register window) the slots must be biased by
// the following value.
static int reg2offset_in(VMReg r) {
  // Account for saved rbp and return address
  // This should really be in_preserve_stack_slots
  return (r->reg2stack() + 4) * VMRegImpl::stack_slot_size;
}
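// For example, the first incoming stack argument (reg2stack() == 0) is read
// from Address(rbp, 16), i.e. just above the saved rbp and the return address
// (two 8-byte words == four 4-byte stack slots).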

static int reg2offset_out(VMReg r) {
  return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
}

// ---------------------------------------------------------------------------
// Read the array of BasicTypes from a signature, and compute where the
// arguments should go.  Values in the VMRegPair regs array refer to 4-byte
// quantities.  Values less than VMRegImpl::stack0 are registers, those above
// refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
// as framesizes are fixed.
// VMRegImpl::stack0 refers to the first slot 0(sp), and VMRegImpl::stack0+1
// refers to the memory word 4 bytes higher.  Register values up to
// RegisterImpl::number_of_registers are the 64-bit integer registers.

// Note: the INPUTS in sig_bt are in units of Java argument words, which are
// either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
// units regardless of build. Of course for i486 there is no 64-bit build.

// The Java calling convention is a "shifted" version of the C ABI.
// By skipping the first C ABI register we can call non-static JNI methods
// with small numbers of arguments without having to shuffle the arguments
// at all. Since we control the Java ABI we ought to at least get some
// advantage out of it.
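// For example, a signature of (int, long, float) is laid out by
// java_calling_convention() below as: the int in INT_ArgReg[0], the long in
// INT_ArgReg[1] (its trailing T_VOID half is set_bad()), and the float in
// FP_ArgReg[0].  Only when the register arrays are exhausted do arguments
// spill to stack slots, two 4-byte slots (one 64-bit word) at a time.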

int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
                                           VMRegPair *regs,
                                           int total_args_passed,
                                           int is_outgoing) {

  // Create the mapping between argument positions and
  // registers.
  static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
    j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
    j_farg0, j_farg1, j_farg2, j_farg3,
    j_farg4, j_farg5, j_farg6, j_farg7
  };


  uint int_args = 0;
  uint fp_args = 0;
  uint stk_args = 0; // inc by 2 each time

  for (int i = 0; i < total_args_passed; i++) {
    switch (sig_bt[i]) {
    case T_BOOLEAN:
    case T_CHAR:
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      if (int_args < Argument::n_int_register_parameters_j) {
        regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_VOID:
      // halves of T_LONG or T_DOUBLE
      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
      regs[i].set_bad();
      break;
    case T_LONG:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      // fall through
    case T_OBJECT:
    case T_ARRAY:
    case T_ADDRESS:
      if (int_args < Argument::n_int_register_parameters_j) {
        regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_FLOAT:
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_DOUBLE:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    default:
      ShouldNotReachHere();
      break;
    }
  }

  return align_up(stk_args, 2);
}

// Patch the caller's callsite with the entry to compiled code, if it exists.
static void patch_callers_callsite(MacroAssembler *masm) {
  Label L;
  __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
  __ jcc(Assembler::equal, L);

  // Save the current stack pointer
  __ mov(r13, rsp);
  // Schedule the branch target address early.
  // Call into the VM to patch the caller, then jump to compiled callee
  // rax isn't live so capture the return address while we easily can
  __ movptr(rax, Address(rsp, 0));

  // align stack so push_CPU_state doesn't fault
  __ andptr(rsp, -(StackAlignmentInBytes));
  __ push_CPU_state();
  __ vzeroupper();
  // VM needs caller's callsite
  // VM needs target method
  // This needs to be a long call since we will relocate this adapter to
  // the codeBuffer and it may not reach

  // Allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }
  __ mov(c_rarg0, rbx);
  __ mov(c_rarg1, rax);
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));

  // De-allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

  __ vzeroupper();
  __ pop_CPU_state();
  // restore sp
  __ mov(rsp, r13);
  __ bind(L);
}


static void gen_c2i_adapter(MacroAssembler *masm,
                            int total_args_passed,
                            int comp_args_on_stack,
                            const BasicType *sig_bt,
                            const VMRegPair *regs,
                            Label& skip_fixup) {
  // Before we get into the guts of the C2I adapter, see if we should be here
  // at all.  We've come from compiled code and are attempting to jump to the
  // interpreter, which means the caller made a static call to get here
  // (vcalls always get a compiled target if there is one).  Check for a
  // compiled target.  If there is one, we need to patch the caller's call.
  patch_callers_callsite(masm);

  __ bind(skip_fixup);

  // Since all args are passed on the stack, total_args_passed *
  // Interpreter::stackElementSize is the space we need, plus one word for
  // the return address location, since we store it first rather than
  // holding it in rax across all the shuffling.

  int extraspace = (total_args_passed * Interpreter::stackElementSize) + wordSize;

  // stack is aligned, keep it that way
  extraspace = align_up(extraspace, 2*wordSize);
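
  // For example, with 4 incoming args: 4 * Interpreter::stackElementSize
  // (8 bytes each on x86_64) + wordSize for the return address = 40 bytes,
  // rounded up to 48 to keep 16-byte alignment.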

  // Get return address
  __ pop(rax);

  // set senderSP value
  __ mov(r13, rsp);

  __ subptr(rsp, extraspace);

  // Store the return address in the expected location
  __ movptr(Address(rsp, 0), rax);

  // Now write the args into the outgoing interpreter space
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // offset to start parameters
    int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
    int next_off = st_off - Interpreter::stackElementSize;

    // Say 4 args:
    // i   st_off
    // 0   32 T_LONG
    // 1   24 T_VOID
    // 2   16 T_OBJECT
    // 3    8 T_BOOL
    // -    0 return address
    //
    // However, to make things extra confusing: because we can fit a long/double
    // in a single slot on a 64-bit VM and it would be silly to break them up,
    // the interpreter leaves one slot empty and only stores to a single slot.
    // In this case the slot that is occupied is the T_VOID slot. See, I said it
    // was confusing.
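    // Concretely, for the T_LONG at i == 0 above, the 8-byte value is written
    // at next_off (24, the T_VOID slot) and the st_off slot (32) is left
    // unused (filled with known junk in debug builds).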

    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // memory to memory; use rax
      int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
      if (!r_2->is_valid()) {
        // sign extend??
        __ movl(rax, Address(rsp, ld_off));
        __ movptr(Address(rsp, st_off), rax);

      } else {

        __ movq(rax, Address(rsp, ld_off));

        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if (sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // ld_off == LSW, ld_off+wordSize == MSW
          // st_off == MSW, next_off == LSW
          __ movq(Address(rsp, next_off), rax);
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        } else {
          __ movq(Address(rsp, st_off), rax);
        }
      }
    } else if (r_1->is_Register()) {
      Register r = r_1->as_Register();
      if (!r_2->is_valid()) {
        // must be only an int (or smaller) so move only 32 bits to the slot
        // why not sign extend??
        __ movl(Address(rsp, st_off), r);
      } else {
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if (sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // long/double in gpr
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaab));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
          __ movq(Address(rsp, next_off), r);
        } else {
          __ movptr(Address(rsp, st_off), r);
        }
      }
    } else {
      assert(r_1->is_XMMRegister(), "");
      if (!r_2->is_valid()) {
        // only a float; use just part of the slot
        __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
      } else {
#ifdef ASSERT
        // Overwrite the unused slot with known junk
        __ mov64(rax, CONST64(0xdeadffffdeadaaac));
        __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
      }
    }
  }

  // Schedule the branch target address early.
  __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
  __ jmp(rcx);
}

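// Branches to L_ok if pc_reg lies strictly between code_start and code_end;
// otherwise control falls through (via L_fail), which the caller treats as
// the failure path.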
static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
                        address code_start, address code_end,
                        Label& L_ok) {
  Label L_fail;
  __ lea(temp_reg, ExternalAddress(code_start));
  __ cmpptr(pc_reg, temp_reg);
  __ jcc(Assembler::belowEqual, L_fail);
  __ lea(temp_reg, ExternalAddress(code_end));
  __ cmpptr(pc_reg, temp_reg);
  __ jcc(Assembler::below, L_ok);
  __ bind(L_fail);
}

void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
                                    int total_args_passed,
                                    int comp_args_on_stack,
                                    const BasicType *sig_bt,
                                    const VMRegPair *regs) {

  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do an i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args as
  // we must align the stack to 16 bytes on an i2c entry, else we
  // lose the alignment we expect in all compiled code, and the register
  // save code can segv when fxsave instructions find an improperly
  // aligned stack pointer.

  // Adapters can be frameless because they do not require the caller
  // to perform additional cleanup work, such as correcting the stack pointer.
  // An i2c adapter is frameless because the *caller* frame, which is interpreted,
  // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
  // even if a callee has modified the stack pointer.
  // A c2i adapter is frameless because the *callee* frame, which is interpreted,
  // routinely repairs its caller's stack pointer (from sender_sp, which is set
  // up via the senderSP register).
  // In other words, if *either* the caller or callee is interpreted, we can
  // get the stack pointer repaired after a call.
  // This is why c2i and i2c adapters cannot be indefinitely composed.
  // In particular, if a c2i adapter were to somehow call an i2c adapter,
  // both caller and callee would be compiled methods, and neither would
  // clean up the stack pointer changes performed by the two adapters.
  // If this happens, control eventually transfers back to the compiled
  // caller, but with an uncorrected stack, causing delayed havoc.

  // Pick up the return address
  __ movptr(rax, Address(rsp, 0));

  if (VerifyAdapterCalls &&
      (Interpreter::code() != NULL || StubRoutines::code1() != NULL)) {
    // So, let's test for cascading c2i/i2c adapters right now.
    //  assert(Interpreter::contains($return_addr) ||
    //         StubRoutines::contains($return_addr),
    //         "i2c adapter must return to an interpreter frame");
    __ block_comment("verify_i2c { ");
    Label L_ok;
    if (Interpreter::code() != NULL)
      range_check(masm, rax, r11,
                  Interpreter::code()->code_start(), Interpreter::code()->code_end(),
                  L_ok);
    if (StubRoutines::code1() != NULL)
      range_check(masm, rax, r11,
                  StubRoutines::code1()->code_begin(), StubRoutines::code1()->code_end(),
                  L_ok);
    if (StubRoutines::code2() != NULL)
      range_check(masm, rax, r11,
                  StubRoutines::code2()->code_begin(), StubRoutines::code2()->code_end(),
                  L_ok);
    const char* msg = "i2c adapter must return to an interpreter frame";
    __ block_comment(msg);
    __ stop(msg);
    __ bind(L_ok);
    __ block_comment("} verify_i2ce ");
  }

  // Must preserve original SP for loading incoming arguments because
  // we need to align the outgoing SP for compiled code.
  __ movptr(r11, rsp);

  // Cut-out for having no stack args.  Since up to 2 int/oop args are passed
  // in registers, we will occasionally have no stack args.
  int comp_words_on_stack = 0;
  if (comp_args_on_stack) {
    // Sig words on the stack are greater-than VMRegImpl::stack0.  Those in
    // registers are below.  By subtracting stack0, we either get a negative
    // number (all values in registers) or the maximum stack slot accessed.

    // Convert 4-byte c2 stack slots to words.
    comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
    // Round up to minimum stack alignment, in wordSize units
    comp_words_on_stack = align_up(comp_words_on_stack, 2);
    __ subptr(rsp, comp_words_on_stack * wordSize);
  }


  // Ensure compiled code always sees stack at proper alignment
  __ andptr(rsp, -16);

  // push the return address and misalign the stack so that the youngest frame
  // sees rsp just as it would after a call instruction
  __ push(rax);

  // Put saved SP in another register
  const Register saved_sp = rax;
  __ movptr(saved_sp, r11);

  // Will jump to the compiled code just as if compiled code was doing it.
  // Pre-load the register-jump target early, to schedule it better.
  __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));

#if INCLUDE_JVMCI
  if (EnableJVMCI || UseAOT) {
    // check if this call should be routed towards a specific entry point
    __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    Label no_alternative_target;
    __ jcc(Assembler::equal, no_alternative_target);
    __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
    __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    __ bind(no_alternative_target);
  }
#endif // INCLUDE_JVMCI

  // Now generate the shuffle code.  Pick up all register args and move the
  // rest through the floating point stack top.
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      // Longs and doubles are passed in native word order, but misaligned
      // in the 32-bit build.
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // Pick up 0, 1 or 2 words from SP+offset.

    assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
            "scrambled load targets?");
    // Load in argument order going down.
    int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
    // Point to interpreter value (vs. tag)
    int next_off = ld_off - Interpreter::stackElementSize;

    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // Convert stack slot to an SP offset (+ wordSize to account for return address)
      int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;

      // We can use r13 as a temp here because compiled code doesn't need r13 as an input,
      // and if we end up going through a c2i because of a miss, a reasonable value of r13
      // will be generated.
      if (!r_2->is_valid()) {
        // sign extend???
        __ movl(r13, Address(saved_sp, ld_off));
        __ movptr(Address(rsp, st_off), r13);
      } else {
        //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case.
        // So we must adjust where to pick up the data to match the interpreter.
        //
        // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
        // are accessed with negative offsets so the LSW is at the lower address.

        // ld_off is MSW so get LSW
        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
                           next_off : ld_off;
        __ movq(r13, Address(saved_sp, offset));
        // st_off is LSW (i.e. reg.first())
        __ movq(Address(rsp, st_off), r13);
      }
    } else if (r_1->is_Register()) {  // Register argument
      Register r = r_1->as_Register();
      assert(r != rax, "must be different");
      if (r_2->is_valid()) {
        //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case.
        // So we must adjust where to pick up the data to match the interpreter.

        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
                           next_off : ld_off;

        // this can be a misaligned move
        __ movq(r, Address(saved_sp, offset));
      } else {
        // sign extend and use a full word?
        __ movl(r, Address(saved_sp, ld_off));
      }
    } else {
      if (!r_2->is_valid()) {
        __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
      } else {
        __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
      }
    }
  }

  // 6243940 We might end up in handle_wrong_method if
  // the callee is deoptimized as we race through here. If that
  // happens we don't want to take a safepoint because the
  // caller frame will look interpreted and arguments are now
  // "compiled" so it is much better to make this transition
  // invisible to the stack walking code. Unfortunately if
  // we try and find the callee by normal means a safepoint
  // is possible. So we stash the desired callee in the thread
  // and the VM will find it there should this case occur.

  __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);

  // Put the Method* where a c2i would expect it should we end up there;
  // this is only needed because c2 resolve stubs return the Method* as a
  // result in rax.
  __ mov(rax, rbx);
  __ jmp(r11);
}

// ---------------------------------------------------------------
AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
                                                            int total_args_passed,
                                                            int comp_args_on_stack,
                                                            const BasicType *sig_bt,
                                                            const VMRegPair *regs,
                                                            AdapterFingerPrint* fingerprint) {
  address i2c_entry = __ pc();

  gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);

  // -------------------------------------------------------------------------
  // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
  // to the interpreter.  The args start out packed in the compiled layout.  They
  // need to be unpacked into the interpreter layout.  This will almost always
  // require some stack space.  We grow the current (compiled) stack, then repack
  // the args.  We finally end in a jump to the generic interpreter entry point.
  // On exit from the interpreter, the interpreter will restore our SP (lest the
  // compiled code, which relies solely on SP and not RBP, get sick).

  address c2i_unverified_entry = __ pc();
  Label skip_fixup;
  Label ok;

  Register holder = rax;
  Register receiver = j_rarg0;
  Register temp = rbx;

  {
    __ load_klass(temp, receiver);
    __ cmpptr(temp, Address(holder, CompiledICHolder::holder_klass_offset()));
    __ movptr(rbx, Address(holder, CompiledICHolder::holder_method_offset()));
    __ jcc(Assembler::equal, ok);
    __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));

    __ bind(ok);
    // The method might have been compiled since the call site was patched to
    // interpreted; if that is the case treat it as a miss so we can get
    // the call site corrected.
    __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
    __ jcc(Assembler::equal, skip_fixup);
    __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
  }

  address c2i_entry = __ pc();

  gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);

  __ flush();
  return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry);
}

int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
                                         VMRegPair *regs,
                                         VMRegPair *regs2,
                                         int total_args_passed) {
  assert(regs2 == NULL, "not needed on x86");
// We return the number of VMRegImpl stack slots we need to reserve for all
// the arguments, NOT counting out_preserve_stack_slots.

// NOTE: These arrays will have to change when c1 is ported
#ifdef _WIN64
    static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
      c_rarg0, c_rarg1, c_rarg2, c_rarg3
    };
    static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
      c_farg0, c_farg1, c_farg2, c_farg3
    };
#else
    static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
      c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
    };
    static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
      c_farg0, c_farg1, c_farg2, c_farg3,
      c_farg4, c_farg5, c_farg6, c_farg7
    };
#endif // _WIN64


    uint int_args = 0;
    uint fp_args = 0;
    uint stk_args = 0; // inc by 2 each time

    for (int i = 0; i < total_args_passed; i++) {
      switch (sig_bt[i]) {
      case T_BOOLEAN:
      case T_CHAR:
      case T_BYTE:
      case T_SHORT:
      case T_INT:
        if (int_args < Argument::n_int_register_parameters_c) {
          regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
          fp_args++;
          // Allocate slots for the callee to stuff register args on the stack.
          stk_args += 2;
#endif
        } else {
          regs[i].set1(VMRegImpl::stack2reg(stk_args));
          stk_args += 2;
        }
        break;
      case T_LONG:
        assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
        // fall through
      case T_OBJECT:
      case T_ARRAY:
      case T_ADDRESS:
      case T_METADATA:
        if (int_args < Argument::n_int_register_parameters_c) {
          regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
          fp_args++;
          stk_args += 2;
#endif
        } else {
          regs[i].set2(VMRegImpl::stack2reg(stk_args));
          stk_args += 2;
        }
        break;
      case T_FLOAT:
        if (fp_args < Argument::n_float_register_parameters_c) {
          regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
          int_args++;
          // Allocate slots for the callee to stuff register args on the stack.
          stk_args += 2;
#endif
        } else {
          regs[i].set1(VMRegImpl::stack2reg(stk_args));
          stk_args += 2;
        }
        break;
      case T_DOUBLE:
        assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
        if (fp_args < Argument::n_float_register_parameters_c) {
          regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
          int_args++;
          // Allocate slots for the callee to stuff register args on the stack.
          stk_args += 2;
#endif
        } else {
          regs[i].set2(VMRegImpl::stack2reg(stk_args));
          stk_args += 2;
        }
        break;
      case T_VOID: // Halves of longs and doubles
        assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
        regs[i].set_bad();
        break;
      default:
        ShouldNotReachHere();
        break;
      }
    }
#ifdef _WIN64
  // The Windows ABI requires that we always allocate enough stack space
  // for 4 64-bit registers to be stored down.
  if (stk_args < 8) {
    stk_args = 8;
  }
#endif // _WIN64

  return stk_args;
}
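
// For example (illustrative only), a native signature of (int, double, int)
// is assigned:
//   System V AMD64: c_rarg0, c_farg0, c_rarg1
//   Windows x64:    c_rarg0, c_farg1, c_rarg2
// because on Windows the argument slots are positional: each register
// argument also consumes a slot in the other register bank and reserves
// home space on the stack.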

// On 64-bit we will store integer-like items to the stack as 64-bit items
// (SPARC ABI) even though Java would only store 32 bits for a parameter.
// On 32-bit it will simply be 32 bits, so this routine will do 32->32 on
// 32-bit and 32->64 on 64-bit.
static void move32_64(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
  if (src.first()->is_stack()) {
    if (dst.first()->is_stack()) {
      // stack to stack
      __ movslq(rax, Address(rbp, reg2offset_in(src.first())));
      __ movq(Address(rsp, reg2offset_out(dst.first())), rax);
    } else {
      // stack to reg
      __ movslq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first())));
    }
  } else if (dst.first()->is_stack()) {
    // reg to stack
    // Do we really have to sign extend???
    // __ movslq(src.first()->as_Register(), src.first()->as_Register());
    __ movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
  } else {
    // Do we really have to sign extend???
    // __ movslq(dst.first()->as_Register(), src.first()->as_Register());
    if (dst.first() != src.first()) {
      __ movq(dst.first()->as_Register(), src.first()->as_Register());
    }
  }
}

static void move_ptr(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
  if (src.first()->is_stack()) {
    if (dst.first()->is_stack()) {
      // stack to stack
      __ movq(rax, Address(rbp, reg2offset_in(src.first())));
      __ movq(Address(rsp, reg2offset_out(dst.first())), rax);
    } else {
      // stack to reg
      __ movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first())));
    }
  } else if (dst.first()->is_stack()) {
    // reg to stack
    __ movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
  } else {
    if (dst.first() != src.first()) {
      __ movq(dst.first()->as_Register(), src.first()->as_Register());
    }
  }
}

// An oop arg. Must pass a handle, not the oop itself.
static void object_move(MacroAssembler* masm,
                        OopMap* map,
                        int oop_handle_offset,
                        int framesize_in_slots,
                        VMRegPair src,
                        VMRegPair dst,
                        bool is_receiver,
                        int* receiver_offset) {

  // Must pass a handle. First figure out the location we use as a handle.

  Register rHandle = dst.first()->is_stack() ? rax : dst.first()->as_Register();

  // See if the oop is NULL; if it is we need no handle

  if (src.first()->is_stack()) {

    // Oop is already on the stack as an argument
    int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
    map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
    if (is_receiver) {
      *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
    }

    __ cmpptr(Address(rbp, reg2offset_in(src.first())), (int32_t)NULL_WORD);
    __ lea(rHandle, Address(rbp, reg2offset_in(src.first())));
    // conditionally move a NULL
    __ cmovptr(Assembler::equal, rHandle, Address(rbp, reg2offset_in(src.first())));
  } else {

    // Oop is in a register; we must store it to the space we reserve
    // on the stack for oop_handles and pass a handle if the oop is non-NULL

    const Register rOop = src.first()->as_Register();
    int oop_slot;
    if (rOop == j_rarg0)
      oop_slot = 0;
    else if (rOop == j_rarg1)
      oop_slot = 1;
    else if (rOop == j_rarg2)
      oop_slot = 2;
    else if (rOop == j_rarg3)
      oop_slot = 3;
    else if (rOop == j_rarg4)
      oop_slot = 4;
    else {
      assert(rOop == j_rarg5, "wrong register");
      oop_slot = 5;
    }

    oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
    int offset = oop_slot*VMRegImpl::stack_slot_size;

    map->set_oop(VMRegImpl::stack2reg(oop_slot));
    // Store oop in handle area, may be NULL
    __ movptr(Address(rsp, offset), rOop);
    if (is_receiver) {
      *receiver_offset = offset;
    }

    __ cmpptr(rOop, (int32_t)NULL_WORD);
    __ lea(rHandle, Address(rsp, offset));
    // conditionally move a NULL from the handle area where it was just stored
    __ cmovptr(Assembler::equal, rHandle, Address(rsp, offset));
  }

  // If the arg is on the stack then place it, otherwise it is already in the correct reg.
  if (dst.first()->is_stack()) {
    __ movptr(Address(rsp, reg2offset_out(dst.first())), rHandle);
  }
}

// A float arg may have to do float reg to int reg conversion
static void float_move(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
  assert(!src.second()->is_valid() && !dst.second()->is_valid(), "bad float_move");

  // The calling convention assures us that each VMRegPair is either
  // really one physical register or adjacent stack slots.
  // This greatly simplifies the cases here compared to sparc.

  if (src.first()->is_stack()) {
    if (dst.first()->is_stack()) {
      __ movl(rax, Address(rbp, reg2offset_in(src.first())));
      __ movptr(Address(rsp, reg2offset_out(dst.first())), rax);
    } else {
      // stack to reg
      assert(dst.first()->is_XMMRegister(), "only expect xmm registers as parameters");
      __ movflt(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first())));
    }
  } else if (dst.first()->is_stack()) {
    // reg to stack
    assert(src.first()->is_XMMRegister(), "only expect xmm registers as parameters");
    __ movflt(Address(rsp, reg2offset_out(dst.first())), src.first()->as_XMMRegister());
  } else {
    // reg to reg
    // In theory these overlap but the ordering is such that this is likely a nop
    if (src.first() != dst.first()) {
      __ movdbl(dst.first()->as_XMMRegister(),  src.first()->as_XMMRegister());
    }
  }
}

// A long move
static void long_move(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {

  // The calling convention assures us that each VMRegPair is either
  // really one physical register or adjacent stack slots.
  // This greatly simplifies the cases here compared to sparc.

  if (src.is_single_phys_reg() ) {
    if (dst.is_single_phys_reg()) {
      if (dst.first() != src.first()) {
        __ mov(dst.first()->as_Register(), src.first()->as_Register());
      }
    } else {
      assert(dst.is_single_reg(), "not a stack pair");
      __ movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
    }
  } else if (dst.is_single_phys_reg()) {
    assert(src.is_single_reg(),  "not a stack pair");
    __ movq(dst.first()->as_Register(), Address(rbp, reg2offset_out(src.first())));
  } else {
    assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
    __ movq(rax, Address(rbp, reg2offset_in(src.first())));
    __ movq(Address(rsp, reg2offset_out(dst.first())), rax);
  }
}

// A double move
static void double_move(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {

  // The calling convention assures us that each VMRegPair is either
  // really one physical register or adjacent stack slots.
  // This greatly simplifies the cases here compared to sparc.

  if (src.is_single_phys_reg() ) {
    if (dst.is_single_phys_reg()) {
      // In theory these overlap but the ordering is such that this is likely a nop
      if (src.first() != dst.first()) {
        __ movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
      }
    } else {
      assert(dst.is_single_reg(), "not a stack pair");
      __ movdbl(Address(rsp, reg2offset_out(dst.first())), src.first()->as_XMMRegister());
    }
  } else if (dst.is_single_phys_reg()) {
    assert(src.is_single_reg(),  "not a stack pair");
    __ movdbl(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_out(src.first())));
  } else {
    assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
    __ movq(rax, Address(rbp, reg2offset_in(src.first())));
    __ movq(Address(rsp, reg2offset_out(dst.first())), rax);
  }
}


void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the
  // frame pointer, which by this time is free to use
  switch (ret_type) {
  case T_FLOAT:
    __ movflt(Address(rbp, -wordSize), xmm0);
    break;
  case T_DOUBLE:
    __ movdbl(Address(rbp, -wordSize), xmm0);
    break;
  case T_VOID:  break;
  default: {
    __ movptr(Address(rbp, -wordSize), rax);
    }
  }
}

void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the
  // frame pointer, which by this time is free to use
  switch (ret_type) {
  case T_FLOAT:
    __ movflt(xmm0, Address(rbp, -wordSize));
    break;
  case T_DOUBLE:
    __ movdbl(xmm0, Address(rbp, -wordSize));
    break;
  case T_VOID:  break;
  default: {
    __ movptr(rax, Address(rbp, -wordSize));
    }
  }
}

static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
    for ( int i = first_arg ; i < arg_count ; i++ ) {
      if (args[i].first()->is_Register()) {
        __ push(args[i].first()->as_Register());
      } else if (args[i].first()->is_XMMRegister()) {
        __ subptr(rsp, 2*wordSize);
        __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
      }
    }
}

static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
    for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
      if (args[i].first()->is_Register()) {
        __ pop(args[i].first()->as_Register());
      } else if (args[i].first()->is_XMMRegister()) {
        __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
        __ addptr(rsp, 2*wordSize);
      }
    }
}
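
// Note: save_args() and restore_args() must stay symmetric: arguments are
// pushed in increasing index order and popped in decreasing order, and an XMM
// argument always takes a full 16 bytes of stack, so the two walks stay in step.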
1347
1348
1349static void save_or_restore_arguments(MacroAssembler* masm,
1350                                      const int stack_slots,
1351                                      const int total_in_args,
1352                                      const int arg_save_area,
1353                                      OopMap* map,
1354                                      VMRegPair* in_regs,
1355                                      BasicType* in_sig_bt) {
1356  // if map is non-NULL then the code should store the values,
1357  // otherwise it should load them.
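  // E.g. check_needs_gc_for_critical_native() below calls this twice: once with
  // a fresh OopMap to spill the register arguments before blocking for the GC,
  // and once with a NULL map to reload them afterwards.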
1358  int slot = arg_save_area;
1359  // Save down double word first
1360  for ( int i = 0; i < total_in_args; i++) {
1361    if (in_regs[i].first()->is_XMMRegister() && in_sig_bt[i] == T_DOUBLE) {
1362      int offset = slot * VMRegImpl::stack_slot_size;
1363      slot += VMRegImpl::slots_per_word;
1364      assert(slot <= stack_slots, "overflow");
1365      if (map != NULL) {
1366        __ movdbl(Address(rsp, offset), in_regs[i].first()->as_XMMRegister());
1367      } else {
1368        __ movdbl(in_regs[i].first()->as_XMMRegister(), Address(rsp, offset));
1369      }
1370    }
1371    if (in_regs[i].first()->is_Register() &&
1372        (in_sig_bt[i] == T_LONG || in_sig_bt[i] == T_ARRAY)) {
1373      int offset = slot * VMRegImpl::stack_slot_size;
1374      if (map != NULL) {
1375        __ movq(Address(rsp, offset), in_regs[i].first()->as_Register());
1376        if (in_sig_bt[i] == T_ARRAY) {
1377          map->set_oop(VMRegImpl::stack2reg(slot));
1378        }
1379      } else {
1380        __ movq(in_regs[i].first()->as_Register(), Address(rsp, offset));
1381      }
1382      slot += VMRegImpl::slots_per_word;
1383    }
1384  }
1385  // Save or restore single word registers
1386  for ( int i = 0; i < total_in_args; i++) {
1387    if (in_regs[i].first()->is_Register()) {
1388      int offset = slot * VMRegImpl::stack_slot_size;
1389      slot++;
1390      assert(slot <= stack_slots, "overflow");
1391
1392      // Value is in an input register, so we must flush it to the stack
1393      const Register reg = in_regs[i].first()->as_Register();
1394      switch (in_sig_bt[i]) {
1395        case T_BOOLEAN:
1396        case T_CHAR:
1397        case T_BYTE:
1398        case T_SHORT:
1399        case T_INT:
1400          if (map != NULL) {
1401            __ movl(Address(rsp, offset), reg);
1402          } else {
1403            __ movl(reg, Address(rsp, offset));
1404          }
1405          break;
1406        case T_ARRAY:
1407        case T_LONG:
1408          // handled above
1409          break;
1410        case T_OBJECT:
1411        default: ShouldNotReachHere();
1412      }
1413    } else if (in_regs[i].first()->is_XMMRegister()) {
1414      if (in_sig_bt[i] == T_FLOAT) {
1415        int offset = slot * VMRegImpl::stack_slot_size;
1416        slot++;
1417        assert(slot <= stack_slots, "overflow");
1418        if (map != NULL) {
1419          __ movflt(Address(rsp, offset), in_regs[i].first()->as_XMMRegister());
1420        } else {
1421          __ movflt(in_regs[i].first()->as_XMMRegister(), Address(rsp, offset));
1422        }
1423      }
1424    } else if (in_regs[i].first()->is_stack()) {
1425      if (in_sig_bt[i] == T_ARRAY && map != NULL) {
1426        int offset_in_older_frame = in_regs[i].first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
1427        map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + stack_slots));
1428      }
1429    }
1430  }
1431}
1432
1433
1434// Check GCLocker::needs_gc and enter the runtime if it's true.  This
1435// keeps a new JNI critical region from starting until a GC has been
1436// forced.  Save down any oops in registers and describe them in an
1437// OopMap.
1438static void check_needs_gc_for_critical_native(MacroAssembler* masm,
1439                                               int stack_slots,
1440                                               int total_c_args,
1441                                               int total_in_args,
1442                                               int arg_save_area,
1443                                               OopMapSet* oop_maps,
1444                                               VMRegPair* in_regs,
1445                                               BasicType* in_sig_bt) {
1446  __ block_comment("check GCLocker::needs_gc");
1447  Label cont;
1448  __ cmp8(ExternalAddress((address)GCLocker::needs_gc_address()), false);
1449  __ jcc(Assembler::equal, cont);
1450
1451  // Save down any incoming oops and call into the runtime to halt for a GC
1452
1453  OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1454  save_or_restore_arguments(masm, stack_slots, total_in_args,
1455                            arg_save_area, map, in_regs, in_sig_bt);
1456
1457  address the_pc = __ pc();
1458  oop_maps->add_gc_map( __ offset(), map);
1459  __ set_last_Java_frame(rsp, noreg, the_pc);
1460
1461  __ block_comment("block_for_jni_critical");
1462  __ movptr(c_rarg0, r15_thread);
1463  __ mov(r12, rsp); // remember sp
1464  __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
1465  __ andptr(rsp, -16); // align stack as required by ABI
1466  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::block_for_jni_critical)));
1467  __ mov(rsp, r12); // restore sp
1468  __ reinit_heapbase();
1469
1470  __ reset_last_Java_frame(false);
1471
1472  save_or_restore_arguments(masm, stack_slots, total_in_args,
1473                            arg_save_area, NULL, in_regs, in_sig_bt);
1474  __ bind(cont);
1475#ifdef ASSERT
1476  if (StressCriticalJNINatives) {
1477    // Stress register saving
1478    OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1479    save_or_restore_arguments(masm, stack_slots, total_in_args,
1480                              arg_save_area, map, in_regs, in_sig_bt);
1481    // Destroy argument registers
1482    for (int i = 0; i < total_in_args - 1; i++) {
1483      if (in_regs[i].first()->is_Register()) {
1484        const Register reg = in_regs[i].first()->as_Register();
1485        __ xorptr(reg, reg);
1486      } else if (in_regs[i].first()->is_XMMRegister()) {
1487        __ xorpd(in_regs[i].first()->as_XMMRegister(), in_regs[i].first()->as_XMMRegister());
1488      } else if (in_regs[i].first()->is_FloatRegister()) {
1489        ShouldNotReachHere();
1490      } else if (in_regs[i].first()->is_stack()) {
1491        // Nothing to do
1492      } else {
1493        ShouldNotReachHere();
1494      }
1495      if (in_sig_bt[i] == T_LONG || in_sig_bt[i] == T_DOUBLE) {
1496        i++;
1497      }
1498    }
1499
1500    save_or_restore_arguments(masm, stack_slots, total_in_args,
1501                              arg_save_area, NULL, in_regs, in_sig_bt);
1502  }
1503#endif
1504}
1505
1506// Unpack an array argument into a pointer to the body and the length
1507// if the array is non-null, otherwise pass 0 for both.
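// For example (illustrative): for a byte[] parameter of a critical native, the
// callee receives a (jint length, jbyte* body) pair; body_arg gets the address
// of the first element, length_arg gets the array length, and both get 0 when
// the array reference is NULL.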
1508static void unpack_array_argument(MacroAssembler* masm, VMRegPair reg, BasicType in_elem_type, VMRegPair body_arg, VMRegPair length_arg) {
1509  Register tmp_reg = rax;
1510  assert(!body_arg.first()->is_Register() || body_arg.first()->as_Register() != tmp_reg,
1511         "possible collision");
1512  assert(!length_arg.first()->is_Register() || length_arg.first()->as_Register() != tmp_reg,
1513         "possible collision");
1514
1515  __ block_comment("unpack_array_argument {");
1516
1517  // Pass the length, ptr pair
1518  Label is_null, done;
1519  VMRegPair tmp;
1520  tmp.set_ptr(tmp_reg->as_VMReg());
1521  if (reg.first()->is_stack()) {
1522    // Load the arg up from the stack
1523    move_ptr(masm, reg, tmp);
1524    reg = tmp;
1525  }
1526  __ testptr(reg.first()->as_Register(), reg.first()->as_Register());
1527  __ jccb(Assembler::equal, is_null);
1528  __ lea(tmp_reg, Address(reg.first()->as_Register(), arrayOopDesc::base_offset_in_bytes(in_elem_type)));
1529  move_ptr(masm, tmp, body_arg);
1530  // load the length relative to the body.
1531  __ movl(tmp_reg, Address(tmp_reg, arrayOopDesc::length_offset_in_bytes() -
1532                           arrayOopDesc::base_offset_in_bytes(in_elem_type)));
1533  move32_64(masm, tmp, length_arg);
1534  __ jmpb(done);
1535  __ bind(is_null);
1536  // Pass zeros
1537  __ xorptr(tmp_reg, tmp_reg);
1538  move_ptr(masm, tmp, body_arg);
1539  move32_64(masm, tmp, length_arg);
1540  __ bind(done);
1541
1542  __ block_comment("} unpack_array_argument");
1543}
1544
1545
1546// Different signatures may require very different orders for the move
1547// to avoid clobbering other arguments.  There's no simple way to
1548// order them safely.  Compute a safe order for issuing stores and
1549// break any cycles in those stores.  This code is fairly general but
1550// it's not necessary on the other platforms so we keep it in the
1551// platform dependent code instead of moving it into a shared file.
1552// (See bugs 7013347 & 7145024.)
1553// Note that this code is specific to LP64.
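// A small worked example of the cycle case (registers are illustrative): if one
// argument must move rdi -> rsi while another must move rsi -> rdi, neither
// store can be issued first.  get_store_order() detects the cycle and
// break_cycle() routes it through the temp register (rbx in the caller below),
// yielding a safe sequence such as rsi -> rbx, rdi -> rsi, rbx -> rdi.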
1554class ComputeMoveOrder: public StackObj {
1555  class MoveOperation: public ResourceObj {
1556    friend class ComputeMoveOrder;
1557   private:
1558    VMRegPair        _src;
1559    VMRegPair        _dst;
1560    int              _src_index;
1561    int              _dst_index;
1562    bool             _processed;
1563    MoveOperation*  _next;
1564    MoveOperation*  _prev;
1565
1566    static int get_id(VMRegPair r) {
1567      return r.first()->value();
1568    }
1569
1570   public:
1571    MoveOperation(int src_index, VMRegPair src, int dst_index, VMRegPair dst):
1572      _src(src)
1573    , _dst(dst)
1574    , _src_index(src_index)
1575    , _dst_index(dst_index)
1576    , _processed(false)
1577    , _next(NULL)
1578    , _prev(NULL) {
1579    }
1580
1581    VMRegPair src() const              { return _src; }
1582    int src_id() const                 { return get_id(src()); }
1583    int src_index() const              { return _src_index; }
1584    VMRegPair dst() const              { return _dst; }
1585    void set_dst(int i, VMRegPair dst) { _dst_index = i; _dst = dst; }
1586    int dst_index() const              { return _dst_index; }
1587    int dst_id() const                 { return get_id(dst()); }
1588    MoveOperation* next() const       { return _next; }
1589    MoveOperation* prev() const       { return _prev; }
1590    void set_processed()               { _processed = true; }
1591    bool is_processed() const          { return _processed; }
1592
1593    // Break a cycle by routing this store through the temp register.
1594    void break_cycle(VMRegPair temp_register) {
1595      // create a new store following the last store
1596      // to move from the temp_register to the original
1597      MoveOperation* new_store = new MoveOperation(-1, temp_register, dst_index(), dst());
1598
1599      // break the cycle of links and insert new_store at the end
1600      // break the reverse link.
1601      MoveOperation* p = prev();
1602      assert(p->next() == this, "must be");
1603      _prev = NULL;
1604      p->_next = new_store;
1605      new_store->_prev = p;
1606
1607      // change the original store to save its value in the temp.
1608      set_dst(-1, temp_register);
1609    }
1610
1611    void link(GrowableArray<MoveOperation*>& killer) {
1612      // link this store in front of the store that kills its source value
1613      MoveOperation* n = killer.at_grow(src_id(), NULL);
1614      if (n != NULL) {
1615        assert(_next == NULL && n->_prev == NULL, "shouldn't have been set yet");
1616        _next = n;
1617        n->_prev = this;
1618      }
1619    }
1620  };
1621
1622 private:
1623  GrowableArray<MoveOperation*> edges;
1624
1625 public:
1626  ComputeMoveOrder(int total_in_args, VMRegPair* in_regs, int total_c_args, VMRegPair* out_regs,
1627                    BasicType* in_sig_bt, GrowableArray<int>& arg_order, VMRegPair tmp_vmreg) {
1628    // Move operations where the dest is the stack can all be
1629    // scheduled first since they can't interfere with the other moves.
1630    for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
1631      if (in_sig_bt[i] == T_ARRAY) {
1632        c_arg--;
1633        if (out_regs[c_arg].first()->is_stack() &&
1634            out_regs[c_arg + 1].first()->is_stack()) {
1635          arg_order.push(i);
1636          arg_order.push(c_arg);
1637        } else {
1638          if (out_regs[c_arg].first()->is_stack() ||
1639              in_regs[i].first() == out_regs[c_arg].first()) {
1640            add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg + 1]);
1641          } else {
1642            add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
1643          }
1644        }
1645      } else if (in_sig_bt[i] == T_VOID) {
1646        arg_order.push(i);
1647        arg_order.push(c_arg);
1648      } else {
1649        if (out_regs[c_arg].first()->is_stack() ||
1650            in_regs[i].first() == out_regs[c_arg].first()) {
1651          arg_order.push(i);
1652          arg_order.push(c_arg);
1653        } else {
1654          add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
1655        }
1656      }
1657    }
1658    // Break any cycles in the register moves and emit them in the
1659    // proper order.
1660    GrowableArray<MoveOperation*>* stores = get_store_order(tmp_vmreg);
1661    for (int i = 0; i < stores->length(); i++) {
1662      arg_order.push(stores->at(i)->src_index());
1663      arg_order.push(stores->at(i)->dst_index());
1664    }
1665 }
1666
1667  // Collect all the move operations
1668  void add_edge(int src_index, VMRegPair src, int dst_index, VMRegPair dst) {
1669    if (src.first() == dst.first()) return;
1670    edges.append(new MoveOperation(src_index, src, dst_index, dst));
1671  }
1672
1673  // Walk the edges breaking cycles between moves.  The result list
1674  // can be walked in order to produce the proper set of loads
1675  GrowableArray<MoveOperation*>* get_store_order(VMRegPair temp_register) {
1676    // Record which moves kill which values
1677    GrowableArray<MoveOperation*> killer;
1678    for (int i = 0; i < edges.length(); i++) {
1679      MoveOperation* s = edges.at(i);
1680      assert(killer.at_grow(s->dst_id(), NULL) == NULL, "only one killer");
1681      killer.at_put_grow(s->dst_id(), s, NULL);
1682    }
1683    assert(killer.at_grow(MoveOperation::get_id(temp_register), NULL) == NULL,
1684           "make sure temp isn't in the registers that are killed");
1685
1686    // create links between loads and stores
1687    for (int i = 0; i < edges.length(); i++) {
1688      edges.at(i)->link(killer);
1689    }
1690
1691    // at this point, all the move operations are chained together
1692    // in a doubly linked list.  Processing it backwards finds
1693    // the beginning of the chain, forwards finds the end.  If there's
1694    // a cycle it can be broken at any point,  so pick an edge and walk
1695    // backward until the list ends or we end where we started.
1696    GrowableArray<MoveOperation*>* stores = new GrowableArray<MoveOperation*>();
1697    for (int e = 0; e < edges.length(); e++) {
1698      MoveOperation* s = edges.at(e);
1699      if (!s->is_processed()) {
1700        MoveOperation* start = s;
1701        // search for the beginning of the chain or cycle
1702        while (start->prev() != NULL && start->prev() != s) {
1703          start = start->prev();
1704        }
1705        if (start->prev() == s) {
1706          start->break_cycle(temp_register);
1707        }
1708        // walk the chain forward inserting to store list
1709        while (start != NULL) {
1710          stores->append(start);
1711          start->set_processed();
1712          start = start->next();
1713        }
1714      }
1715    }
1716    return stores;
1717  }
1718};
1719
1720static void verify_oop_args(MacroAssembler* masm,
1721                            const methodHandle& method,
1722                            const BasicType* sig_bt,
1723                            const VMRegPair* regs) {
1724  Register temp_reg = rbx;  // not part of any compiled calling seq
1725  if (VerifyOops) {
1726    for (int i = 0; i < method->size_of_parameters(); i++) {
1727      if (sig_bt[i] == T_OBJECT ||
1728          sig_bt[i] == T_ARRAY) {
1729        VMReg r = regs[i].first();
1730        assert(r->is_valid(), "bad oop arg");
1731        if (r->is_stack()) {
1732          __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1733          __ verify_oop(temp_reg);
1734        } else {
1735          __ verify_oop(r->as_Register());
1736        }
1737      }
1738    }
1739  }
1740}
1741
1742static void gen_special_dispatch(MacroAssembler* masm,
1743                                 const methodHandle& method,
1744                                 const BasicType* sig_bt,
1745                                 const VMRegPair* regs) {
1746  verify_oop_args(masm, method, sig_bt, regs);
1747  vmIntrinsics::ID iid = method->intrinsic_id();
1748
1749  // Now write the args into the outgoing interpreter space
1750  bool     has_receiver   = false;
1751  Register receiver_reg   = noreg;
1752  int      member_arg_pos = -1;
1753  Register member_reg     = noreg;
1754  int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1755  if (ref_kind != 0) {
1756    member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1757    member_reg = rbx;  // known to be free at this point
1758    has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1759  } else if (iid == vmIntrinsics::_invokeBasic) {
1760    has_receiver = true;
1761  } else {
1762    fatal("unexpected intrinsic id %d", iid);
1763  }
1764
1765  if (member_reg != noreg) {
1766    // Load the member_arg into register, if necessary.
1767    SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1768    VMReg r = regs[member_arg_pos].first();
1769    if (r->is_stack()) {
1770      __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1771    } else {
1772      // no data motion is needed
1773      member_reg = r->as_Register();
1774    }
1775  }
1776
1777  if (has_receiver) {
1778    // Make sure the receiver is loaded into a register.
1779    assert(method->size_of_parameters() > 0, "oob");
1780    assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1781    VMReg r = regs[0].first();
1782    assert(r->is_valid(), "bad receiver arg");
1783    if (r->is_stack()) {
1784      // Porting note:  This assumes that compiled calling conventions always
1785      // pass the receiver oop in a register.  If this is not true on some
1786      // platform, pick a temp and load the receiver from stack.
1787      fatal("receiver always in a register");
1788      receiver_reg = j_rarg0;  // known to be free at this point
1789      __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1790    } else {
1791      // no data motion is needed
1792      receiver_reg = r->as_Register();
1793    }
1794  }
1795
1796  // Figure out which address we are really jumping to:
1797  MethodHandles::generate_method_handle_dispatch(masm, iid,
1798                                                 receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1799}
1800
1801// ---------------------------------------------------------------------------
1802// Generate a native wrapper for a given method.  The method takes arguments
1803// in the Java compiled code convention, marshals them to the native
1804// convention (handlizes oops, etc), transitions to native, makes the call,
1805// returns to java state (possibly blocking), unhandlizes any result and
1806// returns.
1807//
1808// Critical native functions are a shorthand for the use of
1809// GetPrimitiveArrayCritical and disallow the use of any other JNI
1810// functions.  The wrapper is expected to unpack the arguments before
1811// passing them to the callee and perform checks before and after the
1812// native call to ensure that the GCLocker
1813// lock_critical/unlock_critical semantics are followed.  Some other
1814// parts of JNI setup are skipped, like the tear down of the JNI handle
1815// block and the check for pending exceptions, since it's impossible for
1816// them to be thrown.
1817//
1818// They are roughly structured like this:
1819//    if (GCLocker::needs_gc())
1820//      SharedRuntime::block_for_jni_critical();
1821//    transition to thread_in_native
1822//    unpack array arguments and call native entry point
1823//    check for safepoint in progress
1824//    check if any thread suspend flags are set
1825//      call into JVM and possibly unlock the JNI critical lock
1826//      if a GC was suppressed while in the critical native.
1827//    transition back to thread_in_Java
1828//    return to caller
1829//
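// As a hedged illustration (names hypothetical), a critical counterpart of
//    static native int sum(byte[] a);
// would be called through method->critical_native_function() with a C signature
// along the lines of
//    jint sum_critical(jint a_len, jbyte* a);
// i.e. no JNIEnv* or jclass, and every array expanded into a length/body pair
// by the signature rewriting below.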
1830nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1831                                                const methodHandle& method,
1832                                                int compile_id,
1833                                                BasicType* in_sig_bt,
1834                                                VMRegPair* in_regs,
1835                                                BasicType ret_type) {
1836  if (method->is_method_handle_intrinsic()) {
1837    vmIntrinsics::ID iid = method->intrinsic_id();
1838    intptr_t start = (intptr_t)__ pc();
1839    int vep_offset = ((intptr_t)__ pc()) - start;
1840    gen_special_dispatch(masm,
1841                         method,
1842                         in_sig_bt,
1843                         in_regs);
1844    int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
1845    __ flush();
1846    int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
1847    return nmethod::new_native_nmethod(method,
1848                                       compile_id,
1849                                       masm->code(),
1850                                       vep_offset,
1851                                       frame_complete,
1852                                       stack_slots / VMRegImpl::slots_per_word,
1853                                       in_ByteSize(-1),
1854                                       in_ByteSize(-1),
1855                                       (OopMapSet*)NULL);
1856  }
1857  bool is_critical_native = true;
1858  address native_func = method->critical_native_function();
1859  if (native_func == NULL) {
1860    native_func = method->native_function();
1861    is_critical_native = false;
1862  }
1863  assert(native_func != NULL, "must have function");
1864
1865  // An OopMap for lock (and class if static)
1866  OopMapSet *oop_maps = new OopMapSet();
1867  intptr_t start = (intptr_t)__ pc();
1868
1869  // We have received a description of where all the java args are located
1870  // on entry to the wrapper. We need to convert these args to where
1871  // the jni function will expect them. To figure out where they go
1872  // we convert the java signature to a C signature by inserting
1873  // the hidden arguments as arg[0] and possibly arg[1] (static method)
1874
1875  const int total_in_args = method->size_of_parameters();
1876  int total_c_args = total_in_args;
1877  if (!is_critical_native) {
1878    total_c_args += 1;
1879    if (method->is_static()) {
1880      total_c_args++;
1881    }
1882  } else {
1883    for (int i = 0; i < total_in_args; i++) {
1884      if (in_sig_bt[i] == T_ARRAY) {
1885        total_c_args++;
1886      }
1887    }
1888  }
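  // E.g. a normal static native with two Java parameters gets JNIEnv* and the
  // class mirror prepended, so total_c_args = 2 + 2 = 4; a critical native
  // taking (byte[], int) instead expands the array into a length/body pair,
  // so total_c_args = 2 + 1 = 3.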
1889
1890  BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1891  VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1892  BasicType* in_elem_bt = NULL;
1893
1894  int argc = 0;
1895  if (!is_critical_native) {
1896    out_sig_bt[argc++] = T_ADDRESS;
1897    if (method->is_static()) {
1898      out_sig_bt[argc++] = T_OBJECT;
1899    }
1900
1901    for (int i = 0; i < total_in_args ; i++ ) {
1902      out_sig_bt[argc++] = in_sig_bt[i];
1903    }
1904  } else {
1905    Thread* THREAD = Thread::current();
1906    in_elem_bt = NEW_RESOURCE_ARRAY(BasicType, total_in_args);
1907    SignatureStream ss(method->signature());
1908    for (int i = 0; i < total_in_args ; i++ ) {
1909      if (in_sig_bt[i] == T_ARRAY) {
1910        // Arrays are passed as an (int, elem*) pair
1911        out_sig_bt[argc++] = T_INT;
1912        out_sig_bt[argc++] = T_ADDRESS;
1913        Symbol* atype = ss.as_symbol(CHECK_NULL);
1914        const char* at = atype->as_C_string();
1915        if (strlen(at) == 2) {
1916          assert(at[0] == '[', "must be");
1917          switch (at[1]) {
1918            case 'B': in_elem_bt[i]  = T_BYTE; break;
1919            case 'C': in_elem_bt[i]  = T_CHAR; break;
1920            case 'D': in_elem_bt[i]  = T_DOUBLE; break;
1921            case 'F': in_elem_bt[i]  = T_FLOAT; break;
1922            case 'I': in_elem_bt[i]  = T_INT; break;
1923            case 'J': in_elem_bt[i]  = T_LONG; break;
1924            case 'S': in_elem_bt[i]  = T_SHORT; break;
1925            case 'Z': in_elem_bt[i]  = T_BOOLEAN; break;
1926            default: ShouldNotReachHere();
1927          }
1928        }
1929      } else {
1930        out_sig_bt[argc++] = in_sig_bt[i];
1931        in_elem_bt[i] = T_VOID;
1932      }
1933      if (in_sig_bt[i] != T_VOID) {
1934        assert(in_sig_bt[i] == ss.type(), "must match");
1935        ss.next();
1936      }
1937    }
1938  }
1939
1940  // Now figure out where the args must be stored and how much stack space
1941  // they require.
1942  int out_arg_slots;
1943  out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args);
1944
1945  // Compute framesize for the wrapper.  We need to handlize all oops in
1946  // incoming registers
1947
1948  // Calculate the total number of stack slots we will need.
1949
1950  // First count the abi requirement plus all of the outgoing args
1951  int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1952
1953  // Now the space for the inbound oop handle area
1954  int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
1955  if (is_critical_native) {
1956    // Critical natives may have to call out so they need a save area
1957    // for register arguments.
1958    int double_slots = 0;
1959    int single_slots = 0;
1960    for ( int i = 0; i < total_in_args; i++) {
1961      if (in_regs[i].first()->is_Register()) {
1962        const Register reg = in_regs[i].first()->as_Register();
1963        switch (in_sig_bt[i]) {
1964          case T_BOOLEAN:
1965          case T_BYTE:
1966          case T_SHORT:
1967          case T_CHAR:
1968          case T_INT:  single_slots++; break;
1969          case T_ARRAY:  // specific to LP64 (7145024)
1970          case T_LONG: double_slots++; break;
1971          default:  ShouldNotReachHere();
1972        }
1973      } else if (in_regs[i].first()->is_XMMRegister()) {
1974        switch (in_sig_bt[i]) {
1975          case T_FLOAT:  single_slots++; break;
1976          case T_DOUBLE: double_slots++; break;
1977          default:  ShouldNotReachHere();
1978        }
1979      } else if (in_regs[i].first()->is_FloatRegister()) {
1980        ShouldNotReachHere();
1981      }
1982    }
1983    total_save_slots = double_slots * 2 + single_slots;
1984    // align the save area
1985    if (double_slots != 0) {
1986      stack_slots = align_up(stack_slots, 2);
1987    }
1988  }
1989
1990  int oop_handle_offset = stack_slots;
1991  stack_slots += total_save_slots;
1992
1993  // Now any space we need for handlizing a klass if static method
1994
1995  int klass_slot_offset = 0;
1996  int klass_offset = -1;
1997  int lock_slot_offset = 0;
1998  bool is_static = false;
1999
2000  if (method->is_static()) {
2001    klass_slot_offset = stack_slots;
2002    stack_slots += VMRegImpl::slots_per_word;
2003    klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
2004    is_static = true;
2005  }
2006
2007  // Plus a lock if needed
2008
2009  if (method->is_synchronized()) {
2010    lock_slot_offset = stack_slots;
2011    stack_slots += VMRegImpl::slots_per_word;
2012  }
2013
2014  // Now a place (+2) to save return values or temp during shuffling
2015  // + 4 for return address (which we own) and saved rbp
2016  stack_slots += 6;
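  // (A VMReg slot is 4 bytes, so the return address plus saved rbp account for
  // 4 of these slots and the return-value/temp word for the other 2.)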
2017
2018  // Ok The space we have allocated will look like:
2019  //
2020  //
2021  // FP-> |                     |
2022  //      |---------------------|
2023  //      | 2 slots for moves   |
2024  //      |---------------------|
2025  //      | lock box (if sync)  |
2026  //      |---------------------| <- lock_slot_offset
2027  //      | klass (if static)   |
2028  //      |---------------------| <- klass_slot_offset
2029  //      | oopHandle area      |
2030  //      |---------------------| <- oop_handle_offset (6 java arg registers)
2031  //      | outbound memory     |
2032  //      | based arguments     |
2033  //      |                     |
2034  //      |---------------------|
2035  //      |                     |
2036  // SP-> | out_preserved_slots |
2037  //
2038  //
2039
2040
2041  // Now compute the actual number of stack words we need, rounding to keep
2042  // the stack properly aligned.
2043  stack_slots = align_up(stack_slots, StackAlignmentInSlots);
2044
2045  int stack_size = stack_slots * VMRegImpl::stack_slot_size;
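  // E.g. with 16-byte stack alignment (the usual x86_64 ABI value) and 4-byte
  // slots, StackAlignmentInSlots is 4, so 53 slots would round up to 56
  // (a 224-byte frame).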
2046
2047  // First thing make an ic check to see if we should even be here
2048
2049  // We are free to use all registers as temps without saving them and
2050  // restoring them except rbp. rbp is the only callee save register
2051  // as far as the interpreter and the compiler(s) are concerned.
2052
2053
2054  const Register ic_reg = rax;
2055  const Register receiver = j_rarg0;
2056
2057  Label hit;
2058  Label exception_pending;
2059
2060  assert_different_registers(ic_reg, receiver, rscratch1);
2061  __ verify_oop(receiver);
2062  __ load_klass(rscratch1, receiver);
2063  __ cmpq(ic_reg, rscratch1);
2064  __ jcc(Assembler::equal, hit);
2065
2066  __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
2067
2068  // Verified entry point must be aligned
2069  __ align(8);
2070
2071  __ bind(hit);
2072
2073  int vep_offset = ((intptr_t)__ pc()) - start;
2074
2075#ifdef COMPILER1
2076  // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
2077  if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
2078    inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
2079  }
2080#endif // COMPILER1
2081
2082  // The instruction at the verified entry point must be 5 bytes or longer
2083  // because it can be patched on the fly by make_non_entrant. The stack bang
2084  // instruction fits that requirement.
2085
2086  // Generate stack overflow check
2087
2088  if (UseStackBanging) {
2089    __ bang_stack_with_offset((int)JavaThread::stack_shadow_zone_size());
2090  } else {
2091    // need a 5 byte instruction to allow MT safe patching to non-entrant
2092    __ fat_nop();
2093  }
2094
2095  // Generate a new frame for the wrapper.
2096  __ enter();
2097  // -2 because return address is already present and so is saved rbp
2098  __ subptr(rsp, stack_size - 2*wordSize);
2099
2100  // Frame is now completed as far as size and linkage.
2101  int frame_complete = ((intptr_t)__ pc()) - start;
2102
2103    if (UseRTMLocking) {
2104      // Abort RTM transaction before calling JNI
2105      // because critical section will be large and will be
2106      // aborted anyway. Also nmethod could be deoptimized.
2107      __ xabort(0);
2108    }
2109
2110#ifdef ASSERT
2111    {
2112      Label L;
2113      __ mov(rax, rsp);
2114      __ andptr(rax, -16); // must be 16 byte boundary (see amd64 ABI)
2115      __ cmpptr(rax, rsp);
2116      __ jcc(Assembler::equal, L);
2117      __ stop("improperly aligned stack");
2118      __ bind(L);
2119    }
2120#endif /* ASSERT */
2121
2122
2123  // We use r14 as the oop handle for the receiver/klass
2124  // It is callee save so it survives the call to native
2125
2126  const Register oop_handle_reg = r14;
2127
2128  if (is_critical_native) {
2129    check_needs_gc_for_critical_native(masm, stack_slots, total_c_args, total_in_args,
2130                                       oop_handle_offset, oop_maps, in_regs, in_sig_bt);
2131  }
2132
2133  //
2134  // We immediately shuffle the arguments so that any vm call we have to
2135  // make from here on out (sync slow path, jvmti, etc.) we will have
2136  // captured the oops from our caller and have a valid oopMap for
2137  // them.
2138
2139  // -----------------
2140  // The Grand Shuffle
2141
2142  // The Java calling convention is either equal (linux) or denser (win64) than the
2143  // c calling convention. However, because of the jni_env argument the c calling
2144  // convention always has at least one more (and two for static) arguments than Java.
2145  // Therefore if we move the args from java -> c backwards then we will never have
2146  // a register->register conflict and we don't have to build a dependency graph
2147  // and figure out how to break any cycles.
2148  //
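  // Intuitively: the register that receives C arg c, if it held a Java argument
  // at all, held one with an index >= the Java argument currently being moved,
  // and that argument has already been copied out because we walk from the last
  // argument down to the first.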
2149
2150  // Record esp-based slot for receiver on stack for non-static methods
2151  int receiver_offset = -1;
2152
2153  // This is a trick. We double the stack slots so we can claim
2154  // the oops in the caller's frame. Since we are sure to have
2155  // more args than the caller, doubling is enough to make
2156  // sure we can capture all the incoming oop args from the
2157  // caller.
2158  //
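  // E.g. an incoming oop still sitting in the caller's outgoing-argument area
  // is described as stack2reg(slot_in_caller + stack_slots), i.e. beyond this
  // frame's own stack_slots slots, which only fits because the map is sized
  // 2 * stack_slots (the T_ARRAY case in save_or_restore_arguments uses the
  // same idiom).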
2159  OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2160
2161  // Mark location of rbp (someday)
2162  // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
2163
2164  // Use eax, ebx as temporaries during any memory-memory moves we have to do
2165  // All inbound args are referenced based on rbp and all outbound args via rsp.
2166
2167
2168#ifdef ASSERT
2169  bool reg_destroyed[RegisterImpl::number_of_registers];
2170  bool freg_destroyed[XMMRegisterImpl::number_of_registers];
2171  for ( int r = 0 ; r < RegisterImpl::number_of_registers ; r++ ) {
2172    reg_destroyed[r] = false;
2173  }
2174  for ( int f = 0 ; f < XMMRegisterImpl::number_of_registers ; f++ ) {
2175    freg_destroyed[f] = false;
2176  }
2177
2178#endif /* ASSERT */
2179
2180  // This may iterate in two different directions depending on the
2181  // kind of native it is.  The reason is that for regular JNI natives
2182  // the incoming and outgoing registers are offset upwards and for
2183  // critical natives they are offset down.
2184  GrowableArray<int> arg_order(2 * total_in_args);
2185  VMRegPair tmp_vmreg;
2186  tmp_vmreg.set1(rbx->as_VMReg());
2187
2188  if (!is_critical_native) {
2189    for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2190      arg_order.push(i);
2191      arg_order.push(c_arg);
2192    }
2193  } else {
2194    // Compute a valid move order, using tmp_vmreg to break any cycles
2195    ComputeMoveOrder cmo(total_in_args, in_regs, total_c_args, out_regs, in_sig_bt, arg_order, tmp_vmreg);
2196  }
2197
2198  int temploc = -1;
2199  for (int ai = 0; ai < arg_order.length(); ai += 2) {
2200    int i = arg_order.at(ai);
2201    int c_arg = arg_order.at(ai + 1);
2202    __ block_comment(err_msg("move %d -> %d", i, c_arg));
2203    if (c_arg == -1) {
2204      assert(is_critical_native, "should only be required for critical natives");
2205      // This arg needs to be moved to a temporary
2206      __ mov(tmp_vmreg.first()->as_Register(), in_regs[i].first()->as_Register());
2207      in_regs[i] = tmp_vmreg;
2208      temploc = i;
2209      continue;
2210    } else if (i == -1) {
2211      assert(is_critical_native, "should only be required for critical natives");
2212      // Read from the temporary location
2213      assert(temploc != -1, "must be valid");
2214      i = temploc;
2215      temploc = -1;
2216    }
2217#ifdef ASSERT
2218    if (in_regs[i].first()->is_Register()) {
2219      assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2220    } else if (in_regs[i].first()->is_XMMRegister()) {
2221      assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2222    }
2223    if (out_regs[c_arg].first()->is_Register()) {
2224      reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2225    } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2226      freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2227    }
2228#endif /* ASSERT */
2229    switch (in_sig_bt[i]) {
2230      case T_ARRAY:
2231        if (is_critical_native) {
2232          unpack_array_argument(masm, in_regs[i], in_elem_bt[i], out_regs[c_arg + 1], out_regs[c_arg]);
2233          c_arg++;
2234#ifdef ASSERT
2235          if (out_regs[c_arg].first()->is_Register()) {
2236            reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2237          } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2238            freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2239          }
2240#endif
2241          break;
2242        }
2243      case T_OBJECT:
2244        assert(!is_critical_native, "no oop arguments");
2245        object_move(masm, map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2246                    ((i == 0) && (!is_static)),
2247                    &receiver_offset);
2248        break;
2249      case T_VOID:
2250        break;
2251
2252      case T_FLOAT:
2253        float_move(masm, in_regs[i], out_regs[c_arg]);
2254          break;
2255
2256      case T_DOUBLE:
2257        assert( i + 1 < total_in_args &&
2258                in_sig_bt[i + 1] == T_VOID &&
2259                out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2260        double_move(masm, in_regs[i], out_regs[c_arg]);
2261        break;
2262
2263      case T_LONG :
2264        long_move(masm, in_regs[i], out_regs[c_arg]);
2265        break;
2266
2267      case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2268
2269      default:
2270        move32_64(masm, in_regs[i], out_regs[c_arg]);
2271    }
2272  }
2273
2274  int c_arg;
2275
2276  // Pre-load a static method's oop into r14.  Used both by locking code and
2277  // the normal JNI call code.
2278  if (!is_critical_native) {
2279    // point c_arg at the first arg that is already loaded in case we
2280    // need to spill before we call out
2281    c_arg = total_c_args - total_in_args;
2282
2283    if (method->is_static()) {
2284
2285      //  load oop into a register
2286      __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2287
2288      // Now handlize the static class mirror; it's known to be not-null.
2289      __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2290      map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2291
2292      // Now get the handle
2293      __ lea(oop_handle_reg, Address(rsp, klass_offset));
2294      // store the klass handle as second argument
2295      __ movptr(c_rarg1, oop_handle_reg);
2296      // and protect the arg if we must spill
2297      c_arg--;
2298    }
2299  } else {
2300    // For JNI critical methods we need to save all registers in save_args.
2301    c_arg = 0;
2302  }
2303
2304  // Change state to native (we save the return address in the thread, since it might not
2305  // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2306  // points into the right code segment. It does not have to be the correct return pc.
2307  // We use the same pc/oopMap repeatedly when we call out
2308
2309  intptr_t the_pc = (intptr_t) __ pc();
2310  oop_maps->add_gc_map(the_pc - start, map);
2311
2312  __ set_last_Java_frame(rsp, noreg, (address)the_pc);
2313
2314
2315  // We have all of the arguments set up at this point. We must not touch any of the
2316  // argument registers from here on (if we had to save/restore them, no oopMap would describe them).
2317
2318  {
2319    SkipIfEqual skip(masm, &DTraceMethodProbes, false);
2320    // protect the args we've loaded
2321    save_args(masm, total_c_args, c_arg, out_regs);
2322    __ mov_metadata(c_rarg1, method());
2323    __ call_VM_leaf(
2324      CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2325      r15_thread, c_rarg1);
2326    restore_args(masm, total_c_args, c_arg, out_regs);
2327  }
2328
2329  // RedefineClasses() tracing support for obsolete method entry
2330  if (log_is_enabled(Trace, redefine, class, obsolete)) {
2331    // protect the args we've loaded
2332    save_args(masm, total_c_args, c_arg, out_regs);
2333    __ mov_metadata(c_rarg1, method());
2334    __ call_VM_leaf(
2335      CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2336      r15_thread, c_rarg1);
2337    restore_args(masm, total_c_args, c_arg, out_regs);
2338  }
2339
2340  // Lock a synchronized method
2341
2342  // Register definitions used by locking and unlocking
2343
2344  const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2345  const Register obj_reg  = rbx;  // Will contain the oop
2346  const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2347  const Register old_hdr  = r13;  // value of old header at unlock time
2348
2349  Label slow_path_lock;
2350  Label lock_done;
2351
2352  if (method->is_synchronized()) {
2353    assert(!is_critical_native, "unhandled");
2354
2355
2356    const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2357
2358    // Get the handle (the 2nd argument)
2359    __ mov(oop_handle_reg, c_rarg1);
2360
2361    // Get address of the box
2362
2363    __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2364
2365    // Load the oop from the handle
2366    __ movptr(obj_reg, Address(oop_handle_reg, 0));
2367
2368    if (UseBiasedLocking) {
2369      __ biased_locking_enter(lock_reg, obj_reg, swap_reg, rscratch1, false, lock_done, &slow_path_lock);
2370    }
2371
2372    // Load immediate 1 into swap_reg %rax
2373    __ movl(swap_reg, 1);
2374
2375    // Load (object->mark() | 1) into swap_reg %rax
2376    __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2377
2378    // Save (object->mark() | 1) into BasicLock's displaced header
2379    __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2380
2381    if (os::is_MP()) {
2382      __ lock();
2383    }
2384
2385    // src -> dest iff dest == rax else rax <- dest
2386    __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2387    __ jcc(Assembler::equal, lock_done);
2388
2389    // Hmm should this move to the slow path code area???
2390
2391    // Test if the oopMark is an obvious stack pointer, i.e.,
2392    //  1) (mark & 3) == 0, and
2393    //  2) rsp <= mark < rsp + os::pagesize()
2394    // These 3 tests can be done by evaluating the following
2395    // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2396    // assuming both stack pointer and pagesize have their
2397    // least significant 2 bits clear.
2398    // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
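    // E.g. with a 4096-byte page, 3 - os::vm_page_size() == -4093 ==
    // 0xffff...f003, so the AND below is zero exactly when (mark - rsp),
    // treated as unsigned, has its low two bits clear and is less than one
    // page, i.e. the mark points into our own stack (a recursive lock).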
2399
2400    __ subptr(swap_reg, rsp);
2401    __ andptr(swap_reg, 3 - os::vm_page_size());
2402
2403    // Save the test result, for recursive case, the result is zero
2404    __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2405    __ jcc(Assembler::notEqual, slow_path_lock);
2406
2407    // Slow path will re-enter here
2408
2409    __ bind(lock_done);
2410  }
2411
2412
2413  // Finally just about ready to make the JNI call
2414
2415
2416  // get JNIEnv* which is first argument to native
2417  if (!is_critical_native) {
2418    __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2419  }
2420
2421  // Now set thread in native
2422  __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2423
2424  __ call(RuntimeAddress(native_func));
2425
2426  // Verify or restore cpu control state after JNI call
2427  __ restore_cpu_control_state_after_jni();
2428
2429  // Unpack native results.
2430  switch (ret_type) {
2431  case T_BOOLEAN: __ c2bool(rax);            break;
2432  case T_CHAR   : __ movzwl(rax, rax);      break;
2433  case T_BYTE   : __ sign_extend_byte (rax); break;
2434  case T_SHORT  : __ sign_extend_short(rax); break;
2435  case T_INT    : /* nothing to do */        break;
2436  case T_DOUBLE :
2437  case T_FLOAT  :
2438    // Result is in xmm0; we'll save it as needed
2439    break;
2440  case T_ARRAY:                 // Really a handle
2441  case T_OBJECT:                // Really a handle
2442      break; // can't de-handlize until after safepoint check
2443  case T_VOID: break;
2444  case T_LONG: break;
2445  default       : ShouldNotReachHere();
2446  }
2447
2448  // Switch thread to "native transition" state before reading the synchronization state.
2449  // This additional state is necessary because reading and testing the synchronization
2450  // state is not atomic w.r.t. GC, as this scenario demonstrates:
2451  //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2452  //     VM thread changes sync state to synchronizing and suspends threads for GC.
2453  //     Thread A is resumed to finish this native method, but doesn't block here since it
2454  //     didn't see any synchronization in progress, and escapes.
2455  __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2456
2457  if (os::is_MP()) {
2458    if (UseMembar) {
2459      // Force this write out before the read below
2460      __ membar(Assembler::Membar_mask_bits(
2461           Assembler::LoadLoad | Assembler::LoadStore |
2462           Assembler::StoreLoad | Assembler::StoreStore));
2463    } else {
2464      // Write serialization page so VM thread can do a pseudo remote membar.
2465      // We use the current thread pointer to calculate a thread specific
2466      // offset to write to within the page. This minimizes bus traffic
2467      // due to cache line collision.
2468      __ serialize_memory(r15_thread, rcx);
2469    }
2470  }
2471
2472  Label after_transition;
2473
2474  // check for safepoint operation in progress and/or pending suspend requests
2475  {
2476    Label Continue;
2477
2478    __ cmp32(ExternalAddress((address)SafepointSynchronize::address_of_state()),
2479             SafepointSynchronize::_not_synchronized);
2480
2481    Label L;
2482    __ jcc(Assembler::notEqual, L);
2483    __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2484    __ jcc(Assembler::equal, Continue);
2485    __ bind(L);
2486
2487    // Don't use call_VM as it will see a possible pending exception and forward it
2488    // and never return here preventing us from clearing _last_native_pc down below.
2489    // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are
2490    // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2491    // by hand.
2492    //
2493    __ vzeroupper();
2494    save_native_result(masm, ret_type, stack_slots);
2495    __ mov(c_rarg0, r15_thread);
2496    __ mov(r12, rsp); // remember sp
2497    __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2498    __ andptr(rsp, -16); // align stack as required by ABI
2499    if (!is_critical_native) {
2500      __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2501    } else {
2502      __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans_and_transition)));
2503    }
2504    __ mov(rsp, r12); // restore sp
2505    __ reinit_heapbase();
2506    // Restore any method result value
2507    restore_native_result(masm, ret_type, stack_slots);
2508
2509    if (is_critical_native) {
2510      // The call above performed the transition to thread_in_Java so
2511      // skip the transition logic below.
2512      __ jmpb(after_transition);
2513    }
2514
2515    __ bind(Continue);
2516  }
2517
2518  // change thread state
2519  __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2520  __ bind(after_transition);
2521
2522  Label reguard;
2523  Label reguard_done;
2524  __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), JavaThread::stack_guard_yellow_reserved_disabled);
2525  __ jcc(Assembler::equal, reguard);
2526  __ bind(reguard_done);
2527
2528  // native result if any is live
2529
2530  // Unlock
2531  Label unlock_done;
2532  Label slow_path_unlock;
2533  if (method->is_synchronized()) {
2534
2535    // Get locked oop from the handle we passed to jni
2536    __ movptr(obj_reg, Address(oop_handle_reg, 0));
2537
2538    Label done;
2539
2540    if (UseBiasedLocking) {
2541      __ biased_locking_exit(obj_reg, old_hdr, done);
2542    }
2543
2544    // Simple recursive lock?
2545
2546    __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), (int32_t)NULL_WORD);
2547    __ jcc(Assembler::equal, done);
2548
2549    // Must save rax if it is live now because cmpxchg must use it
2550    if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2551      save_native_result(masm, ret_type, stack_slots);
2552    }
2553
2554
2555    // get address of the stack lock
2556    __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2557    //  get old displaced header
2558    __ movptr(old_hdr, Address(rax, 0));
2559
2560    // Atomic swap old header if oop still contains the stack lock
2561    if (os::is_MP()) {
2562      __ lock();
2563    }
2564    __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2565    __ jcc(Assembler::notEqual, slow_path_unlock);
2566
2567    // slow path re-enters here
2568    __ bind(unlock_done);
2569    if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2570      restore_native_result(masm, ret_type, stack_slots);
2571    }
2572
2573    __ bind(done);
2574
2575  }
2576  {
2577    SkipIfEqual skip(masm, &DTraceMethodProbes, false);
2578    save_native_result(masm, ret_type, stack_slots);
2579    __ mov_metadata(c_rarg1, method());
2580    __ call_VM_leaf(
2581         CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2582         r15_thread, c_rarg1);
2583    restore_native_result(masm, ret_type, stack_slots);
2584  }
2585
2586  __ reset_last_Java_frame(false);
2587
2588  // Unbox oop result, e.g. JNIHandles::resolve value.
2589  if (ret_type == T_OBJECT || ret_type == T_ARRAY) {
2590    __ resolve_jobject(rax /* value */,
2591                       r15_thread /* thread */,
2592                       rcx /* tmp */);
2593  }
2594
2595  if (CheckJNICalls) {
2596    // clear_pending_jni_exception_check
2597    __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2598  }
2599
2600  if (!is_critical_native) {
2601    // reset handle block
2602    __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2603    __ movl(Address(rcx, JNIHandleBlock::top_offset_in_bytes()), (int32_t)NULL_WORD);
2604  }
2605
2606  // pop our frame
2607
2608  __ leave();
2609
2610  if (!is_critical_native) {
2611    // Any exception pending?
2612    __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2613    __ jcc(Assembler::notEqual, exception_pending);
2614  }
2615
2616  // Return
2617
2618  __ ret(0);
2619
2620  // Unexpected paths are out of line and go here
2621
2622  if (!is_critical_native) {
2623    // forward the exception
2624    __ bind(exception_pending);
2625
2626    // and forward the exception
2627    __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2628  }
2629
2630  // Slow path locking & unlocking
2631  if (method->is_synchronized()) {
2632
2633    // BEGIN Slow path lock
2634    __ bind(slow_path_lock);
2635
2636    // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
2637    // args are (oop obj, BasicLock* lock, JavaThread* thread)
2638
2639    // protect the args we've loaded
2640    save_args(masm, total_c_args, c_arg, out_regs);
2641
2642    __ mov(c_rarg0, obj_reg);
2643    __ mov(c_rarg1, lock_reg);
2644    __ mov(c_rarg2, r15_thread);
2645
2646    // Not a leaf but we have last_Java_frame setup as we want
2647    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2648    restore_args(masm, total_c_args, c_arg, out_regs);
2649
2650#ifdef ASSERT
2651    { Label L;
2652    __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2653    __ jcc(Assembler::equal, L);
2654    __ stop("no pending exception allowed on exit from monitorenter");
2655    __ bind(L);
2656    }
2657#endif
2658    __ jmp(lock_done);
2659
2660    // END Slow path lock
2661
2662    // BEGIN Slow path unlock
2663    __ bind(slow_path_unlock);
2664
2665    // If we haven't already saved the native result we must save it now as xmm registers
2666    // are still exposed.
2667    __ vzeroupper();
2668    if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2669      save_native_result(masm, ret_type, stack_slots);
2670    }
2671
2672    __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2673
2674    __ mov(c_rarg0, obj_reg);
2675    __ mov(c_rarg2, r15_thread);
2676    __ mov(r12, rsp); // remember sp
2677    __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2678    __ andptr(rsp, -16); // align stack as required by ABI
2679
2680    // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2681    // NOTE that obj_reg == rbx currently
2682    __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2683    __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2684
2685    // args are (oop obj, BasicLock* lock, JavaThread* thread)
2686    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2687    __ mov(rsp, r12); // restore sp
2688    __ reinit_heapbase();
2689#ifdef ASSERT
2690    {
2691      Label L;
2692      __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int)NULL_WORD);
2693      __ jcc(Assembler::equal, L);
2694      __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2695      __ bind(L);
2696    }
2697#endif /* ASSERT */
2698
2699    __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2700
2701    if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2702      restore_native_result(masm, ret_type, stack_slots);
2703    }
2704    __ jmp(unlock_done);
2705
2706    // END Slow path unlock
2707
2708  } // synchronized
2709
2710  // SLOW PATH Reguard the stack if needed
2711
2712  __ bind(reguard);
2713  __ vzeroupper();
2714  save_native_result(masm, ret_type, stack_slots);
2715  __ mov(r12, rsp); // remember sp
2716  __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2717  __ andptr(rsp, -16); // align stack as required by ABI
2718  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2719  __ mov(rsp, r12); // restore sp
2720  __ reinit_heapbase();
2721  restore_native_result(masm, ret_type, stack_slots);
2722  // and continue
2723  __ jmp(reguard_done);
2724
2725
2726
2727  __ flush();
2728
2729  nmethod *nm = nmethod::new_native_nmethod(method,
2730                                            compile_id,
2731                                            masm->code(),
2732                                            vep_offset,
2733                                            frame_complete,
2734                                            stack_slots / VMRegImpl::slots_per_word,
2735                                            (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2736                                            in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2737                                            oop_maps);
2738
2739  if (is_critical_native) {
2740    nm->set_lazy_critical_native(true);
2741  }
2742
2743  return nm;
2744
2745}
2746
2747// This function returns the adjustment size (in number of words) to a c2i adapter
2748// activation for use during deoptimization.
2749int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2750  return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2751}
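// For example, a callee with 2 parameters and 5 locals needs
// (5 - 2) * Interpreter::stackElementWords extra words in the interpreter
// frame that replaces the c2i activation.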
2752
2753
2754uint SharedRuntime::out_preserve_stack_slots() {
2755  return 0;
2756}
2757
2758//------------------------------generate_deopt_blob----------------------------
2759void SharedRuntime::generate_deopt_blob() {
2760  // Allocate space for the code
2761  ResourceMark rm;
2762  // Setup code generation tools
2763  int pad = 0;
2764#if INCLUDE_JVMCI
2765  if (EnableJVMCI || UseAOT) {
2766    pad += 512; // Increase the buffer size when compiling for JVMCI
2767  }
2768#endif
2769  CodeBuffer buffer("deopt_blob", 2048+pad, 1024);
2770  MacroAssembler* masm = new MacroAssembler(&buffer);
2771  int frame_size_in_words;
2772  OopMap* map = NULL;
2773  OopMapSet *oop_maps = new OopMapSet();
2774
2775  // -------------
2776  // This code enters when returning to a de-optimized nmethod.  A return
2777  // address has been pushed on the stack, and return values are in
2778  // registers.
2779  // If we are doing a normal deopt then we were called from the patched
2780  // nmethod from the point we returned to the nmethod. So the return
2781  // address on the stack is wrong by NativeCall::instruction_size
2782  // We will adjust the value so it looks like we have the original return
2783  // address on the stack (like when we eagerly deoptimized).
2784  // In the case of an exception pending when deoptimizing, we enter
2785  // with a return address on the stack that points after the call we patched
2786  // into the exception handler. We have the following register state from,
2787  // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2788  //    rax: exception oop
2789  //    rbx: exception handler
2790  //    rdx: throwing pc
2791  // So in this case we simply jam rdx into the useless return address and
2792  // the stack looks just like we want.
2793  //
2794  // At this point we need to de-opt.  We save the argument return
2795  // registers.  We call the first C routine, fetch_unroll_info().  This
2796  // routine captures the return values and returns a structure which
2797  // describes the current frame size and the sizes of all replacement frames.
2798  // The current frame is compiled code and may contain many inlined
2799  // functions, each with their own JVM state.  We pop the current frame, then
2800  // push all the new frames.  Then we call the C routine unpack_frames() to
2801  // populate these frames.  Finally unpack_frames() returns us the new target
2802  // address.  Notice that callee-save registers are BLOWN here; they have
2803  // already been captured in the vframeArray at the time the return PC was
2804  // patched.
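  //
  // In outline (a summary of the code that follows):
  //   1. Save all live registers (RegisterSaver::save_live_registers).
  //   2. Call Deoptimization::fetch_unroll_info to obtain an UnrollBlock
  //      describing the deoptimized frame and its replacements.
  //   3. Pop the deoptimized frame and push a skeletal interpreter frame
  //      for each frame described by the UnrollBlock.
  //   4. Call Deoptimization::unpack_frames to fill in those frames.
  //   5. Return into the youngest of the new interpreter frames.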
2805  address start = __ pc();
2806  Label cont;
2807
2808  // Prolog for the non-exception case!
2809
2810  // Save everything in sight.
2811  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
2812
2813  // Normal deoptimization.  Save exec mode for unpack_frames.
2814  __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2815  __ jmp(cont);
2816
2817  int reexecute_offset = __ pc() - start;
2818#if INCLUDE_JVMCI && !defined(COMPILER1)
2819  if (EnableJVMCI && UseJVMCICompiler) {
2820    // JVMCI does not use this kind of deoptimization
2821    __ should_not_reach_here();
2822  }
2823#endif
2824
2825  // Reexecute case
2826  // The return address is the pc that describes which bci to re-execute at
2827
2828  // No need to update map as each call to save_live_registers will produce an identical oopmap
2829  (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
2830
2831  __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2832  __ jmp(cont);
2833
2834#if INCLUDE_JVMCI
2835  Label after_fetch_unroll_info_call;
2836  int implicit_exception_uncommon_trap_offset = 0;
2837  int uncommon_trap_offset = 0;
2838
2839  if (EnableJVMCI || UseAOT) {
2840    implicit_exception_uncommon_trap_offset = __ pc() - start;
2841
2842    __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2843    __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), (int32_t)NULL_WORD);
2844
2845    uncommon_trap_offset = __ pc() - start;
2846
2847    // Save everything in sight.
2848    RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
2849    // fetch_unroll_info needs to call last_java_frame()
2850    __ set_last_Java_frame(noreg, noreg, NULL);
2851
2852    __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2853    __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2854
2855    __ movl(r14, (int32_t)Deoptimization::Unpack_reexecute);
2856    __ mov(c_rarg0, r15_thread);
2857    __ movl(c_rarg2, r14); // exec mode
2858    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2859    oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2860
2861    __ reset_last_Java_frame(false);
2862
2863    __ jmp(after_fetch_unroll_info_call);
2864  } // EnableJVMCI
2865#endif // INCLUDE_JVMCI
2866
2867  int exception_offset = __ pc() - start;
2868
2869  // Prolog for exception case
2870
2871  // All registers are dead at this entry point, except for rax and
2872  // rdx, which contain the exception oop and exception pc
2873  // respectively.  Set them in TLS and fall through to the
2874  // unpack_with_exception_in_tls entry point.
2875
2876  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2877  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2878
2879  int exception_in_tls_offset = __ pc() - start;
2880
2881  // new implementation because exception oop is now passed in JavaThread
2882
2883  // Prolog for exception case
2884  // All registers must be preserved because they might be used by LinearScan
2885  // Exception oop and throwing PC are passed in JavaThread
2886  // tos: stack at point of call to method that threw the exception (i.e. only
2887  // args are on the stack, no return address)
2888
2889  // make room on stack for the return address
2890  // It will be patched later with the throwing pc. The correct value is not
2891  // available now because loading it from memory would destroy registers.
2892  __ push(0);
2893
2894  // Save everything in sight.
2895  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
2896
2897  // Now it is safe to overwrite any register
2898
2899  // Deopt during an exception.  Save exec mode for unpack_frames.
2900  __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2901
2902  // load throwing pc from JavaThread and patch it as the return address
2903  // of the current frame. Then clear the field in JavaThread
2904
2905  __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2906  __ movptr(Address(rbp, wordSize), rdx);
2907  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2908
2909#ifdef ASSERT
2910  // verify that there is really an exception oop in JavaThread
2911  __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2912  __ verify_oop(rax);
2913
2914  // verify that there is no pending exception
2915  Label no_pending_exception;
2916  __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2917  __ testptr(rax, rax);
2918  __ jcc(Assembler::zero, no_pending_exception);
2919  __ stop("must not have pending exception here");
2920  __ bind(no_pending_exception);
2921#endif
2922
2923  __ bind(cont);
2924
2925  // Call C code.  Need thread and this frame, but NOT official VM entry
2926  // crud.  We cannot block on this call, no GC can happen.
2927  //
2928  // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2929
2930  // fetch_unroll_info needs to call last_java_frame().
2931
2932  __ set_last_Java_frame(noreg, noreg, NULL);
2933#ifdef ASSERT
2934  { Label L;
2935    __ cmpptr(Address(r15_thread,
2936                    JavaThread::last_Java_fp_offset()),
2937            (int32_t)0);
2938    __ jcc(Assembler::equal, L);
2939    __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2940    __ bind(L);
2941  }
2942#endif // ASSERT
2943  __ mov(c_rarg0, r15_thread);
2944  __ movl(c_rarg1, r14); // exec_mode
2945  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2946
2947  // Need to have an oopmap that tells fetch_unroll_info where to
2948  // find any register it might need.
2949  oop_maps->add_gc_map(__ pc() - start, map);
2950
2951  __ reset_last_Java_frame(false);
2952
2953#if INCLUDE_JVMCI
2954  if (EnableJVMCI || UseAOT) {
2955    __ bind(after_fetch_unroll_info_call);
2956  }
2957#endif
2958
2959  // Load UnrollBlock* into rdi
2960  __ mov(rdi, rax);
2961
2962  __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()));
2963  Label noException;
2964  __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2965  __ jcc(Assembler::notEqual, noException);
2966  __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2967  // QQQ this is useless; it was NULL above
2968  __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2969  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int32_t)NULL_WORD);
2970  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2971
2972  __ verify_oop(rax);
2973
2974  // Overwrite the result registers with the exception results.
2975  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2976  // I think this is useless
2977  __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2978
2979  __ bind(noException);
2980
2981  // Only register save data is on the stack.
2982  // Now restore the result registers.  Everything else is either dead
2983  // or captured in the vframeArray.
2984  RegisterSaver::restore_result_registers(masm);
2985
2986  // All of the register save area has been popped off the stack. Only the
2987  // return address remains.
2988
2989  // Pop all the frames we must move/replace.
2990  //
2991  // Frame picture (youngest to oldest)
2992  // 1: self-frame (no frame link)
2993  // 2: deopting frame  (no frame link)
2994  // 3: caller of deopting frame (could be compiled/interpreted).
2995  //
2996  // Note: by leaving the return address of the self-frame on the stack
2997  // and using the size of frame 2 to adjust the stack,
2998  // the return address to frame 3 will still be on the stack when we are done.
2999
3000  // Pop deoptimized frame
3001  __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes()));
3002  __ addptr(rsp, rcx);
3003
3004  // rsp should be pointing at the return address to the caller (3)
3005
3006  // Pick up the initial fp we should save
3007  // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
3008  __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
3009
3010#ifdef ASSERT
3011  // Compilers generate code that bangs the stack by as much as the
3012  // interpreter would need. So this stack banging should never
3013  // trigger a fault. Verify that it does not on non-product builds.
3014  if (UseStackBanging) {
3015    __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
3016    __ bang_stack_size(rbx, rcx);
3017  }
3018#endif
3019
3020  // Load address of array of frame pcs into rcx
3021  __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
3022
3023  // Trash the old pc
3024  __ addptr(rsp, wordSize);
3025
3026  // Load address of array of frame sizes into rsi
3027  __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
3028
3029  // Load counter into rdx
3030  __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes()));
3031
3032  // Now adjust the caller's stack to make up for the extra locals,
3033  // but record the original sp so that we can save it in the skeletal interpreter
3034  // frame; the stack walking of interpreter_sender will then get the unextended sp
3035  // value and not the "real" sp value.
3036
3037  const Register sender_sp = r8;
3038
3039  __ mov(sender_sp, rsp);
3040  __ movl(rbx, Address(rdi,
3041                       Deoptimization::UnrollBlock::
3042                       caller_adjustment_offset_in_bytes()));
3043  __ subptr(rsp, rbx);
3044
3045  // Push interpreter frames in a loop
3046  Label loop;
3047  __ bind(loop);
3048  __ movptr(rbx, Address(rsi, 0));      // Load frame size
3049  __ subptr(rbx, 2*wordSize);           // We'll push pc and rbp by hand
3050  __ pushptr(Address(rcx, 0));          // Save return address
3051  __ enter();                           // Save old & set new rbp
3052  __ subptr(rsp, rbx);                  // Prolog
3053  // This value is corrected by layout_activation_impl
3054  __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
3055  __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
3056  __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
3057  __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
3058  __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
3059  __ decrementl(rdx);                   // Decrement counter
3060  __ jcc(Assembler::notZero, loop);
3061  __ pushptr(Address(rcx, 0));          // Save final return address
3062
3063  // Re-push self-frame
3064  __ enter();                           // Save old & set new rbp
3065
3066  // Allocate a full-sized register save area.
3067  // Return address and rbp are in place, so we allocate two fewer words.
3068  __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
3069
3070  // Restore frame locals after moving the frame
3071  __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
3072  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3073
3074  // Call C code.  Need thread but NOT official VM entry
3075  // crud.  We cannot block on this call, no GC can happen.  Call should
3076  // restore return values to their stack-slots with the new SP.
3077  //
3078  // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
3079
3080  // Use rbp because the frames look interpreted now
3081  // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3082  // Don't need the precise return PC here, just precise enough to point into this code blob.
3083  address the_pc = __ pc();
3084  __ set_last_Java_frame(noreg, rbp, the_pc);
3085
3086  __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
3087  __ mov(c_rarg0, r15_thread);
3088  __ movl(c_rarg1, r14); // second arg: exec_mode
3089  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3090  // Revert SP alignment after call since we're going to do some SP relative addressing below
3091  __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
3092
3093  // Set an oopmap for the call site
3094  // Use the same PC we used for the last java frame
3095  oop_maps->add_gc_map(the_pc - start,
3096                       new OopMap( frame_size_in_words, 0 ));
3097
3098  // Clear fp AND pc
3099  __ reset_last_Java_frame(true);
3100
3101  // Collect return values
3102  __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
3103  __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
3104  // I think this is useless (throwing pc?)
3105  __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
3106
3107  // Pop self-frame.
3108  __ leave();                           // Epilog
3109
3110  // Jump to interpreter
3111  __ ret(0);
3112
3113  // Make sure all code is generated
3114  masm->flush();
3115
3116  _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
3117  _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
3118#if INCLUDE_JVMCI
3119  if (EnableJVMCI || UseAOT) {
3120    _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
3121    _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
3122  }
3123#endif
3124}
3125
3126#ifdef COMPILER2
3127//------------------------------generate_uncommon_trap_blob--------------------
3128void SharedRuntime::generate_uncommon_trap_blob() {
3129  // Allocate space for the code
3130  ResourceMark rm;
3131  // Setup code generation tools
3132  CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
3133  MacroAssembler* masm = new MacroAssembler(&buffer);
3134
3135  assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3136
3137  address start = __ pc();
3138
3139  if (UseRTMLocking) {
3140    // Abort RTM transaction before possible nmethod deoptimization.
3141    __ xabort(0);
3142  }
3143
3144  // Push self-frame.  We get here with a return address on the
3145  // stack, so rsp is 8-byte aligned until we allocate our frame.
3146  __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3147
3148  // No callee-saved registers. rbp is assumed implicitly saved
3149  __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3150
3151  // The compiler left unloaded_class_index in j_rarg0; move it to where the
3152  // runtime expects it.
3153  __ movl(c_rarg1, j_rarg0);
3154
3155  __ set_last_Java_frame(noreg, noreg, NULL);
3156
3157  // Call C code.  Need thread but NOT official VM entry
3158  // crud.  We cannot block on this call, no GC can happen.  Call should
3159  // capture callee-saved registers as well as return values.
3160  // Thread is in rdi already.
3161  //
3162  // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index);
3163
3164  __ mov(c_rarg0, r15_thread);
3165  __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
3166  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
3167
3168  // Set an oopmap for the call site
3169  OopMapSet* oop_maps = new OopMapSet();
3170  OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
3171
3172  // location of rbp is known implicitly by the frame sender code
3173
3174  oop_maps->add_gc_map(__ pc() - start, map);
3175
3176  __ reset_last_Java_frame(false);
3177
3178  // Load UnrollBlock* into rdi
3179  __ mov(rdi, rax);
3180
3181#ifdef ASSERT
3182  { Label L;
3183    __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()),
3184            (int32_t)Deoptimization::Unpack_uncommon_trap);
3185    __ jcc(Assembler::equal, L);
3186    __ stop("SharedRuntime::generate_uncommon_trap_blob: expected Unpack_uncommon_trap");
3187    __ bind(L);
3188  }
3189#endif
3190
3191  // Pop all the frames we must move/replace.
3192  //
3193  // Frame picture (youngest to oldest)
3194  // 1: self-frame (no frame link)
3195  // 2: deopting frame  (no frame link)
3196  // 3: caller of deopting frame (could be compiled/interpreted).
3197
3198  // Pop self-frame.  We have no frame, and must rely only on rax and rsp.
3199  __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
3200
3201  // Pop deoptimized frame (int)
3202  __ movl(rcx, Address(rdi,
3203                       Deoptimization::UnrollBlock::
3204                       size_of_deoptimized_frame_offset_in_bytes()));
3205  __ addptr(rsp, rcx);
3206
3207  // rsp should be pointing at the return address to the caller (3)
3208
3209  // Pick up the initial fp we should save
3210  // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
3211  __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
3212
3213#ifdef ASSERT
3214  // Compilers generate code that bangs the stack by as much as the
3215  // interpreter would need. So this stack banging should never
3216  // trigger a fault. Verify that it does not on non-product builds.
3217  if (UseStackBanging) {
3218    __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
3219    __ bang_stack_size(rbx, rcx);
3220  }
3221#endif
3222
3223  // Load address of array of frame pcs into rcx (address*)
3224  __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
3225
3226  // Trash the return pc
3227  __ addptr(rsp, wordSize);
3228
3229  // Load address of array of frame sizes into rsi (intptr_t*)
3230  __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
3231
3232  // Counter
3233  __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes())); // (int)
3234
3235  // Now adjust the caller's stack to make up for the extra locals, but
3236  // record the original sp so that we can save it in the skeletal
3237  // interpreter frame; the stack walking of interpreter_sender will
3238  // then get the unextended sp value and not the "real" sp value.
3239
3240  const Register sender_sp = r8;
3241
3242  __ mov(sender_sp, rsp);
3243  __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::caller_adjustment_offset_in_bytes())); // (int)
3244  __ subptr(rsp, rbx);
3245
3246  // Push interpreter frames in a loop
3247  Label loop;
3248  __ bind(loop);
3249  __ movptr(rbx, Address(rsi, 0)); // Load frame size
3250  __ subptr(rbx, 2 * wordSize);    // We'll push pc and rbp by hand
3251  __ pushptr(Address(rcx, 0));     // Save return address
3252  __ enter();                      // Save old & set new rbp
3253  __ subptr(rsp, rbx);             // Prolog
3254  __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
3255            sender_sp);            // Make it walkable
3256  // This value is corrected by layout_activation_impl
3257  __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
3258  __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
3259  __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
3260  __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
3261  __ decrementl(rdx);              // Decrement counter
3262  __ jcc(Assembler::notZero, loop);
3263  __ pushptr(Address(rcx, 0));     // Save final return address
3264
3265  // Re-push self-frame
3266  __ enter();                 // Save old & set new rbp
3267  __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
3268                              // Prolog
3269
3270  // Use rbp because the frames look interpreted now
3271  // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3272  // Don't need the precise return PC here, just precise enough to point into this code blob.
3273  address the_pc = __ pc();
3274  __ set_last_Java_frame(noreg, rbp, the_pc);
3275
3276  // Call C code.  Need thread but NOT official VM entry
3277  // crud.  We cannot block on this call, no GC can happen.  Call should
3278  // restore return values to their stack-slots with the new SP.
3279  // Thread is in rdi already.
3280  //
3281  // BasicType unpack_frames(JavaThread* thread, int exec_mode);
3282
3283  __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
3284  __ mov(c_rarg0, r15_thread);
3285  __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
3286  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3287
3288  // Set an oopmap for the call site
3289  // Use the same PC we used for the last java frame
3290  oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3291
3292  // Clear fp AND pc
3293  __ reset_last_Java_frame(true);
3294
3295  // Pop self-frame.
3296  __ leave();                 // Epilog
3297
3298  // Jump to interpreter
3299  __ ret(0);
3300
3301  // Make sure all code is generated
3302  masm->flush();
3303
3304  _uncommon_trap_blob =  UncommonTrapBlob::create(&buffer, oop_maps,
3305                                                 SimpleRuntimeFrame::framesize >> 1);
3306}
3307#endif // COMPILER2
3308
3309
3310//------------------------------generate_handler_blob------
3311//
3312// Generate a special Compile2Runtime blob that saves all registers,
3313// and sets up an oopmap.
3314//
3315SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
3316  assert(StubRoutines::forward_exception_entry() != NULL,
3317         "must be generated before");
3318
3319  ResourceMark rm;
3320  OopMapSet *oop_maps = new OopMapSet();
3321  OopMap* map;
3322
3323  // Allocate space for the code.  Setup code generation tools.
3324  CodeBuffer buffer("handler_blob", 2048, 1024);
3325  MacroAssembler* masm = new MacroAssembler(&buffer);
3326
3327  address start   = __ pc();
3328  address call_pc = NULL;
3329  int frame_size_in_words;
3330  bool cause_return = (poll_type == POLL_AT_RETURN);
3331  bool save_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
3332
3333  if (UseRTMLocking) {
3334    // Abort RTM transaction before calling runtime
3335    // because critical section will be large and will be
3336    // aborted anyway. Also nmethod could be deoptimized.
3337    __ xabort(0);
3338  }
3339
3340  // Make room for return address (or push it again)
3341  if (!cause_return) {
3342    __ push(rbx);
3343  }
3344
3345  // Save registers, fpu state, and flags
3346  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_vectors);
3347
3348  // The following is basically a call_VM.  However, we need the precise
3349  // address of the call in order to generate an oopmap. Hence, we do all the
3350  // work ourselves.
3351
3352  __ set_last_Java_frame(noreg, noreg, NULL);
3353
3354  // The return address must always be correct so that the frame constructor never
3355  // sees an invalid pc.
3356
3357  if (!cause_return) {
3358    // overwrite the dummy value we pushed on entry
3359    __ movptr(c_rarg0, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3360    __ movptr(Address(rbp, wordSize), c_rarg0);
3361  }
3362
3363  // Do the call
3364  __ mov(c_rarg0, r15_thread);
3365  __ call(RuntimeAddress(call_ptr));
3366
3367  // Set an oopmap for the call site.  This oopmap will map all
3368  // oop-registers and debug-info registers as callee-saved.  This
3369  // will allow deoptimization at this safepoint to find all possible
3370  // debug-info recordings, as well as let GC find all oops.
3371
3372  oop_maps->add_gc_map( __ pc() - start, map);
3373
3374  Label noException;
3375
3376  __ reset_last_Java_frame(false);
3377
3378  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3379  __ jcc(Assembler::equal, noException);
3380
3381  // Exception pending
3382
3383  RegisterSaver::restore_live_registers(masm, save_vectors);
3384
3385  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3386
3387  // No exception case
3388  __ bind(noException);
3389
3390  // Normal exit, restore registers and exit.
3391  RegisterSaver::restore_live_registers(masm, save_vectors);
3392
3393  __ ret(0);
3394
3395  // Make sure all code is generated
3396  masm->flush();
3397
3398  // Fill-out other meta info
3399  return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3400}
3401
3402//
3403// generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3404//
3405// Generate a stub that calls into the VM to find out the proper destination
3406// of a Java call. All the argument registers are live at this point,
3407// but since this is generic code we don't know what they are and the caller
3408// must do any GC of the args.
3409//
3410RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
3411  assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
3412
3413  // allocate space for the code
3414  ResourceMark rm;
3415
3416  CodeBuffer buffer(name, 1000, 512);
3417  MacroAssembler* masm                = new MacroAssembler(&buffer);
3418
3419  int frame_size_in_words;
3420
3421  OopMapSet *oop_maps = new OopMapSet();
3422  OopMap* map = NULL;
3423
3424  int start = __ offset();
3425
3426  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
3427
3428  int frame_complete = __ offset();
3429
3430  __ set_last_Java_frame(noreg, noreg, NULL);
3431
3432  __ mov(c_rarg0, r15_thread);
3433
3434  __ call(RuntimeAddress(destination));
3435
3436
3437  // Set an oopmap for the call site.
3438  // We need this not only for callee-saved registers, but also for volatile
3439  // registers that the compiler might be keeping live across a safepoint.
3440
3441  oop_maps->add_gc_map( __ offset() - start, map);
3442
3443  // rax contains the address we are going to jump to, assuming no exception was installed
3444
3445  // clear last_Java_sp
3446  __ reset_last_Java_frame(false);
3447  // check for pending exceptions
3448  Label pending;
3449  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3450  __ jcc(Assembler::notEqual, pending);
3451
3452  // get the returned Method*
3453  __ get_vm_result_2(rbx, r15_thread);
3454  __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3455
3456  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3457
3458  RegisterSaver::restore_live_registers(masm);
3459
3460  // We are back to the original state on entry and ready to go.
3461
3462  __ jmp(rax);
3463
3464  // Pending exception after the safepoint
3465
3466  __ bind(pending);
3467
3468  RegisterSaver::restore_live_registers(masm);
3469
3470  // exception pending => remove activation and forward to exception handler
3471
3472  __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), (int)NULL_WORD);
3473
3474  __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3475  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3476
3477  // -------------
3478  // make sure all code is generated
3479  masm->flush();
3480
3481  // return the blob
3482  // frame_size_words or bytes??
3483  return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3484}
3485
3486
3487//------------------------------Montgomery multiplication------------------------
3488//
3489
3490#ifndef _WINDOWS
3491
3492#define ASM_SUBTRACT
3493
3494#ifdef ASM_SUBTRACT
3495// Subtract 0:b from carry:a.  Return carry.
3496static unsigned long
3497sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) {
3498  long i = 0, cnt = len;
3499  unsigned long tmp;
3500  asm volatile("clc; "
3501               "0: ; "
3502               "mov (%[b], %[i], 8), %[tmp]; "
3503               "sbb %[tmp], (%[a], %[i], 8); "
3504               "inc %[i]; dec %[cnt]; "
3505               "jne 0b; "
3506               "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3507               : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3508               : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3509               : "memory");
3510  return tmp;
3511}
3512#else // ASM_SUBTRACT
3513typedef int __attribute__((mode(TI))) int128;
3514
3515// Subtract 0:b from carry:a.  Return carry.
3516static unsigned long
3517sub(unsigned long a[], unsigned long b[], unsigned long carry, int len) {
3518  int128 tmp = 0;
3519  int i;
3520  for (i = 0; i < len; i++) {
3521    tmp += a[i];
3522    tmp -= b[i];
3523    a[i] = tmp;
3524    tmp >>= 64;
3525    assert(-1 <= tmp && tmp <= 0, "invariant");
3526  }
3527  return tmp + carry;
3528}
3529#endif // ! ASM_SUBTRACT
3530
3531// Multiply (unsigned) Long A by Long B, accumulating the double-
3532// length result into the accumulator formed of T0, T1, and T2.
3533#define MACC(A, B, T0, T1, T2)                                  \
3534do {                                                            \
3535  unsigned long hi, lo;                                         \
3536  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3537           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3538           : "r"(A), "a"(B) : "cc");                            \
3539 } while(0)
3540
3541// As above, but add twice the double-length result into the
3542// accumulator.
3543#define MACC2(A, B, T0, T1, T2)                                 \
3544do {                                                            \
3545  unsigned long hi, lo;                                         \
3546  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3547           "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3548           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3549           : "r"(A), "a"(B) : "cc");                            \
3550 } while(0)
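// In effect, MACC adds the 128-bit product A*B into the 192-bit accumulator
// (T2:T1:T0), and MACC2 adds 2*A*B by accumulating the single mulq result
// twice; hi/lo receive the raw rdx:rax halves of the product.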
3551
3552// Fast Montgomery multiplication.  The derivation of the algorithm is
3553// in A Cryptographic Library for the Motorola DSP56000,
3554// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
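//
// In Montgomery terms, with R = 2^(64*len) and inv == -n[0]^-1 mod 2^64
// (see the assert below), the routine computes m such that
// m == a * b * R^-1 (mod n).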
3555
3556static void __attribute__((noinline))
3557montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[],
3558                    unsigned long m[], unsigned long inv, int len) {
3559  unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3560  int i;
3561
3562  assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
3563
3564  for (i = 0; i < len; i++) {
3565    int j;
3566    for (j = 0; j < i; j++) {
3567      MACC(a[j], b[i-j], t0, t1, t2);
3568      MACC(m[j], n[i-j], t0, t1, t2);
3569    }
3570    MACC(a[i], b[0], t0, t1, t2);
3571    m[i] = t0 * inv;
3572    MACC(m[i], n[0], t0, t1, t2);
3573
3574    assert(t0 == 0, "broken Montgomery multiply");
3575
3576    t0 = t1; t1 = t2; t2 = 0;
3577  }
3578
3579  for (i = len; i < 2*len; i++) {
3580    int j;
3581    for (j = i-len+1; j < len; j++) {
3582      MACC(a[j], b[i-j], t0, t1, t2);
3583      MACC(m[j], n[i-j], t0, t1, t2);
3584    }
3585    m[i-len] = t0;
3586    t0 = t1; t1 = t2; t2 = 0;
3587  }
3588
3589  while (t0)
3590    t0 = sub(m, n, t0, len);
3591}
3592
3593// Fast Montgomery squaring.  This uses asymptotically 25% fewer
3594// multiplies so it should be up to 25% faster than Montgomery
3595// multiplication.  However, its loop control is more complex and it
3596// may actually run slower on some machines.
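//
// Rough operation count behind the ~25% figure: Montgomery multiplication
// issues about len^2 MACCs for a*b plus len^2 for m*n (~2*len^2 multiplies);
// squaring replaces the a*b part with about len^2/2 MACC2s (one multiply
// each), for ~1.5*len^2 multiplies in total.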
3597
3598static void __attribute__((noinline))
3599montgomery_square(unsigned long a[], unsigned long n[],
3600                  unsigned long m[], unsigned long inv, int len) {
3601  unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3602  int i;
3603
3604  assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
3605
3606  for (i = 0; i < len; i++) {
3607    int j;
3608    int end = (i+1)/2;
3609    for (j = 0; j < end; j++) {
3610      MACC2(a[j], a[i-j], t0, t1, t2);
3611      MACC(m[j], n[i-j], t0, t1, t2);
3612    }
3613    if ((i & 1) == 0) {
3614      MACC(a[j], a[j], t0, t1, t2);
3615    }
3616    for (; j < i; j++) {
3617      MACC(m[j], n[i-j], t0, t1, t2);
3618    }
3619    m[i] = t0 * inv;
3620    MACC(m[i], n[0], t0, t1, t2);
3621
3622    assert(t0 == 0, "broken Montgomery square");
3623
3624    t0 = t1; t1 = t2; t2 = 0;
3625  }
3626
3627  for (i = len; i < 2*len; i++) {
3628    int start = i-len+1;
3629    int end = start + (len - start)/2;
3630    int j;
3631    for (j = start; j < end; j++) {
3632      MACC2(a[j], a[i-j], t0, t1, t2);
3633      MACC(m[j], n[i-j], t0, t1, t2);
3634    }
3635    if ((i & 1) == 0) {
3636      MACC(a[j], a[j], t0, t1, t2);
3637    }
3638    for (; j < len; j++) {
3639      MACC(m[j], n[i-j], t0, t1, t2);
3640    }
3641    m[i-len] = t0;
3642    t0 = t1; t1 = t2; t2 = 0;
3643  }
3644
3645  while (t0)
3646    t0 = sub(m, n, t0, len);
3647}
3648
3649// Swap words in a longword.
3650static unsigned long swap(unsigned long x) {
3651  return (x << 32) | (x >> 32);
3652}
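// For example, swap(0x0000000100000002UL) == 0x0000000200000001UL: the two
// 32-bit halves of the longword trade places.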
3653
3654// Copy len longwords from s to d, word-swapping as we go.  The
3655// destination array is reversed.
3656static void reverse_words(unsigned long *s, unsigned long *d, int len) {
3657  d += len;
3658  while(len-- > 0) {
3659    d--;
3660    *d = swap(*s);
3661    s++;
3662  }
3663}
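// For example, with len == 2 and s == { s0, s1 }, d becomes
// { swap(s1), swap(s0) }: this converts between the most-significant-first
// word order used by the jint arrays passed in below and the
// least-significant-first longword order the Montgomery routines expect.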
3664
3665// The threshold at which squaring is advantageous was determined
// experimentally on an i7-3930K (Sandy Bridge-E) CPU @ 3.5GHz.
3667#define MONTGOMERY_SQUARING_THRESHOLD 64
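// Note that len below is a jint count, so a threshold of 64 jints
// corresponds to 32 longwords, i.e. 2048-bit operands.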
3668
3669void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3670                                        jint len, jlong inv,
3671                                        jint *m_ints) {
3672  assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3673  int longwords = len/2;
3674
3675  // Make very sure we don't use so much space that the stack might
3676  // overflow.  512 jints correspond to a 16384-bit integer and
3677  // will use a total of 8K bytes of stack space here.
3678  int total_allocation = longwords * sizeof (unsigned long) * 4;
3679  guarantee(total_allocation <= 8192, "must be");
3680  unsigned long *scratch = (unsigned long *)alloca(total_allocation);
3681
3682  // Local scratch arrays
3683  unsigned long
3684    *a = scratch + 0 * longwords,
3685    *b = scratch + 1 * longwords,
3686    *n = scratch + 2 * longwords,
3687    *m = scratch + 3 * longwords;
3688
3689  reverse_words((unsigned long *)a_ints, a, longwords);
3690  reverse_words((unsigned long *)b_ints, b, longwords);
3691  reverse_words((unsigned long *)n_ints, n, longwords);
3692
3693  ::montgomery_multiply(a, b, n, m, (unsigned long)inv, longwords);
3694
3695  reverse_words(m, (unsigned long *)m_ints, longwords);
3696}
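// Note: these two helpers are wired up elsewhere (not in this file) as the
// VM's montgomery multiply/square stub entry points and are reached from
// BigInteger's montgomery intrinsics when those intrinsics are enabled.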
3697
3698void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3699                                      jint len, jlong inv,
3700                                      jint *m_ints) {
3701  assert(len % 2 == 0, "array length in montgomery_square must be even");
3702  int longwords = len/2;
3703
3704  // Make very sure we don't use so much space that the stack might
3705  // overflow.  512 jints correspond to a 16384-bit integer and
3706  // will use a total of 6K bytes of stack space here.
3707  int total_allocation = longwords * sizeof (unsigned long) * 3;
3708  guarantee(total_allocation <= 8192, "must be");
3709  unsigned long *scratch = (unsigned long *)alloca(total_allocation);
3710
3711  // Local scratch arrays
3712  unsigned long
3713    *a = scratch + 0 * longwords,
3714    *n = scratch + 1 * longwords,
3715    *m = scratch + 2 * longwords;
3716
3717  reverse_words((unsigned long *)a_ints, a, longwords);
3718  reverse_words((unsigned long *)n_ints, n, longwords);
3719
3720  if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3721    ::montgomery_square(a, n, m, (unsigned long)inv, longwords);
3722  } else {
3723    ::montgomery_multiply(a, a, n, m, (unsigned long)inv, longwords);
3724  }
3725
3726  reverse_words(m, (unsigned long *)m_ints, longwords);
3727}
3728
3729#endif // !_WINDOWS
3730
3731#ifdef COMPILER2
3732// This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3733//
3734//------------------------------generate_exception_blob---------------------------
3735// Creates the exception blob at the end.
3736// Using the exception blob, this code is jumped to from a compiled method
3737// (see emit_exception_handler in the x86_64.ad file).
3738//
3739// Given an exception pc at a call we call into the runtime for the
3740// handler in this method. This handler might merely restore state
3741// (i.e. callee-saved registers), unwind the frame, and jump to the
3742// exception handler for the nmethod if there is no Java-level handler
3743// for the nmethod.
3744//
3745// This code is entered with a jmp.
3746//
3747// Arguments:
3748//   rax: exception oop
3749//   rdx: exception pc
3750//
3751// Results:
3752//   rax: exception oop
3753//   rdx: exception pc in caller or ???
3754//   destination: exception handler of caller
3755//
3756// Note: the exception pc MUST be at a call (precise debug information)
3757//       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
3758//
3759
3760void OptoRuntime::generate_exception_blob() {
3761  assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
3762  assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
3763  assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
3764
3765  assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3766
3767  // Allocate space for the code
3768  ResourceMark rm;
3769  // Setup code generation tools
3770  CodeBuffer buffer("exception_blob", 2048, 1024);
3771  MacroAssembler* masm = new MacroAssembler(&buffer);
3772
3773
3774  address start = __ pc();
3775
3776  // Exception pc is 'return address' for stack walker
3777  __ push(rdx);
3778  __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3779
3780  // Save callee-saved registers.  See x86_64.ad.
3781
3782  // rbp is an implicitly saved callee-saved register (i.e., the calling
3783  // convention will save/restore it in the prolog/epilog). Other than that
3784  // there are no callee-saved registers now that adapter frames are gone.
3785
3786  __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3787
3788  // Store exception in Thread object. We cannot pass any arguments to the
3789  // handle_exception call, since we do not want to make any assumption
3790  // about the size of the frame in which the exception happened.
3791  // c_rarg0 is either rdi (Linux) or rcx (Windows).
3792  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()),rax);
3793  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3794
3795  // This call does all the hard work.  It checks if an exception handler
3796  // exists in the method.
3797  // If so, it returns the handler address.
3798  // If not, it prepares for stack-unwinding, restoring the callee-save
3799  // registers of the frame being removed.
3800  //
3801  // address OptoRuntime::handle_exception_C(JavaThread* thread)
3802
3803  // At a method handle call, the stack may not be properly aligned
3804  // when returning with an exception.
3805  address the_pc = __ pc();
3806  __ set_last_Java_frame(noreg, noreg, the_pc);
3807  __ mov(c_rarg0, r15_thread);
3808  __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3809  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
3810
3811  // Set an oopmap for the call site.  This oopmap will only be used if we
3812  // are unwinding the stack.  Hence, all locations will be dead.
3813  // Callee-saved registers will be the same as the frame above (i.e.,
3814  // handle_exception_stub), since they were restored when we got the
3815  // exception.
3816
3817  OopMapSet* oop_maps = new OopMapSet();
3818
3819  oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3820
3821  __ reset_last_Java_frame(false);
3822
3823  // Restore callee-saved registers
3824
3825  // rbp is an implicitly saved callee-saved register (i.e., the calling
3826  // convention will save/restore it in the prolog/epilog). Other than that
3827  // there are no callee-saved registers now that adapter frames are gone.
3828
3829  __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
3830
3831  __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
3832  __ pop(rdx);                  // No need for exception pc anymore
3833
3834  // rax: exception handler
3835
3836  // We have a handler in rax (could be deopt blob).
3837  __ mov(r8, rax);
3838
3839  // Get the exception oop
3840  __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3841  // Get the exception pc in case we are deoptimized
3842  __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3843#ifdef ASSERT
3844  __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), (int)NULL_WORD);
3845  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int)NULL_WORD);
3846#endif
3847  // Clear the exception oop so GC no longer processes it as a root.
3848  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int)NULL_WORD);
3849
3850  // rax: exception oop
3851  // r8:  exception handler
3852  // rdx: exception pc
3853  // Jump to handler
3854
3855  __ jmp(r8);
3856
3857  // Make sure all code is generated
3858  masm->flush();
3859
3860  // Set exception blob
3861  _exception_blob =  ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
3862}
3863#endif // COMPILER2
3864