// sharedRuntime_x86_64.cpp revision 9056:dc9930a04ab0
/*
 * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#ifndef _WINDOWS
#include "alloca.h"
#endif
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "code/debugInfoRec.hpp"
#include "code/icBuffer.hpp"
#include "code/vtableStubs.hpp"
#include "interpreter/interpreter.hpp"
#include "oops/compiledICHolder.hpp"
#include "prims/jvmtiRedefineClassesTrace.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/vframeArray.hpp"
#include "vmreg_x86.inline.hpp"
#ifdef COMPILER1
#include "c1/c1_Runtime1.hpp"
#endif
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif

#define __ masm->

const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;

class SimpleRuntimeFrame {

  public:

  // Most of the runtime stubs have this simple frame layout.
  // This class exists to make the layout shared in one place.
  // Offsets are for compiler stack slots, which are jints.
  enum layout {
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
    rbp_off2,
    return_off, return_off2,
    framesize
  };
};
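
// Informal picture of the SimpleRuntimeFrame layout above (offsets are jint
// stack slots, so each 64-bit value occupies two of them), from low addresses
// (near rsp) to high addresses:
//
//   [argument register save area]  <- only if frame::arg_reg_save_area_bytes != 0
//   [saved rbp]                    <- rbp_off / rbp_off2
//   [return address]               <- return_off / return_off2
//   framesize = total size of the frame in slots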

class RegisterSaver {
  // Capture info about frame layout.  Layout offsets are in jint
  // units because compiler frame slots are jints.
#define HALF_ZMM_BANK_WORDS 128
#define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
#define DEF_ZMM_OFFS(regnum) zmm ## regnum ## _off = zmm_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
  enum layout {
    fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
    xmm_off       = fpu_state_off + 160/BytesPerInt,            // offset in fxsave save area
    DEF_XMM_OFFS(0),
    DEF_XMM_OFFS(1),
    DEF_XMM_OFFS(2),
    DEF_XMM_OFFS(3),
    DEF_XMM_OFFS(4),
    DEF_XMM_OFFS(5),
    DEF_XMM_OFFS(6),
    DEF_XMM_OFFS(7),
    DEF_XMM_OFFS(8),
    DEF_XMM_OFFS(9),
    DEF_XMM_OFFS(10),
    DEF_XMM_OFFS(11),
    DEF_XMM_OFFS(12),
    DEF_XMM_OFFS(13),
    DEF_XMM_OFFS(14),
    DEF_XMM_OFFS(15),
    zmm_off = fpu_state_off + ((FPUStateSizeInWords - (HALF_ZMM_BANK_WORDS + 1))*wordSize / BytesPerInt),
    DEF_ZMM_OFFS(16),
    DEF_ZMM_OFFS(17),
    DEF_ZMM_OFFS(18),
    DEF_ZMM_OFFS(19),
    DEF_ZMM_OFFS(20),
    DEF_ZMM_OFFS(21),
    DEF_ZMM_OFFS(22),
    DEF_ZMM_OFFS(23),
    DEF_ZMM_OFFS(24),
    DEF_ZMM_OFFS(25),
    DEF_ZMM_OFFS(26),
    DEF_ZMM_OFFS(27),
    DEF_ZMM_OFFS(28),
    DEF_ZMM_OFFS(29),
    DEF_ZMM_OFFS(30),
    DEF_ZMM_OFFS(31),
    fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
    fpu_stateH_end,
    r15_off, r15H_off,
    r14_off, r14H_off,
    r13_off, r13H_off,
    r12_off, r12H_off,
    r11_off, r11H_off,
    r10_off, r10H_off,
    r9_off,  r9H_off,
    r8_off,  r8H_off,
    rdi_off, rdiH_off,
    rsi_off, rsiH_off,
    ignore_off, ignoreH_off,  // extra copy of rbp
    rsp_off, rspH_off,
    rbx_off, rbxH_off,
    rdx_off, rdxH_off,
    rcx_off, rcxH_off,
    rax_off, raxH_off,
    // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
    align_off, alignH_off,
    flags_off, flagsH_off,
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off, rbpH_off,        // copy of rbp we will restore
    return_off, returnH_off,  // slot for return address
    reg_save_size             // size in compiler stack slots
  };

 public:
  static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors = false);
  static void restore_live_registers(MacroAssembler* masm, bool restore_vectors = false);

  // Offsets into the register save area
  // Used by deoptimization when it is managing result register
  // values on its own

  static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
  static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
  static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
  static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
  static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }

  // During deoptimization only the result registers need to be restored,
  // all the other values have already been extracted.
  static void restore_result_registers(MacroAssembler* masm);
};
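
// Informal usage sketch (how the runtime stub generators elsewhere in this
// file use the pair of helpers above); variable names are illustrative only:
//
//   int frame_size_in_words;
//   OopMap* map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
//   // ... emit a call into the VM; 'map' records where every register lives ...
//   RegisterSaver::restore_live_registers(masm);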

OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) {
  int vect_words = 0;
  int off = 0;
  int num_xmm_regs = XMMRegisterImpl::number_of_registers;
  if (UseAVX < 3) {
    num_xmm_regs = num_xmm_regs/2;
  }
#ifdef COMPILER2
  if (save_vectors) {
    assert(UseAVX > 0, "512bit vectors are supported only with EVEX");
    assert(MaxVectorSize == 64, "only 512bit vectors are supported now");
    // Save upper half of YMM registers
    vect_words = 16 * num_xmm_regs / wordSize;
    if (UseAVX < 3) {
      additional_frame_words += vect_words;
    }
  }
#else
  assert(!save_vectors, "vectors are generated only by C2");
#endif

  // Always make the frame size 16-byte aligned
  int frame_size_in_bytes = round_to(additional_frame_words*wordSize +
                                     reg_save_size*BytesPerInt, num_xmm_regs);
  // OopMap frame size is in compiler stack slots (jint's) not bytes or words
  int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
  // The caller will allocate additional_frame_words
  int additional_frame_slots = additional_frame_words*wordSize / BytesPerInt;
  // CodeBlob frame size is in words.
  int frame_size_in_words = frame_size_in_bytes / wordSize;
  *total_frame_words = frame_size_in_words;

  // Save registers, fpu state, and flags.
  // We assume caller has already pushed the return address onto the
  // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return like a normal enter.

  __ enter();          // rsp becomes 16-byte aligned here
  __ push_CPU_state(); // Push a multiple of 16 bytes

  // push cpu state handles this on EVEX enabled targets
  if ((vect_words > 0) && (UseAVX < 3)) {
    assert(vect_words*wordSize >= 256, "");
    // Save upper half of YMM registers (0..num_xmm_regs)
    __ subptr(rsp, num_xmm_regs*16);
    for (int n = 0; n < num_xmm_regs; n++) {
      __ vextractf128h(Address(rsp, off++*16), as_XMMRegister(n));
    }
  }
  if (frame::arg_reg_save_area_bytes != 0) {
    // Allocate argument register save area
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }

  // Set an oopmap for the call site.  This oopmap will map all
  // oop-registers and debug-info registers as callee-saved.  This
  // will allow deoptimization at this safepoint to find all possible
  // debug-info recordings, as well as let GC find all oops.

  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = new OopMap(frame_size_in_slots, 0);

#define STACK_OFFSET(x) VMRegImpl::stack2reg((x) + additional_frame_slots)

  map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
  // rbp location is known implicitly by the frame sender code, needs no oopmap
  // and the location where rbp was saved is ignored
  map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
  // on EVEX enabled targets, we get it included in the xsave area
  off = xmm0_off;
  int delta = xmm1_off - off;
  for (int n = 0; n < 16; n++) {
    XMMRegister xmm_name = as_XMMRegister(n);
    map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
    off += delta;
  }
  if(UseAVX > 2) {
    // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
    off = zmm16_off;
    delta = zmm17_off - off;
    for (int n = 16; n < num_xmm_regs; n++) {
      XMMRegister xmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
      off += delta;
    }
  }

  // %%% These should all be a waste but we'll keep things as they were for now
  if (true) {
    map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
    // rbp location is known implicitly by the frame sender code, needs no oopmap
    map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
    // on EVEX enabled targets, we get it included in the xsave area
    off = xmm0H_off;
    delta = xmm1H_off - off;
    for (int n = 0; n < 16; n++) {
      XMMRegister xmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
      off += delta;
    }
    if (UseAVX > 2) {
      // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
      off = zmm16H_off;
      delta = zmm17H_off - off;
      for (int n = 16; n < num_xmm_regs; n++) {
        XMMRegister xmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
        off += delta;
      }
    }
  }

  return map;
}
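
// Rough picture of the frame built by save_live_registers (low addresses at
// the top, i.e. closest to the final rsp):
//
//   [argument register save area]     (only if frame::arg_reg_save_area_bytes != 0)
//   [upper halves of ymm0..ymm15]     (only if save_vectors and UseAVX < 3)
//   [fxsave/xsave area + pushed GPRs] (from push_CPU_state, matching the layout enum)
//   [saved rbp]                       (from enter())
//   [return address]                  (pushed by the caller of this blob)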

void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) {
  int num_xmm_regs = XMMRegisterImpl::number_of_registers;
  if (UseAVX < 3) {
    num_xmm_regs = num_xmm_regs/2;
  }
  if (frame::arg_reg_save_area_bytes != 0) {
    // Pop arg register save area
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }
#ifdef COMPILER2
  // On EVEX enabled targets everything is handled in pop fpu state
  if ((restore_vectors) && (UseAVX < 3)) {
    assert(UseAVX > 0, "256/512-bit vectors are supported only with AVX");
    assert(MaxVectorSize == 64, "up to 512bit vectors are supported now");
    int off = 0;
    // Restore upper half of YMM registers (0..num_xmm_regs)
    for (int n = 0; n < num_xmm_regs; n++) {
      __ vinsertf128h(as_XMMRegister(n), Address(rsp,  off++*16));
    }
    __ addptr(rsp, num_xmm_regs*16);
  }
#else
  assert(!restore_vectors, "vectors are generated only by C2");
#endif
  // Recover CPU state
  __ pop_CPU_state();
  // Get the rbp described implicitly by the calling convention (no oopMap)
  __ pop(rbp);
}

void RegisterSaver::restore_result_registers(MacroAssembler* masm) {

  // Just restore result register. Only used by deoptimization. By
  // now any callee save register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration so only result registers need to be restored here.

  // Restore fp result register
  __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
  // Restore integer result register
  __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
  __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));

  // Pop all of the register save area off the stack except the return address
  __ addptr(rsp, return_offset_in_bytes());
}

// Is vector's size (in bytes) bigger than a size saved by default?
// 16 bytes XMM registers are saved by default using fxsave/fxrstor instructions.
bool SharedRuntime::is_wide_vector(int size) {
  return size > 16;
}
35446381Sbillf
35586189Srwatson// The java_calling_convention describes stack locations as ideal slots on
35646381Sbillf// a frame with no abi restrictions. Since we must observe abi restrictions
35786189Srwatson// (like the placement of the register window) the slots must be biased by
35846381Sbillf// the following value.
35986189Srwatsonstatic int reg2offset_in(VMReg r) {
36046381Sbillf  // Account for saved rbp and return address
36118540Sbde  // This should really be in_preserve_stack_slots
36286189Srwatson  return (r->reg2stack() + 4) * VMRegImpl::stack_slot_size;
36346381Sbillf}
36486189Srwatson
36546381Sbillfstatic int reg2offset_out(VMReg r) {
36686189Srwatson  return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
36786189Srwatson}
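
// Worked example: for an incoming VMReg at stack slot 0, reg2offset_in()
// yields (0 + 4) * 4 = 16 bytes off rbp, i.e. it skips the saved rbp and the
// return address (two 4-byte slots each) that sit between rbp and the
// caller's outgoing argument area.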
36846381Sbillf
36986189Srwatson// ---------------------------------------------------------------------------
37046381Sbillf// Read the array of BasicTypes from a signature, and compute where the
37186189Srwatson// arguments should go.  Values in the VMRegPair regs array refer to 4-byte
37246381Sbillf// quantities.  Values less than VMRegImpl::stack0 are registers, those above
37386189Srwatson// refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
37446381Sbillf// as framesizes are fixed.
37586189Srwatson// VMRegImpl::stack0 refers to the first slot 0(sp).
37646381Sbillf// and VMRegImpl::stack0+1 refers to the memory word 4-byes higher.  Register
37786189Srwatson// up to RegisterImpl::number_of_registers) are the 64-bit
37846381Sbillf// integer registers.
37986189Srwatson
38046381Sbillf// Note: the INPUTS in sig_bt are in units of Java argument words, which are
38186189Srwatson// either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
38246381Sbillf// units regardless of build. Of course for i486 there is no 64 bit build
38386189Srwatson
38446381Sbillf// The Java calling convention is a "shifted" version of the C ABI.
38586189Srwatson// By skipping the first C ABI register we can call non-static jni methods
38646381Sbillf// with small numbers of arguments without having to shuffle the arguments
38786189Srwatson// at all. Since we control the java ABI we ought to at least get some
38846381Sbillf// advantage out of it.

int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
                                           VMRegPair *regs,
                                           int total_args_passed,
                                           int is_outgoing) {

  // Create the mapping between argument positions and
  // registers.
  static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
    j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
    j_farg0, j_farg1, j_farg2, j_farg3,
    j_farg4, j_farg5, j_farg6, j_farg7
  };


  uint int_args = 0;
  uint fp_args = 0;
  uint stk_args = 0; // inc by 2 each time

  for (int i = 0; i < total_args_passed; i++) {
    switch (sig_bt[i]) {
    case T_BOOLEAN:
    case T_CHAR:
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      if (int_args < Argument::n_int_register_parameters_j) {
        regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_VOID:
      // halves of T_LONG or T_DOUBLE
      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
      regs[i].set_bad();
      break;
    case T_LONG:
      assert(sig_bt[i + 1] == T_VOID, "expecting half");
      // fall through
    case T_OBJECT:
    case T_ARRAY:
    case T_ADDRESS:
      if (int_args < Argument::n_int_register_parameters_j) {
        regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_FLOAT:
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_DOUBLE:
      assert(sig_bt[i + 1] == T_VOID, "expecting half");
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    default:
      ShouldNotReachHere();
      break;
    }
  }

  return round_to(stk_args, 2);
}
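
// Worked example (informal): for a signature (Object, long, int, float) the
// expanded sig_bt is { T_OBJECT, T_LONG, T_VOID, T_INT, T_FLOAT } and the loop
// above assigns
//   regs[0] = j_rarg0 (set2),  regs[1] = j_rarg1 (set2),  regs[2] = bad half,
//   regs[3] = j_rarg2 (set1),  regs[4] = j_farg0 (set1)
// with no stack slots used, so the routine returns 0.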

// Patch the caller's callsite with entry to compiled code if it exists.
static void patch_callers_callsite(MacroAssembler *masm) {
  Label L;
  __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
  __ jcc(Assembler::equal, L);

  // Save the current stack pointer
  __ mov(r13, rsp);
  // Schedule the branch target address early.
  // Call into the VM to patch the caller, then jump to compiled callee
  // rax isn't live so capture return address while we easily can
  __ movptr(rax, Address(rsp, 0));

  // align stack so push_CPU_state doesn't fault
  __ andptr(rsp, -(StackAlignmentInBytes));
  __ push_CPU_state();

  // VM needs caller's callsite
  // VM needs target method
  // This needs to be a long call since we will relocate this adapter to
  // the codeBuffer and it may not reach

  // Allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }
  __ mov(c_rarg0, rbx);
  __ mov(c_rarg1, rax);
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));

  // De-allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

  __ pop_CPU_state();
  // restore sp
  __ mov(rsp, r13);
  __ bind(L);
}

static void gen_c2i_adapter(MacroAssembler *masm,
                            int total_args_passed,
                            int comp_args_on_stack,
                            const BasicType *sig_bt,
                            const VMRegPair *regs,
                            Label& skip_fixup) {
  // Before we get into the guts of the C2I adapter, see if we should be here
  // at all.  We've come from compiled code and are attempting to jump to the
  // interpreter, which means the caller made a static call to get here
  // (vcalls always get a compiled target if there is one).  Check for a
  // compiled target.  If there is one, we need to patch the caller's call.
  patch_callers_callsite(masm);

  __ bind(skip_fixup);

  // Since all args are passed on the stack, total_args_passed *
  // Interpreter::stackElementSize is the space we need. Plus 1 because
  // we also account for the return address location since
  // we store it first rather than hold it in rax across all the shuffling

  int extraspace = (total_args_passed * Interpreter::stackElementSize) + wordSize;

  // stack is aligned, keep it that way
  extraspace = round_to(extraspace, 2*wordSize);

  // Get return address
  __ pop(rax);

  // set senderSP value
  __ mov(r13, rsp);

  __ subptr(rsp, extraspace);

  // Store the return address in the expected location
  __ movptr(Address(rsp, 0), rax);

  // Now write the args into the outgoing interpreter space
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // offset to start parameters
    int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
    int next_off = st_off - Interpreter::stackElementSize;

    // Say 4 args:
    // i   st_off
    // 0   32 T_LONG
    // 1   24 T_VOID
    // 2   16 T_OBJECT
    // 3    8 T_BOOL
    // -    0 return address
    //
    // However, to make things extra confusing: because we can fit a long/double in
    // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
    // leaves one slot empty and only stores to a single slot. In this case the
    // slot that is occupied is the T_VOID slot. See I said it was confusing.

    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // memory to memory use rax
      int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
      if (!r_2->is_valid()) {
        // sign extend??
        __ movl(rax, Address(rsp, ld_off));
        __ movptr(Address(rsp, st_off), rax);

      } else {

        __ movq(rax, Address(rsp, ld_off));

        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // ld_off == LSW, ld_off+wordSize == MSW
          // st_off == MSW, next_off == LSW
          __ movq(Address(rsp, next_off), rax);
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        } else {
          __ movq(Address(rsp, st_off), rax);
        }
      }
    } else if (r_1->is_Register()) {
      Register r = r_1->as_Register();
      if (!r_2->is_valid()) {
        // must be only an int (or less) so move only 32 bits to slot
        // why not sign extend??
        __ movl(Address(rsp, st_off), r);
      } else {
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // long/double in gpr
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaab));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
          __ movq(Address(rsp, next_off), r);
        } else {
          __ movptr(Address(rsp, st_off), r);
        }
      }
    } else {
      assert(r_1->is_XMMRegister(), "");
      if (!r_2->is_valid()) {
        // only a float, use just part of the slot
        __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
      } else {
#ifdef ASSERT
        // Overwrite the unused slot with known junk
        __ mov64(rax, CONST64(0xdeadffffdeadaaac));
        __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
      }
    }
  }

  // Schedule the branch target address early.
  __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
  __ jmp(rcx);
}
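
// At the final jump above, the register state expected by the interpreter
// entry is (informally): rbx holds the Method*, r13 holds the sender SP saved
// before the args were laid out, and rsp points at the return address with
// the interpreter-format arguments stored immediately above it.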

static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
                        address code_start, address code_end,
                        Label& L_ok) {
  Label L_fail;
  __ lea(temp_reg, ExternalAddress(code_start));
  __ cmpptr(pc_reg, temp_reg);
  __ jcc(Assembler::belowEqual, L_fail);
  __ lea(temp_reg, ExternalAddress(code_end));
  __ cmpptr(pc_reg, temp_reg);
  __ jcc(Assembler::below, L_ok);
  __ bind(L_fail);
}
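
// range_check() branches to L_ok when code_start < pc_reg < code_end and
// otherwise falls through past L_fail, so the caller is expected to emit the
// failure handling (for example the __ stop() in gen_i2c_adapter below) right
// after the last range_check call.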

static void gen_i2c_adapter(MacroAssembler *masm,
                            int total_args_passed,
                            int comp_args_on_stack,
                            const BasicType *sig_bt,
                            const VMRegPair *regs) {

  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do a i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args as
  // we must align the stack to 16 bytes on an i2c entry else we
  // lose alignment we expect in all compiled code and register
  // save code can segv when fxsave instructions find improperly
  // aligned stack pointer.

  // Adapters can be frameless because they do not require the caller
  // to perform additional cleanup work, such as correcting the stack pointer.
  // An i2c adapter is frameless because the *caller* frame, which is interpreted,
  // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
  // even if a callee has modified the stack pointer.
  // A c2i adapter is frameless because the *callee* frame, which is interpreted,
  // routinely repairs its caller's stack pointer (from sender_sp, which is set
  // up via the senderSP register).
  // In other words, if *either* the caller or callee is interpreted, we can
  // get the stack pointer repaired after a call.
  // This is why c2i and i2c adapters cannot be indefinitely composed.
  // In particular, if a c2i adapter were to somehow call an i2c adapter,
  // both caller and callee would be compiled methods, and neither would
  // clean up the stack pointer changes performed by the two adapters.
  // If this happens, control eventually transfers back to the compiled
  // caller, but with an uncorrected stack, causing delayed havoc.

  // Pick up the return address
  __ movptr(rax, Address(rsp, 0));

  if (VerifyAdapterCalls &&
      (Interpreter::code() != NULL || StubRoutines::code1() != NULL)) {
    // So, let's test for cascading c2i/i2c adapters right now.
    //  assert(Interpreter::contains($return_addr) ||
    //         StubRoutines::contains($return_addr),
    //         "i2c adapter must return to an interpreter frame");
    __ block_comment("verify_i2c { ");
    Label L_ok;
    if (Interpreter::code() != NULL)
      range_check(masm, rax, r11,
                  Interpreter::code()->code_start(), Interpreter::code()->code_end(),
                  L_ok);
    if (StubRoutines::code1() != NULL)
      range_check(masm, rax, r11,
                  StubRoutines::code1()->code_begin(), StubRoutines::code1()->code_end(),
                  L_ok);
    if (StubRoutines::code2() != NULL)
      range_check(masm, rax, r11,
                  StubRoutines::code2()->code_begin(), StubRoutines::code2()->code_end(),
                  L_ok);
    const char* msg = "i2c adapter must return to an interpreter frame";
    __ block_comment(msg);
    __ stop(msg);
    __ bind(L_ok);
    __ block_comment("} verify_i2ce ");
  }

  // Must preserve original SP for loading incoming arguments because
  // we need to align the outgoing SP for compiled code.
  __ movptr(r11, rsp);

  // Cut-out for having no stack args.  Since up to 2 int/oop args are passed
  // in registers, we will occasionally have no stack args.
  int comp_words_on_stack = 0;
  if (comp_args_on_stack) {
    // Sig words on the stack are greater-than VMRegImpl::stack0.  Those in
    // registers are below.  By subtracting stack0, we either get a negative
    // number (all values in registers) or the maximum stack slot accessed.

    // Convert 4-byte c2 stack slots to words.
    comp_words_on_stack = round_to(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
    // Round up to minimum stack alignment, in wordSize
    comp_words_on_stack = round_to(comp_words_on_stack, 2);
    __ subptr(rsp, comp_words_on_stack * wordSize);
  }


  // Ensure compiled code always sees stack at proper alignment
  __ andptr(rsp, -16);

  // Push the return address; this misaligns the stack exactly the way the
  // youngest frame expects it to look right after a call instruction.
  __ push(rax);

  // Put saved SP in another register
  const Register saved_sp = rax;
  __ movptr(saved_sp, r11);

  // Will jump to the compiled code just as if compiled code was doing it.
  // Pre-load the register-jump target early, to schedule it better.
  __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));

  // Now generate the shuffle code.  Pick up all register args and move the
  // rest through the floating point stack top.
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      // Longs and doubles are passed in native word order, but misaligned
      // in the 32-bit build.
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // Pick up 0, 1 or 2 words from SP+offset.

    assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
            "scrambled load targets?");
    // Load in argument order going down.
    int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
    // Point to interpreter value (vs. tag)
    int next_off = ld_off - Interpreter::stackElementSize;
    //
    //
    //
    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // Convert stack slot to an SP offset (+ wordSize to account for return address )
      int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;

      // We can use r13 as a temp here because compiled code doesn't need r13 as an input
      // and if we end up going thru a c2i because of a miss a reasonable value of r13
      // will be generated.
      if (!r_2->is_valid()) {
        // sign extend???
        __ movl(r13, Address(saved_sp, ld_off));
        __ movptr(Address(rsp, st_off), r13);
      } else {
        //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
        // So we must adjust where to pick up the data to match the interpreter.
        //
        // Interpreter local[n] == MSW, local[n+1] == LSW however locals
        // are accessed as negative so LSW is at LOW address

        // ld_off is MSW so get LSW
        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
                           next_off : ld_off;
        __ movq(r13, Address(saved_sp, offset));
        // st_off is LSW (i.e. reg.first())
        __ movq(Address(rsp, st_off), r13);
      }
    } else if (r_1->is_Register()) {  // Register argument
      Register r = r_1->as_Register();
      assert(r != rax, "must be different");
      if (r_2->is_valid()) {
        //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
        // So we must adjust where to pick up the data to match the interpreter.

        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
                           next_off : ld_off;

        // this can be a misaligned move
        __ movq(r, Address(saved_sp, offset));
      } else {
        // sign extend and use a full word?
        __ movl(r, Address(saved_sp, ld_off));
      }
    } else {
      if (!r_2->is_valid()) {
        __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
      } else {
        __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
      }
    }
  }

  // 6243940 We might end up in handle_wrong_method if
  // the callee is deoptimized as we race thru here. If that
  // happens we don't want to take a safepoint because the
  // caller frame will look interpreted and arguments are now
  // "compiled" so it is much better to make this transition
  // invisible to the stack walking code. Unfortunately if
  // we try and find the callee by normal means a safepoint
  // is possible. So we stash the desired callee in the thread
  // and the VM will find it there should this case occur.

  __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);

  // put Method* where a c2i would expect should we end up there
  // only needed because c2 resolve stubs return Method* as a result in
  // rax
  __ mov(rax, rbx);
  __ jmp(r11);
}

// ---------------------------------------------------------------
AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
                                                            int total_args_passed,
                                                            int comp_args_on_stack,
                                                            const BasicType *sig_bt,
                                                            const VMRegPair *regs,
                                                            AdapterFingerPrint* fingerprint) {
  address i2c_entry = __ pc();

  gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);

  // -------------------------------------------------------------------------
  // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
  // to the interpreter.  The args start out packed in the compiled layout.  They
  // need to be unpacked into the interpreter layout.  This will almost always
  // require some stack space.  We grow the current (compiled) stack, then repack
  // the args.  We  finally end in a jump to the generic interpreter entry point.
  // On exit from the interpreter, the interpreter will restore our SP (lest the
  // compiled code, which relies solely on SP and not RBP, get sick).

  address c2i_unverified_entry = __ pc();
  Label skip_fixup;
  Label ok;

  Register holder = rax;
  Register receiver = j_rarg0;
  Register temp = rbx;

  {
    __ load_klass(temp, receiver);
    __ cmpptr(temp, Address(holder, CompiledICHolder::holder_klass_offset()));
    __ movptr(rbx, Address(holder, CompiledICHolder::holder_method_offset()));
    __ jcc(Assembler::equal, ok);
    __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));

    __ bind(ok);
    // Method might have been compiled since the call site was patched to
    // interpreted; if that is the case treat it as a miss so we can get
    // the call site corrected.
    __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
    __ jcc(Assembler::equal, skip_fixup);
    __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
  }

  address c2i_entry = __ pc();

  gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);

  __ flush();
  return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry);
}
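
// The adapter blob produced above exposes three entry points: i2c_entry
// (interpreted caller -> compiled callee), c2i_unverified_entry (compiled
// caller arriving through an inline cache; checks the receiver klass against
// the CompiledICHolder before falling into the c2i path), and c2i_entry
// (compiled caller -> interpreted callee with the inline-cache check already
// done).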

int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
                                         VMRegPair *regs,
                                         VMRegPair *regs2,
                                         int total_args_passed) {
  assert(regs2 == NULL, "not needed on x86");
// We return the amount of VMRegImpl stack slots we need to reserve for all
// the arguments NOT counting out_preserve_stack_slots.

// NOTE: These arrays will have to change when c1 is ported
#ifdef _WIN64
    static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
      c_rarg0, c_rarg1, c_rarg2, c_rarg3
    };
    static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
      c_farg0, c_farg1, c_farg2, c_farg3
    };
#else
    static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
      c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
    };
    static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
      c_farg0, c_farg1, c_farg2, c_farg3,
      c_farg4, c_farg5, c_farg6, c_farg7
    };
#endif // _WIN64


    uint int_args = 0;
    uint fp_args = 0;
    uint stk_args = 0; // inc by 2 each time

    for (int i = 0; i < total_args_passed; i++) {
      switch (sig_bt[i]) {
      case T_BOOLEAN:
      case T_CHAR:
      case T_BYTE:
      case T_SHORT:
      case T_INT:
        if (int_args < Argument::n_int_register_parameters_c) {
          regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
          fp_args++;
          // Allocate slots for callee to stuff register args on the stack.
          stk_args += 2;
#endif
        } else {
          regs[i].set1(VMRegImpl::stack2reg(stk_args));
          stk_args += 2;
        }
        break;
      case T_LONG:
        assert(sig_bt[i + 1] == T_VOID, "expecting half");
        // fall through
      case T_OBJECT:
      case T_ARRAY:
      case T_ADDRESS:
      case T_METADATA:
        if (int_args < Argument::n_int_register_parameters_c) {
          regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
          fp_args++;
          stk_args += 2;
#endif
        } else {
          regs[i].set2(VMRegImpl::stack2reg(stk_args));
          stk_args += 2;
        }
        break;
      case T_FLOAT:
        if (fp_args < Argument::n_float_register_parameters_c) {
          regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
          int_args++;
          // Allocate slots for callee to stuff register args on the stack.
          stk_args += 2;
#endif
        } else {
          regs[i].set1(VMRegImpl::stack2reg(stk_args));
          stk_args += 2;
        }
        break;
      case T_DOUBLE:
        assert(sig_bt[i + 1] == T_VOID, "expecting half");
        if (fp_args < Argument::n_float_register_parameters_c) {
          regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
          int_args++;
          // Allocate slots for callee to stuff register args on the stack.
          stk_args += 2;
#endif
        } else {
          regs[i].set2(VMRegImpl::stack2reg(stk_args));
          stk_args += 2;
        }
        break;
      case T_VOID: // Halves of longs and doubles
        assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
        regs[i].set_bad();
        break;
      default:
        ShouldNotReachHere();
        break;
      }
    }
#ifdef _WIN64
  // windows abi requires that we always allocate enough stack space
  // for 4 64bit registers to be stored down.
  if (stk_args < 8) {
    stk_args = 8;
  }
#endif // _WIN64

  return stk_args;
}
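
// Worked example (informal): for sig_bt = { T_INT, T_DOUBLE, T_VOID, T_OBJECT }
// the System V branch assigns c_rarg0, c_farg0, (bad half), c_rarg1 and returns
// 0 stack slots, while the _WIN64 branch assigns c_rarg0, c_farg1, (bad half),
// c_rarg2 (the int and fp counters advance together there) and returns 8, the
// mandatory 4-register shadow area.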

// On 64-bit we will store integer-like items to the stack as
// 64-bit items (SPARC ABI) even though Java would only store
// 32 bits for a parameter. On 32-bit it will simply be 32 bits,
// so this routine will do 32->32 on 32-bit and 32->64 on 64-bit.
static void move32_64(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
  if (src.first()->is_stack()) {
    if (dst.first()->is_stack()) {
      // stack to stack
      __ movslq(rax, Address(rbp, reg2offset_in(src.first())));
      __ movq(Address(rsp, reg2offset_out(dst.first())), rax);
    } else {
      // stack to reg
      __ movslq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first())));
    }
  } else if (dst.first()->is_stack()) {
    // reg to stack
    // Do we really have to sign extend???
    // __ movslq(src.first()->as_Register(), src.first()->as_Register());
    __ movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
  } else {
    // Do we really have to sign extend???
    // __ movslq(dst.first()->as_Register(), src.first()->as_Register());
    if (dst.first() != src.first()) {
      __ movq(dst.first()->as_Register(), src.first()->as_Register());
    }
  }
}

static void move_ptr(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
  if (src.first()->is_stack()) {
    if (dst.first()->is_stack()) {
      // stack to stack
      __ movq(rax, Address(rbp, reg2offset_in(src.first())));
      __ movq(Address(rsp, reg2offset_out(dst.first())), rax);
    } else {
      // stack to reg
      __ movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first())));
    }
  } else if (dst.first()->is_stack()) {
    // reg to stack
    __ movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
  } else {
    if (dst.first() != src.first()) {
      __ movq(dst.first()->as_Register(), src.first()->as_Register());
    }
  }
}

// An oop arg. Must pass a handle not the oop itself
static void object_move(MacroAssembler* masm,
                        OopMap* map,
                        int oop_handle_offset,
                        int framesize_in_slots,
                        VMRegPair src,
                        VMRegPair dst,
                        bool is_receiver,
                        int* receiver_offset) {

  // must pass a handle. First figure out the location we use as a handle

  Register rHandle = dst.first()->is_stack() ? rax : dst.first()->as_Register();

  // See if oop is NULL; if it is we need no handle

  if (src.first()->is_stack()) {

    // Oop is already on the stack as an argument
    int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
    map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
    if (is_receiver) {
      *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
    }

    __ cmpptr(Address(rbp, reg2offset_in(src.first())), (int32_t)NULL_WORD);
    __ lea(rHandle, Address(rbp, reg2offset_in(src.first())));
    // conditionally move a NULL
    __ cmovptr(Assembler::equal, rHandle, Address(rbp, reg2offset_in(src.first())));
  } else {

    // Oop is in a register; we must store it to the space we reserve
    // on the stack for oop_handles and pass a handle if oop is non-NULL

    const Register rOop = src.first()->as_Register();
    int oop_slot;
    if (rOop == j_rarg0)
      oop_slot = 0;
    else if (rOop == j_rarg1)
      oop_slot = 1;
    else if (rOop == j_rarg2)
      oop_slot = 2;
    else if (rOop == j_rarg3)
      oop_slot = 3;
    else if (rOop == j_rarg4)
      oop_slot = 4;
    else {
      assert(rOop == j_rarg5, "wrong register");
      oop_slot = 5;
    }

    oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
    int offset = oop_slot*VMRegImpl::stack_slot_size;

    map->set_oop(VMRegImpl::stack2reg(oop_slot));
    // Store oop in handle area, may be NULL
    __ movptr(Address(rsp, offset), rOop);
    if (is_receiver) {
      *receiver_offset = offset;
    }

    __ cmpptr(rOop, (int32_t)NULL_WORD);
    __ lea(rHandle, Address(rsp, offset));
    // conditionally move a NULL from the handle area where it was just stored
    __ cmovptr(Assembler::equal, rHandle, Address(rsp, offset));
  }

  // If arg is on the stack then place it otherwise it is already in correct reg.
  if (dst.first()->is_stack()) {
    __ movptr(Address(rsp, reg2offset_out(dst.first())), rHandle);
  }
}
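
// Net effect of object_move(): the callee never sees a raw oop. It receives
// either NULL (when the incoming oop was NULL) or the address of a stack slot
// that holds the oop, and that slot is registered in the OopMap so the GC can
// relocate the oop while the native call is in progress.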

// A float arg may have to do float reg to int reg conversion
static void float_move(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
  assert(!src.second()->is_valid() && !dst.second()->is_valid(), "bad float_move");

  // The calling convention assures us that each VMRegPair is either
  // all really one physical register or adjacent stack slots.
  // This greatly simplifies the cases here compared to sparc.

  if (src.first()->is_stack()) {
    if (dst.first()->is_stack()) {
      __ movl(rax, Address(rbp, reg2offset_in(src.first())));
      __ movptr(Address(rsp, reg2offset_out(dst.first())), rax);
    } else {
      // stack to reg
      assert(dst.first()->is_XMMRegister(), "only expect xmm registers as parameters");
      __ movflt(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first())));
    }
  } else if (dst.first()->is_stack()) {
    // reg to stack
    assert(src.first()->is_XMMRegister(), "only expect xmm registers as parameters");
    __ movflt(Address(rsp, reg2offset_out(dst.first())), src.first()->as_XMMRegister());
  } else {
    // reg to reg
    // In theory these overlap but the ordering is such that this is likely a nop
    if ( src.first() != dst.first()) {
      __ movdbl(dst.first()->as_XMMRegister(),  src.first()->as_XMMRegister());
    }
  }
}

// A long move
static void long_move(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {

  // The calling convention assures us that each VMRegPair is either
  // all really one physical register or adjacent stack slots.
  // This greatly simplifies the cases here compared to sparc.

  if (src.is_single_phys_reg() ) {
    if (dst.is_single_phys_reg()) {
      if (dst.first() != src.first()) {
        __ mov(dst.first()->as_Register(), src.first()->as_Register());
      }
    } else {
      assert(dst.is_single_reg(), "not a stack pair");
      __ movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
    }
  } else if (dst.is_single_phys_reg()) {
    assert(src.is_single_reg(),  "not a stack pair");
    __ movq(dst.first()->as_Register(), Address(rbp, reg2offset_out(src.first())));
  } else {
    assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
    __ movq(rax, Address(rbp, reg2offset_in(src.first())));
    __ movq(Address(rsp, reg2offset_out(dst.first())), rax);
  }
}

// A double move
static void double_move(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {

  // The calling convention assures us that each VMRegPair is either
  // all really one physical register or adjacent stack slots.
  // This greatly simplifies the cases here compared to sparc.

  if (src.is_single_phys_reg() ) {
    if (dst.is_single_phys_reg()) {
      // In theory these overlap but the ordering is such that this is likely a nop
      if ( src.first() != dst.first()) {
        __ movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
      }
    } else {
      assert(dst.is_single_reg(), "not a stack pair");
      __ movdbl(Address(rsp, reg2offset_out(dst.first())), src.first()->as_XMMRegister());
    }
  } else if (dst.is_single_phys_reg()) {
    assert(src.is_single_reg(),  "not a stack pair");
    __ movdbl(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_out(src.first())));
  } else {
    assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
    __ movq(rax, Address(rbp, reg2offset_in(src.first())));
    __ movq(Address(rsp, reg2offset_out(dst.first())), rax);
  }
}
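
// Summary of the small move helpers above, all used when shuffling Java
// arguments into their native (C) locations: move32_64 widens int-like values,
// move_ptr and long_move copy 64-bit raw values, object_move converts an oop
// into a handle, and float_move/double_move copy FP values through xmm
// registers, falling back to rax for the stack-to-stack case.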


void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below frame pointer
  // which by this time is free to use
  switch (ret_type) {
  case T_FLOAT:
    __ movflt(Address(rbp, -wordSize), xmm0);
    break;
  case T_DOUBLE:
    __ movdbl(Address(rbp, -wordSize), xmm0);
    break;
  case T_VOID:  break;
  default: {
    __ movptr(Address(rbp, -wordSize), rax);
    }
  }
}

void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below frame pointer
  // which by this time is free to use
  switch (ret_type) {
  case T_FLOAT:
    __ movflt(xmm0, Address(rbp, -wordSize));
    break;
  case T_DOUBLE:
    __ movdbl(xmm0, Address(rbp, -wordSize));
    break;
  case T_VOID:  break;
  default: {
    __ movptr(rax, Address(rbp, -wordSize));
    }
  }
}
1260
1261static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1262    for ( int i = first_arg ; i < arg_count ; i++ ) {
1263      if (args[i].first()->is_Register()) {
1264        __ push(args[i].first()->as_Register());
1265      } else if (args[i].first()->is_XMMRegister()) {
1266        __ subptr(rsp, 2*wordSize);
1267        __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1268      }
1269    }
1270}
1271
1272static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1273    for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1274      if (args[i].first()->is_Register()) {
1275        __ pop(args[i].first()->as_Register());
1276      } else if (args[i].first()->is_XMMRegister()) {
1277        __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1278        __ addptr(rsp, 2*wordSize);
1279      }
1280    }
1281}
1282
1283
1284static void save_or_restore_arguments(MacroAssembler* masm,
1285                                      const int stack_slots,
1286                                      const int total_in_args,
1287                                      const int arg_save_area,
1288                                      OopMap* map,
1289                                      VMRegPair* in_regs,
1290                                      BasicType* in_sig_bt) {
1291  // if map is non-NULL then the code should store the values,
1292  // otherwise it should load them.
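  // Callers generally invoke this twice around a runtime call: once with a fresh
  // OopMap to spill the arguments (recording any oops), then again with a NULL
  // map to reload them (see check_needs_gc_for_critical_native below).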
1293  int slot = arg_save_area;
1294  // Save down double words first
1295  for ( int i = 0; i < total_in_args; i++) {
1296    if (in_regs[i].first()->is_XMMRegister() && in_sig_bt[i] == T_DOUBLE) {
1297      int offset = slot * VMRegImpl::stack_slot_size;
1298      slot += VMRegImpl::slots_per_word;
1299      assert(slot <= stack_slots, "overflow");
1300      if (map != NULL) {
1301        __ movdbl(Address(rsp, offset), in_regs[i].first()->as_XMMRegister());
1302      } else {
1303        __ movdbl(in_regs[i].first()->as_XMMRegister(), Address(rsp, offset));
1304      }
1305    }
1306    if (in_regs[i].first()->is_Register() &&
1307        (in_sig_bt[i] == T_LONG || in_sig_bt[i] == T_ARRAY)) {
1308      int offset = slot * VMRegImpl::stack_slot_size;
1309      if (map != NULL) {
1310        __ movq(Address(rsp, offset), in_regs[i].first()->as_Register());
1311        if (in_sig_bt[i] == T_ARRAY) {
1312          map->set_oop(VMRegImpl::stack2reg(slot));
1313        }
1314      } else {
1315        __ movq(in_regs[i].first()->as_Register(), Address(rsp, offset));
1316      }
1317      slot += VMRegImpl::slots_per_word;
1318    }
1319  }
1320  // Save or restore single word registers
1321  for ( int i = 0; i < total_in_args; i++) {
1322    if (in_regs[i].first()->is_Register()) {
1323      int offset = slot * VMRegImpl::stack_slot_size;
1324      slot++;
1325      assert(slot <= stack_slots, "overflow");
1326
1327      // Value is in an input register; we must flush it to the stack
1328      const Register reg = in_regs[i].first()->as_Register();
1329      switch (in_sig_bt[i]) {
1330        case T_BOOLEAN:
1331        case T_CHAR:
1332        case T_BYTE:
1333        case T_SHORT:
1334        case T_INT:
1335          if (map != NULL) {
1336            __ movl(Address(rsp, offset), reg);
1337          } else {
1338            __ movl(reg, Address(rsp, offset));
1339          }
1340          break;
1341        case T_ARRAY:
1342        case T_LONG:
1343          // handled above
1344          break;
1345        case T_OBJECT:
1346        default: ShouldNotReachHere();
1347      }
1348    } else if (in_regs[i].first()->is_XMMRegister()) {
1349      if (in_sig_bt[i] == T_FLOAT) {
1350        int offset = slot * VMRegImpl::stack_slot_size;
1351        slot++;
1352        assert(slot <= stack_slots, "overflow");
1353        if (map != NULL) {
1354          __ movflt(Address(rsp, offset), in_regs[i].first()->as_XMMRegister());
1355        } else {
1356          __ movflt(in_regs[i].first()->as_XMMRegister(), Address(rsp, offset));
1357        }
1358      }
1359    } else if (in_regs[i].first()->is_stack()) {
1360      if (in_sig_bt[i] == T_ARRAY && map != NULL) {
1361        int offset_in_older_frame = in_regs[i].first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
1362        map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + stack_slots));
1363      }
1364    }
1365  }
1366}
1367
1368
1369// Check GC_locker::needs_gc and enter the runtime if it's true.  This
1370// keeps a new JNI critical region from starting until a GC has been
1371// forced.  Save down any oops in registers and describe them in an
1372// OopMap.
1373static void check_needs_gc_for_critical_native(MacroAssembler* masm,
1374                                               int stack_slots,
1375                                               int total_c_args,
1376                                               int total_in_args,
1377                                               int arg_save_area,
1378                                               OopMapSet* oop_maps,
1379                                               VMRegPair* in_regs,
1380                                               BasicType* in_sig_bt) {
1381  __ block_comment("check GC_locker::needs_gc");
1382  Label cont;
1383  __ cmp8(ExternalAddress((address)GC_locker::needs_gc_address()), false);
1384  __ jcc(Assembler::equal, cont);
1385
1386  // Save down any incoming oops and call into the runtime to halt for a GC
1387
1388  OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1389  save_or_restore_arguments(masm, stack_slots, total_in_args,
1390                            arg_save_area, map, in_regs, in_sig_bt);
1391
1392  address the_pc = __ pc();
1393  oop_maps->add_gc_map( __ offset(), map);
1394  __ set_last_Java_frame(rsp, noreg, the_pc);
1395
1396  __ block_comment("block_for_jni_critical");
1397  __ movptr(c_rarg0, r15_thread);
1398  __ mov(r12, rsp); // remember sp
1399  __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
1400  __ andptr(rsp, -16); // align stack as required by ABI
1401  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::block_for_jni_critical)));
1402  __ mov(rsp, r12); // restore sp
1403  __ reinit_heapbase();
1404
1405  __ reset_last_Java_frame(false, true);
1406
1407  save_or_restore_arguments(masm, stack_slots, total_in_args,
1408                            arg_save_area, NULL, in_regs, in_sig_bt);
1409
1410  __ bind(cont);
1411#ifdef ASSERT
1412  if (StressCriticalJNINatives) {
1413    // Stress register saving
1414    OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1415    save_or_restore_arguments(masm, stack_slots, total_in_args,
1416                              arg_save_area, map, in_regs, in_sig_bt);
1417    // Destroy argument registers
1418    for (int i = 0; i < total_in_args - 1; i++) {
1419      if (in_regs[i].first()->is_Register()) {
1420        const Register reg = in_regs[i].first()->as_Register();
1421        __ xorptr(reg, reg);
1422      } else if (in_regs[i].first()->is_XMMRegister()) {
1423        __ xorpd(in_regs[i].first()->as_XMMRegister(), in_regs[i].first()->as_XMMRegister());
1424      } else if (in_regs[i].first()->is_FloatRegister()) {
1425        ShouldNotReachHere();
1426      } else if (in_regs[i].first()->is_stack()) {
1427        // Nothing to do
1428      } else {
1429        ShouldNotReachHere();
1430      }
1431      if (in_sig_bt[i] == T_LONG || in_sig_bt[i] == T_DOUBLE) {
1432        i++;
1433      }
1434    }
1435
1436    save_or_restore_arguments(masm, stack_slots, total_in_args,
1437                              arg_save_area, NULL, in_regs, in_sig_bt);
1438  }
1439#endif
1440}
1441
1442// Unpack an array argument into a pointer to the body and the length
1443// if the array is non-null, otherwise pass 0 for both.
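// For example, a Java byte[] argument is expanded to an (int length, jbyte* body)
// pair in the critical-native C signature (T_ARRAY becomes T_INT, T_ADDRESS),
// and both values are passed as zero when the array reference is null.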
1444static void unpack_array_argument(MacroAssembler* masm, VMRegPair reg, BasicType in_elem_type, VMRegPair body_arg, VMRegPair length_arg) {
1445  Register tmp_reg = rax;
1446  assert(!body_arg.first()->is_Register() || body_arg.first()->as_Register() != tmp_reg,
1447         "possible collision");
1448  assert(!length_arg.first()->is_Register() || length_arg.first()->as_Register() != tmp_reg,
1449         "possible collision");
1450
1451  __ block_comment("unpack_array_argument {");
1452
1453  // Pass the length, ptr pair
1454  Label is_null, done;
1455  VMRegPair tmp;
1456  tmp.set_ptr(tmp_reg->as_VMReg());
1457  if (reg.first()->is_stack()) {
1458    // Load the arg up from the stack
1459    move_ptr(masm, reg, tmp);
1460    reg = tmp;
1461  }
1462  __ testptr(reg.first()->as_Register(), reg.first()->as_Register());
1463  __ jccb(Assembler::equal, is_null);
1464  __ lea(tmp_reg, Address(reg.first()->as_Register(), arrayOopDesc::base_offset_in_bytes(in_elem_type)));
1465  move_ptr(masm, tmp, body_arg);
1466  // load the length relative to the body.
1467  __ movl(tmp_reg, Address(tmp_reg, arrayOopDesc::length_offset_in_bytes() -
1468                           arrayOopDesc::base_offset_in_bytes(in_elem_type)));
1469  move32_64(masm, tmp, length_arg);
1470  __ jmpb(done);
1471  __ bind(is_null);
1472  // Pass zeros
1473  __ xorptr(tmp_reg, tmp_reg);
1474  move_ptr(masm, tmp, body_arg);
1475  move32_64(masm, tmp, length_arg);
1476  __ bind(done);
1477
1478  __ block_comment("} unpack_array_argument");
1479}
1480
1481
1482// Different signatures may require very different orders for the move
1483// to avoid clobbering other arguments.  There's no simple way to
1484// order them safely.  Compute a safe order for issuing stores and
1485// break any cycles in those stores.  This code is fairly general but
1486// it's not necessary on the other platforms so we keep it in the
1487// platform dependent code instead of moving it into a shared file.
1488// (See bugs 7013347 & 7145024.)
1489// Note that this code is specific to LP64.
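// For example, if one incoming argument must move rdx->rsi while another must
// move rsi->rdx, the two stores form a cycle; break_cycle() below redirects one
// of them through the temporary (e.g. rdx->tmp, rsi->rdx, then tmp->rsi).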
1490class ComputeMoveOrder: public StackObj {
1491  class MoveOperation: public ResourceObj {
1492    friend class ComputeMoveOrder;
1493   private:
1494    VMRegPair        _src;
1495    VMRegPair        _dst;
1496    int              _src_index;
1497    int              _dst_index;
1498    bool             _processed;
1499    MoveOperation*  _next;
1500    MoveOperation*  _prev;
1501
1502    static int get_id(VMRegPair r) {
1503      return r.first()->value();
1504    }
1505
1506   public:
1507    MoveOperation(int src_index, VMRegPair src, int dst_index, VMRegPair dst):
1508      _src(src)
1509    , _src_index(src_index)
1510    , _dst(dst)
1511    , _dst_index(dst_index)
1512    , _next(NULL)
1513    , _prev(NULL)
1514    , _processed(false) {
1515    }
1516
1517    VMRegPair src() const              { return _src; }
1518    int src_id() const                 { return get_id(src()); }
1519    int src_index() const              { return _src_index; }
1520    VMRegPair dst() const              { return _dst; }
1521    void set_dst(int i, VMRegPair dst) { _dst_index = i; _dst = dst; }
1522    int dst_index() const              { return _dst_index; }
1523    int dst_id() const                 { return get_id(dst()); }
1524    MoveOperation* next() const       { return _next; }
1525    MoveOperation* prev() const       { return _prev; }
1526    void set_processed()               { _processed = true; }
1527    bool is_processed() const          { return _processed; }
1528
1529    // insert
1530    void break_cycle(VMRegPair temp_register) {
1531      // create a new store following the last store
1532      // to move from the temp_register to the original
1533      MoveOperation* new_store = new MoveOperation(-1, temp_register, dst_index(), dst());
1534
1535      // break the cycle of links and insert new_store at the end
1536      // break the reverse link.
1537      MoveOperation* p = prev();
1538      assert(p->next() == this, "must be");
1539      _prev = NULL;
1540      p->_next = new_store;
1541      new_store->_prev = p;
1542
1543      // change the original store to save its value in the temp.
1544      set_dst(-1, temp_register);
1545    }
1546
1547    void link(GrowableArray<MoveOperation*>& killer) {
1548      // link this store in front of the store that it depends on
1549      MoveOperation* n = killer.at_grow(src_id(), NULL);
1550      if (n != NULL) {
1551        assert(_next == NULL && n->_prev == NULL, "shouldn't have been set yet");
1552        _next = n;
1553        n->_prev = this;
1554      }
1555    }
1556  };
1557
1558 private:
1559  GrowableArray<MoveOperation*> edges;
1560
1561 public:
1562  ComputeMoveOrder(int total_in_args, VMRegPair* in_regs, int total_c_args, VMRegPair* out_regs,
1563                    BasicType* in_sig_bt, GrowableArray<int>& arg_order, VMRegPair tmp_vmreg) {
1564    // Move operations where the dest is the stack can all be
1565    // scheduled first since they can't interfere with the other moves.
1566    for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
1567      if (in_sig_bt[i] == T_ARRAY) {
1568        c_arg--;
1569        if (out_regs[c_arg].first()->is_stack() &&
1570            out_regs[c_arg + 1].first()->is_stack()) {
1571          arg_order.push(i);
1572          arg_order.push(c_arg);
1573        } else {
1574          if (out_regs[c_arg].first()->is_stack() ||
1575              in_regs[i].first() == out_regs[c_arg].first()) {
1576            add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg + 1]);
1577          } else {
1578            add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
1579          }
1580        }
1581      } else if (in_sig_bt[i] == T_VOID) {
1582        arg_order.push(i);
1583        arg_order.push(c_arg);
1584      } else {
1585        if (out_regs[c_arg].first()->is_stack() ||
1586            in_regs[i].first() == out_regs[c_arg].first()) {
1587          arg_order.push(i);
1588          arg_order.push(c_arg);
1589        } else {
1590          add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
1591        }
1592      }
1593    }
1594    // Break any cycles in the register moves and emit them in the
1595    // proper order.
1596    GrowableArray<MoveOperation*>* stores = get_store_order(tmp_vmreg);
1597    for (int i = 0; i < stores->length(); i++) {
1598      arg_order.push(stores->at(i)->src_index());
1599      arg_order.push(stores->at(i)->dst_index());
1600    }
1601 }
1602
1603  // Collect all the move operations
1604  void add_edge(int src_index, VMRegPair src, int dst_index, VMRegPair dst) {
1605    if (src.first() == dst.first()) return;
1606    edges.append(new MoveOperation(src_index, src, dst_index, dst));
1607  }
1608
1609  // Walk the edges breaking cycles between moves.  The result list
1610  // can be walked in order to produce the proper set of loads
1611  GrowableArray<MoveOperation*>* get_store_order(VMRegPair temp_register) {
1612    // Record which moves kill which values
1613    GrowableArray<MoveOperation*> killer;
1614    for (int i = 0; i < edges.length(); i++) {
1615      MoveOperation* s = edges.at(i);
1616      assert(killer.at_grow(s->dst_id(), NULL) == NULL, "only one killer");
1617      killer.at_put_grow(s->dst_id(), s, NULL);
1618    }
1619    assert(killer.at_grow(MoveOperation::get_id(temp_register), NULL) == NULL,
1620           "make sure temp isn't in the registers that are killed");
1621
1622    // create links between loads and stores
1623    for (int i = 0; i < edges.length(); i++) {
1624      edges.at(i)->link(killer);
1625    }
1626
1627    // at this point, all the move operations are chained together
1628    // in a doubly linked list.  Processing it backwards finds
1629    // the beginning of the chain, forwards finds the end.  If there's
1630    // a cycle it can be broken at any point, so pick an edge and walk
1631    // backward until the list ends or we end where we started.
1632    GrowableArray<MoveOperation*>* stores = new GrowableArray<MoveOperation*>();
1633    for (int e = 0; e < edges.length(); e++) {
1634      MoveOperation* s = edges.at(e);
1635      if (!s->is_processed()) {
1636        MoveOperation* start = s;
1637        // search for the beginning of the chain or cycle
1638        while (start->prev() != NULL && start->prev() != s) {
1639          start = start->prev();
1640        }
1641        if (start->prev() == s) {
1642          start->break_cycle(temp_register);
1643        }
1644        // walk the chain forward inserting to store list
1645        while (start != NULL) {
1646          stores->append(start);
1647          start->set_processed();
1648          start = start->next();
1649        }
1650      }
1651    }
1652    return stores;
1653  }
1654};
1655
1656static void verify_oop_args(MacroAssembler* masm,
1657                            methodHandle method,
1658                            const BasicType* sig_bt,
1659                            const VMRegPair* regs) {
1660  Register temp_reg = rbx;  // not part of any compiled calling seq
1661  if (VerifyOops) {
1662    for (int i = 0; i < method->size_of_parameters(); i++) {
1663      if (sig_bt[i] == T_OBJECT ||
1664          sig_bt[i] == T_ARRAY) {
1665        VMReg r = regs[i].first();
1666        assert(r->is_valid(), "bad oop arg");
1667        if (r->is_stack()) {
1668          __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1669          __ verify_oop(temp_reg);
1670        } else {
1671          __ verify_oop(r->as_Register());
1672        }
1673      }
1674    }
1675  }
1676}
1677
1678static void gen_special_dispatch(MacroAssembler* masm,
1679                                 methodHandle method,
1680                                 const BasicType* sig_bt,
1681                                 const VMRegPair* regs) {
1682  verify_oop_args(masm, method, sig_bt, regs);
1683  vmIntrinsics::ID iid = method->intrinsic_id();
1684
1685  // Now write the args into the outgoing interpreter space
1686  bool     has_receiver   = false;
1687  Register receiver_reg   = noreg;
1688  int      member_arg_pos = -1;
1689  Register member_reg     = noreg;
1690  int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1691  if (ref_kind != 0) {
1692    member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1693    member_reg = rbx;  // known to be free at this point
1694    has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1695  } else if (iid == vmIntrinsics::_invokeBasic) {
1696    has_receiver = true;
1697  } else {
1698    fatal("unexpected intrinsic id %d", iid);
1699  }
1700
1701  if (member_reg != noreg) {
1702    // Load the member_arg into register, if necessary.
1703    SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1704    VMReg r = regs[member_arg_pos].first();
1705    if (r->is_stack()) {
1706      __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1707    } else {
1708      // no data motion is needed
1709      member_reg = r->as_Register();
1710    }
1711  }
1712
1713  if (has_receiver) {
1714    // Make sure the receiver is loaded into a register.
1715    assert(method->size_of_parameters() > 0, "oob");
1716    assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1717    VMReg r = regs[0].first();
1718    assert(r->is_valid(), "bad receiver arg");
1719    if (r->is_stack()) {
1720      // Porting note:  This assumes that compiled calling conventions always
1721      // pass the receiver oop in a register.  If this is not true on some
1722      // platform, pick a temp and load the receiver from stack.
1723      fatal("receiver always in a register");
1724      receiver_reg = j_rarg0;  // known to be free at this point
1725      __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1726    } else {
1727      // no data motion is needed
1728      receiver_reg = r->as_Register();
1729    }
1730  }
1731
1732  // Figure out which address we are really jumping to:
1733  MethodHandles::generate_method_handle_dispatch(masm, iid,
1734                                                 receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1735}
1736
1737// ---------------------------------------------------------------------------
1738// Generate a native wrapper for a given method.  The method takes arguments
1739// in the Java compiled code convention, marshals them to the native
1740// convention (handlizes oops, etc), transitions to native, makes the call,
1741// returns to java state (possibly blocking), unhandlizes any result and
1742// returns.
1743//
1744// Critical native functions are a shorthand for the use of
1745// GetPrimitiveArrayCritical and disallow the use of any other JNI
1746// functions.  The wrapper is expected to unpack the arguments before
1747// passing them to the callee and perform checks before and after the
1748// native call to ensure that the GC_locker
1749// lock_critical/unlock_critical semantics are followed.  Some other
1750// parts of JNI setup are skipped, like the tear down of the JNI handle
1751// block and the check for pending exceptions, since it's impossible for
1752// them to be thrown.
1753//
1754// They are roughly structured like this:
1755//    if (GC_locker::needs_gc())
1756//      SharedRuntime::block_for_jni_critical();
1757//    transition to thread_in_native
1758//    unpack array arguments and call native entry point
1759//    check for safepoint in progress
1760//    check if any thread suspend flags are set
1761//      call into JVM and possibly unlock the JNI critical section
1762//      if a GC was suppressed while in the critical native.
1763//    transition back to thread_in_Java
1764//    return to caller
1765//
1766nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1767                                                methodHandle method,
1768                                                int compile_id,
1769                                                BasicType* in_sig_bt,
1770                                                VMRegPair* in_regs,
1771                                                BasicType ret_type) {
1772  if (method->is_method_handle_intrinsic()) {
1773    vmIntrinsics::ID iid = method->intrinsic_id();
1774    intptr_t start = (intptr_t)__ pc();
1775    int vep_offset = ((intptr_t)__ pc()) - start;
1776    gen_special_dispatch(masm,
1777                         method,
1778                         in_sig_bt,
1779                         in_regs);
1780    int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
1781    __ flush();
1782    int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
1783    return nmethod::new_native_nmethod(method,
1784                                       compile_id,
1785                                       masm->code(),
1786                                       vep_offset,
1787                                       frame_complete,
1788                                       stack_slots / VMRegImpl::slots_per_word,
1789                                       in_ByteSize(-1),
1790                                       in_ByteSize(-1),
1791                                       (OopMapSet*)NULL);
1792  }
1793  bool is_critical_native = true;
1794  address native_func = method->critical_native_function();
1795  if (native_func == NULL) {
1796    native_func = method->native_function();
1797    is_critical_native = false;
1798  }
1799  assert(native_func != NULL, "must have function");
1800
1801  // An OopMap for lock (and class if static)
1802  OopMapSet *oop_maps = new OopMapSet();
1803  intptr_t start = (intptr_t)__ pc();
1804
1805  // We have received a description of where all the java args are located
1806  // on entry to the wrapper. We need to convert these args to where
1807  // the jni function will expect them. To figure out where they go
1808  // we convert the java signature to a C signature by inserting
1809  // the hidden arguments as arg[0] and possibly arg[1] (static method)
1810
1811  const int total_in_args = method->size_of_parameters();
1812  int total_c_args = total_in_args;
1813  if (!is_critical_native) {
1814    total_c_args += 1;
1815    if (method->is_static()) {
1816      total_c_args++;
1817    }
1818  } else {
1819    for (int i = 0; i < total_in_args; i++) {
1820      if (in_sig_bt[i] == T_ARRAY) {
1821        total_c_args++;
1822      }
1823    }
1824  }
1825
1826  BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1827  VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1828  BasicType* in_elem_bt = NULL;
1829
1830  int argc = 0;
1831  if (!is_critical_native) {
1832    out_sig_bt[argc++] = T_ADDRESS;
1833    if (method->is_static()) {
1834      out_sig_bt[argc++] = T_OBJECT;
1835    }
1836
1837    for (int i = 0; i < total_in_args ; i++ ) {
1838      out_sig_bt[argc++] = in_sig_bt[i];
1839    }
1840  } else {
1841    Thread* THREAD = Thread::current();
1842    in_elem_bt = NEW_RESOURCE_ARRAY(BasicType, total_in_args);
1843    SignatureStream ss(method->signature());
1844    for (int i = 0; i < total_in_args ; i++ ) {
1845      if (in_sig_bt[i] == T_ARRAY) {
1846        // Arrays are passed as int, elem* pair
1847        out_sig_bt[argc++] = T_INT;
1848        out_sig_bt[argc++] = T_ADDRESS;
1849        Symbol* atype = ss.as_symbol(CHECK_NULL);
1850        const char* at = atype->as_C_string();
1851        if (strlen(at) == 2) {
1852          assert(at[0] == '[', "must be");
1853          switch (at[1]) {
1854            case 'B': in_elem_bt[i]  = T_BYTE; break;
1855            case 'C': in_elem_bt[i]  = T_CHAR; break;
1856            case 'D': in_elem_bt[i]  = T_DOUBLE; break;
1857            case 'F': in_elem_bt[i]  = T_FLOAT; break;
1858            case 'I': in_elem_bt[i]  = T_INT; break;
1859            case 'J': in_elem_bt[i]  = T_LONG; break;
1860            case 'S': in_elem_bt[i]  = T_SHORT; break;
1861            case 'Z': in_elem_bt[i]  = T_BOOLEAN; break;
1862            default: ShouldNotReachHere();
1863          }
1864        }
1865      } else {
1866        out_sig_bt[argc++] = in_sig_bt[i];
1867        in_elem_bt[i] = T_VOID;
1868      }
1869      if (in_sig_bt[i] != T_VOID) {
1870        assert(in_sig_bt[i] == ss.type(), "must match");
1871        ss.next();
1872      }
1873    }
1874  }
1875
1876  // Now figure out where the args must be stored and how much stack space
1877  // they require.
1878  int out_arg_slots;
1879  out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args);
1880
1881  // Compute framesize for the wrapper.  We need to handlize all oops in
1882  // incoming registers
1883
1884  // Calculate the total number of stack slots we will need.
1885
1886  // First count the abi requirement plus all of the outgoing args
1887  int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1888
1889  // Now the space for the inbound oop handle area
1890  int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
1891  if (is_critical_native) {
1892    // Critical natives may have to call out so they need a save area
1893    // for register arguments.
1894    int double_slots = 0;
1895    int single_slots = 0;
1896    for ( int i = 0; i < total_in_args; i++) {
1897      if (in_regs[i].first()->is_Register()) {
1898        const Register reg = in_regs[i].first()->as_Register();
1899        switch (in_sig_bt[i]) {
1900          case T_BOOLEAN:
1901          case T_BYTE:
1902          case T_SHORT:
1903          case T_CHAR:
1904          case T_INT:  single_slots++; break;
1905          case T_ARRAY:  // specific to LP64 (7145024)
1906          case T_LONG: double_slots++; break;
1907          default:  ShouldNotReachHere();
1908        }
1909      } else if (in_regs[i].first()->is_XMMRegister()) {
1910        switch (in_sig_bt[i]) {
1911          case T_FLOAT:  single_slots++; break;
1912          case T_DOUBLE: double_slots++; break;
1913          default:  ShouldNotReachHere();
1914        }
1915      } else if (in_regs[i].first()->is_FloatRegister()) {
1916        ShouldNotReachHere();
1917      }
1918    }
1919    total_save_slots = double_slots * 2 + single_slots;
1920    // align the save area
1921    if (double_slots != 0) {
1922      stack_slots = round_to(stack_slots, 2);
1923    }
1924  }
1925
1926  int oop_handle_offset = stack_slots;
1927  stack_slots += total_save_slots;
1928
1929  // Now any space we need for handlizing a klass if static method
1930
1931  int klass_slot_offset = 0;
1932  int klass_offset = -1;
1933  int lock_slot_offset = 0;
1934  bool is_static = false;
1935
1936  if (method->is_static()) {
1937    klass_slot_offset = stack_slots;
1938    stack_slots += VMRegImpl::slots_per_word;
1939    klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1940    is_static = true;
1941  }
1942
1943  // Plus a lock if needed
1944
1945  if (method->is_synchronized()) {
1946    lock_slot_offset = stack_slots;
1947    stack_slots += VMRegImpl::slots_per_word;
1948  }
1949
1950  // Now a place (+2) to save return values or temp during shuffling
1951  // + 4 for return address (which we own) and saved rbp
1952  stack_slots += 6;
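  // (6 slots = 24 bytes: 2 slots for the return value / shuffle temp word plus
  // 4 slots for the return address and saved rbp, which is why the frame setup
  // below only subtracts stack_size - 2*wordSize.)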
1953
1954  // OK, the space we have allocated will look like:
1955  //
1956  //
1957  // FP-> |                     |
1958  //      |---------------------|
1959  //      | 2 slots for moves   |
1960  //      |---------------------|
1961  //      | lock box (if sync)  |
1962  //      |---------------------| <- lock_slot_offset
1963  //      | klass (if static)   |
1964  //      |---------------------| <- klass_slot_offset
1965  //      | oopHandle area      |
1966  //      |---------------------| <- oop_handle_offset (6 java arg registers)
1967  //      | outbound memory     |
1968  //      | based arguments     |
1969  //      |                     |
1970  //      |---------------------|
1971  //      |                     |
1972  // SP-> | out_preserved_slots |
1973  //
1974  //
1975
1976
1977  // Now compute actual number of stack words we need rounding to make
1978  // stack properly aligned.
1979  stack_slots = round_to(stack_slots, StackAlignmentInSlots);
1980
1981  int stack_size = stack_slots * VMRegImpl::stack_slot_size;
1982
1983  // First thing make an ic check to see if we should even be here
1984
1985  // We are free to use all registers as temps without saving them and
1986  // restoring them except rbp. rbp is the only callee save register
1987  // as far as the interpreter and the compiler(s) are concerned.
1988
1989
1990  const Register ic_reg = rax;
1991  const Register receiver = j_rarg0;
1992
1993  Label hit;
1994  Label exception_pending;
1995
1996  assert_different_registers(ic_reg, receiver, rscratch1);
1997  __ verify_oop(receiver);
1998  __ load_klass(rscratch1, receiver);
1999  __ cmpq(ic_reg, rscratch1);
2000  __ jcc(Assembler::equal, hit);
2001
2002  __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
2003
2004  // Verified entry point must be aligned
2005  __ align(8);
2006
2007  __ bind(hit);
2008
2009  int vep_offset = ((intptr_t)__ pc()) - start;
2010
2011  // The instruction at the verified entry point must be 5 bytes or longer
2012  // because it can be patched on the fly by make_non_entrant. The stack bang
2013  // instruction fits that requirement.
2014
2015  // Generate stack overflow check
2016
2017  if (UseStackBanging) {
2018    __ bang_stack_with_offset(StackShadowPages*os::vm_page_size());
2019  } else {
2020    // need a 5 byte instruction to allow MT safe patching to non-entrant
2021    __ fat_nop();
2022  }
2023
2024  // Generate a new frame for the wrapper.
2025  __ enter();
2026  // -2 because return address is already present and so is saved rbp
2027  __ subptr(rsp, stack_size - 2*wordSize);
2028
2029  // Frame is now completed as far as size and linkage.
2030  int frame_complete = ((intptr_t)__ pc()) - start;
2031
2032    if (UseRTMLocking) {
2033      // Abort RTM transaction before calling JNI
2034      // because critical section will be large and will be
2035      // aborted anyway. Also nmethod could be deoptimized.
2036      __ xabort(0);
2037    }
2038
2039#ifdef ASSERT
2040    {
2041      Label L;
2042      __ mov(rax, rsp);
2043      __ andptr(rax, -16); // must be 16 byte boundary (see amd64 ABI)
2044      __ cmpptr(rax, rsp);
2045      __ jcc(Assembler::equal, L);
2046      __ stop("improperly aligned stack");
2047      __ bind(L);
2048    }
2049#endif /* ASSERT */
2050
2051
2052  // We use r14 as the oop handle for the receiver/klass
2053  // It is callee save so it survives the call to native
2054
2055  const Register oop_handle_reg = r14;
2056
2057  if (is_critical_native) {
2058    check_needs_gc_for_critical_native(masm, stack_slots, total_c_args, total_in_args,
2059                                       oop_handle_offset, oop_maps, in_regs, in_sig_bt);
2060  }
2061
2062  //
2063  // We immediately shuffle the arguments so that any vm call we have to
2064  // make from here on out (sync slow path, jvmti, etc.) we will have
2065  // captured the oops from our caller and have a valid oopMap for
2066  // them.
2067
2068  // -----------------
2069  // The Grand Shuffle
2070
2071  // The Java calling convention is either equal (linux) or denser (win64) than the
2072  // C calling convention. However, because of the jni_env argument, the C calling
2073  // convention always has at least one more argument (two more for static) than Java.
2074  // Therefore if we move the args from java -> c backwards then we will never have
2075  // a register->register conflict and we don't have to build a dependency graph
2076  // and figure out how to break any cycles.
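  // Concretely: the i-th Java argument lands in the (i+1)-th (or, for a static
  // method, the (i+2)-th) C slot, so issuing the moves from the last argument
  // down to the first means a move's destination is never the source of a move
  // that is still pending.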
2077  //
2078
2079  // Record esp-based slot for receiver on stack for non-static methods
2080  int receiver_offset = -1;
2081
2082  // This is a trick. We double the stack slots so we can claim
2083  // the oops in the caller's frame. Since we are sure to have
2084  // more args than the caller, doubling is enough to make
2085  // sure we can capture all the incoming oop args from the
2086  // caller.
2087  //
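  // (The doubled size matters for stack-resident incoming oops: they are recorded
  //  at offset_in_older_frame + stack_slots, i.e. at a slot index that reaches
  //  into the caller's frame -- compare save_or_restore_arguments above.)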
2088  OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2089
2090  // Mark location of rbp (someday)
2091  // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
2092
2093  // Use eax, ebx as temporaries during any memory-memory moves we have to do
2094  // All inbound args are referenced based on rbp and all outbound args via rsp.
2095
2096
2097#ifdef ASSERT
2098  bool reg_destroyed[RegisterImpl::number_of_registers];
2099  bool freg_destroyed[XMMRegisterImpl::number_of_registers];
2100  for ( int r = 0 ; r < RegisterImpl::number_of_registers ; r++ ) {
2101    reg_destroyed[r] = false;
2102  }
2103  for ( int f = 0 ; f < XMMRegisterImpl::number_of_registers ; f++ ) {
2104    freg_destroyed[f] = false;
2105  }
2106
2107#endif /* ASSERT */
2108
2109  // This may iterate in two different directions depending on the
2110  // kind of native it is.  The reason is that for regular JNI natives
2111  // the incoming and outgoing registers are offset upwards and for
2112  // critical natives they are offset down.
2113  GrowableArray<int> arg_order(2 * total_in_args);
2114  VMRegPair tmp_vmreg;
2115  tmp_vmreg.set1(rbx->as_VMReg());
2116
2117  if (!is_critical_native) {
2118    for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2119      arg_order.push(i);
2120      arg_order.push(c_arg);
2121    }
2122  } else {
2123    // Compute a valid move order, using tmp_vmreg to break any cycles
2124    ComputeMoveOrder cmo(total_in_args, in_regs, total_c_args, out_regs, in_sig_bt, arg_order, tmp_vmreg);
2125  }
2126
2127  int temploc = -1;
2128  for (int ai = 0; ai < arg_order.length(); ai += 2) {
2129    int i = arg_order.at(ai);
2130    int c_arg = arg_order.at(ai + 1);
2131    __ block_comment(err_msg("move %d -> %d", i, c_arg));
2132    if (c_arg == -1) {
2133      assert(is_critical_native, "should only be required for critical natives");
2134      // This arg needs to be moved to a temporary
2135      __ mov(tmp_vmreg.first()->as_Register(), in_regs[i].first()->as_Register());
2136      in_regs[i] = tmp_vmreg;
2137      temploc = i;
2138      continue;
2139    } else if (i == -1) {
2140      assert(is_critical_native, "should only be required for critical natives");
2141      // Read from the temporary location
2142      assert(temploc != -1, "must be valid");
2143      i = temploc;
2144      temploc = -1;
2145    }
2146#ifdef ASSERT
2147    if (in_regs[i].first()->is_Register()) {
2148      assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2149    } else if (in_regs[i].first()->is_XMMRegister()) {
2150      assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2151    }
2152    if (out_regs[c_arg].first()->is_Register()) {
2153      reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2154    } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2155      freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2156    }
2157#endif /* ASSERT */
2158    switch (in_sig_bt[i]) {
2159      case T_ARRAY:
2160        if (is_critical_native) {
2161          unpack_array_argument(masm, in_regs[i], in_elem_bt[i], out_regs[c_arg + 1], out_regs[c_arg]);
2162          c_arg++;
2163#ifdef ASSERT
2164          if (out_regs[c_arg].first()->is_Register()) {
2165            reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2166          } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2167            freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2168          }
2169#endif
2170          break;
2171        }
2172      case T_OBJECT:
2173        assert(!is_critical_native, "no oop arguments");
2174        object_move(masm, map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2175                    ((i == 0) && (!is_static)),
2176                    &receiver_offset);
2177        break;
2178      case T_VOID:
2179        break;
2180
2181      case T_FLOAT:
2182        float_move(masm, in_regs[i], out_regs[c_arg]);
2183        break;
2184
2185      case T_DOUBLE:
2186        assert( i + 1 < total_in_args &&
2187                in_sig_bt[i + 1] == T_VOID &&
2188                out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2189        double_move(masm, in_regs[i], out_regs[c_arg]);
2190        break;
2191
2192      case T_LONG :
2193        long_move(masm, in_regs[i], out_regs[c_arg]);
2194        break;
2195
2196      case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
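        // Note: asserts are compiled out of product builds, so this case falls
        // through to the default move32_64 handling below.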
2197
2198      default:
2199        move32_64(masm, in_regs[i], out_regs[c_arg]);
2200    }
2201  }
2202
2203  int c_arg;
2204
2205  // Pre-load a static method's oop into r14.  Used both by locking code and
2206  // the normal JNI call code.
2207  if (!is_critical_native) {
2208    // point c_arg at the first arg that is already loaded in case we
2209    // need to spill before we call out
2210    c_arg = total_c_args - total_in_args;
2211
2212    if (method->is_static()) {
2213
2214      //  load oop into a register
2215      __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2216
2217      // Now handlize the static class mirror; it's known to be not-null.
2218      __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2219      map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2220
2221      // Now get the handle
2222      __ lea(oop_handle_reg, Address(rsp, klass_offset));
2223      // store the klass handle as second argument
2224      __ movptr(c_rarg1, oop_handle_reg);
2225      // and protect the arg if we must spill
2226      c_arg--;
2227    }
2228  } else {
2229    // For JNI critical methods we need to save all registers in save_args.
2230    c_arg = 0;
2231  }
2232
2233  // Change state to native (we save the return address in the thread, since it might not
2234  // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2235  // points into the right code segment. It does not have to be the correct return pc.
2236  // We use the same pc/oopMap repeatedly when we call out
2237
2238  intptr_t the_pc = (intptr_t) __ pc();
2239  oop_maps->add_gc_map(the_pc - start, map);
2240
2241  __ set_last_Java_frame(rsp, noreg, (address)the_pc);
2242
2243
2244  // We have all of the arguments set up at this point. We must not touch any register
2245  // argument registers at this point (if we were to save/restore them there would be no oopMap describing them).
2246
2247  {
2248    SkipIfEqual skip(masm, &DTraceMethodProbes, false);
2249    // protect the args we've loaded
2250    save_args(masm, total_c_args, c_arg, out_regs);
2251    __ mov_metadata(c_rarg1, method());
2252    __ call_VM_leaf(
2253      CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2254      r15_thread, c_rarg1);
2255    restore_args(masm, total_c_args, c_arg, out_regs);
2256  }
2257
2258  // RedefineClasses() tracing support for obsolete method entry
2259  if (RC_TRACE_IN_RANGE(0x00001000, 0x00002000)) {
2260    // protect the args we've loaded
2261    save_args(masm, total_c_args, c_arg, out_regs);
2262    __ mov_metadata(c_rarg1, method());
2263    __ call_VM_leaf(
2264      CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2265      r15_thread, c_rarg1);
2266    restore_args(masm, total_c_args, c_arg, out_regs);
2267  }
2268
2269  // Lock a synchronized method
2270
2271  // Register definitions used by locking and unlocking
2272
2273  const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2274  const Register obj_reg  = rbx;  // Will contain the oop
2275  const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2276  const Register old_hdr  = r13;  // value of old header at unlock time
2277
2278  Label slow_path_lock;
2279  Label lock_done;
2280
2281  if (method->is_synchronized()) {
2282    assert(!is_critical_native, "unhandled");
2283
2284
2285    const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2286
2287    // Get the handle (the 2nd argument)
2288    __ mov(oop_handle_reg, c_rarg1);
2289
2290    // Get address of the box
2291
2292    __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2293
2294    // Load the oop from the handle
2295    __ movptr(obj_reg, Address(oop_handle_reg, 0));
2296
2297    if (UseBiasedLocking) {
2298      __ biased_locking_enter(lock_reg, obj_reg, swap_reg, rscratch1, false, lock_done, &slow_path_lock);
2299    }
2300
2301    // Load immediate 1 into swap_reg %rax
2302    __ movl(swap_reg, 1);
2303
2304    // Load (object->mark() | 1) into swap_reg %rax
2305    __ orptr(swap_reg, Address(obj_reg, 0));
2306
2307    // Save (object->mark() | 1) into BasicLock's displaced header
2308    __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2309
2310    if (os::is_MP()) {
2311      __ lock();
2312    }
2313
2314    // src -> dest iff dest == rax else rax <- dest
2315    __ cmpxchgptr(lock_reg, Address(obj_reg, 0));
2316    __ jcc(Assembler::equal, lock_done);
2317
2318    // Hmm should this move to the slow path code area???
2319
2320    // Test if the oopMark is an obvious stack pointer, i.e.,
2321    //  1) (mark & 3) == 0, and
2322    //  2) rsp <= mark < rsp + os::pagesize()
2323    // These 3 tests can be done by evaluating the following
2324    // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2325    // assuming both stack pointer and pagesize have their
2326    // least significant 2 bits clear.
2327    // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
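    // For instance, with a 4K page the mask (3 - os::vm_page_size()) is
    // 0x...f003, so the AND is zero exactly when (mark - rsp) is non-negative,
    // smaller than the page size, and has its low two bits clear -- i.e. the
    // mark is a stack address just above rsp (the recursive-lock case).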
2328
2329    __ subptr(swap_reg, rsp);
2330    __ andptr(swap_reg, 3 - os::vm_page_size());
2331
2332    // Save the test result, for recursive case, the result is zero
2333    __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2334    __ jcc(Assembler::notEqual, slow_path_lock);
2335
2336    // Slow path will re-enter here
2337
2338    __ bind(lock_done);
2339  }
2340
2341
2342  // Finally just about ready to make the JNI call
2343
2344
2345  // get JNIEnv* which is first argument to native
2346  if (!is_critical_native) {
2347    __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2348  }
2349
2350  // Now set thread in native
2351  __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2352
2353  __ call(RuntimeAddress(native_func));
2354
2355  // Verify or restore cpu control state after JNI call
2356  __ restore_cpu_control_state_after_jni();
2357
2358  // Unpack native results.
2359  switch (ret_type) {
2360  case T_BOOLEAN: __ c2bool(rax);            break;
2361  case T_CHAR   : __ movzwl(rax, rax);      break;
2362  case T_BYTE   : __ sign_extend_byte (rax); break;
2363  case T_SHORT  : __ sign_extend_short(rax); break;
2364  case T_INT    : /* nothing to do */        break;
2365  case T_DOUBLE :
2366  case T_FLOAT  :
2367    // Result is in xmm0 we'll save as needed
2368    break;
2369  case T_ARRAY:                 // Really a handle
2370  case T_OBJECT:                // Really a handle
2371      break; // can't de-handlize until after safepoint check
2372  case T_VOID: break;
2373  case T_LONG: break;
2374  default       : ShouldNotReachHere();
2375  }
2376
2377  // Switch thread to "native transition" state before reading the synchronization state.
2378  // This additional state is necessary because reading and testing the synchronization
2379  // state is not atomic w.r.t. GC, as this scenario demonstrates:
2380  //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2381  //     VM thread changes sync state to synchronizing and suspends threads for GC.
2382  //     Thread A is resumed to finish this native method, but doesn't block here since it
2383  //     didn't see any synchronization in progress, and escapes.
2384  __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2385
2386  if (os::is_MP()) {
2387    if (UseMembar) {
2388      // Force this write out before the read below
2389      __ membar(Assembler::Membar_mask_bits(
2390           Assembler::LoadLoad | Assembler::LoadStore |
2391           Assembler::StoreLoad | Assembler::StoreStore));
2392    } else {
2393      // Write serialization page so VM thread can do a pseudo remote membar.
2394      // We use the current thread pointer to calculate a thread specific
2395      // offset to write to within the page. This minimizes bus traffic
2396      // due to cache line collision.
2397      __ serialize_memory(r15_thread, rcx);
2398    }
2399  }
2400
2401  Label after_transition;
2402
2403  // check for safepoint operation in progress and/or pending suspend requests
2404  {
2405    Label Continue;
2406
2407    __ cmp32(ExternalAddress((address)SafepointSynchronize::address_of_state()),
2408             SafepointSynchronize::_not_synchronized);
2409
2410    Label L;
2411    __ jcc(Assembler::notEqual, L);
2412    __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2413    __ jcc(Assembler::equal, Continue);
2414    __ bind(L);
2415
2416    // Don't use call_VM as it will see a possible pending exception and forward it
2417    // and never return here, preventing us from clearing _last_native_pc down below.
2418    // We also can't use call_VM_leaf, as it will check to see if rsi & rdi are
2419    // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2420    // by hand.
2421    //
2422    save_native_result(masm, ret_type, stack_slots);
2423    __ mov(c_rarg0, r15_thread);
2424    __ mov(r12, rsp); // remember sp
2425    __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2426    __ andptr(rsp, -16); // align stack as required by ABI
2427    if (!is_critical_native) {
2428      __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2429    } else {
2430      __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans_and_transition)));
2431    }
2432    __ mov(rsp, r12); // restore sp
2433    __ reinit_heapbase();
2434    // Restore any method result value
2435    restore_native_result(masm, ret_type, stack_slots);
2436
2437    if (is_critical_native) {
2438      // The call above performed the transition to thread_in_Java so
2439      // skip the transition logic below.
2440      __ jmpb(after_transition);
2441    }
2442
2443    __ bind(Continue);
2444  }
2445
2446  // change thread state
2447  __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2448  __ bind(after_transition);
2449
2450  Label reguard;
2451  Label reguard_done;
2452  __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), JavaThread::stack_guard_yellow_disabled);
2453  __ jcc(Assembler::equal, reguard);
2454  __ bind(reguard_done);
2455
2456  // native result if any is live
2457
2458  // Unlock
2459  Label unlock_done;
2460  Label slow_path_unlock;
2461  if (method->is_synchronized()) {
2462
2463    // Get locked oop from the handle we passed to jni
2464    __ movptr(obj_reg, Address(oop_handle_reg, 0));
2465
2466    Label done;
2467
2468    if (UseBiasedLocking) {
2469      __ biased_locking_exit(obj_reg, old_hdr, done);
2470    }
2471
2472    // Simple recursive lock?
2473
2474    __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), (int32_t)NULL_WORD);
2475    __ jcc(Assembler::equal, done);
2476
2477    // Must save rax if it is live now because cmpxchg must use it
2478    if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2479      save_native_result(masm, ret_type, stack_slots);
2480    }
2481
2482
2483    // get address of the stack lock
2484    __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2485    //  get old displaced header
2486    __ movptr(old_hdr, Address(rax, 0));
2487
2488    // Atomic swap old header if oop still contains the stack lock
2489    if (os::is_MP()) {
2490      __ lock();
2491    }
2492    __ cmpxchgptr(old_hdr, Address(obj_reg, 0));
2493    __ jcc(Assembler::notEqual, slow_path_unlock);
2494
2495    // slow path re-enters here
2496    __ bind(unlock_done);
2497    if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2498      restore_native_result(masm, ret_type, stack_slots);
2499    }
2500
2501    __ bind(done);
2502
2503  }
2504  {
2505    SkipIfEqual skip(masm, &DTraceMethodProbes, false);
2506    save_native_result(masm, ret_type, stack_slots);
2507    __ mov_metadata(c_rarg1, method());
2508    __ call_VM_leaf(
2509         CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2510         r15_thread, c_rarg1);
2511    restore_native_result(masm, ret_type, stack_slots);
2512  }
2513
2514  __ reset_last_Java_frame(false, true);
2515
2516  // Unpack oop result
2517  if (ret_type == T_OBJECT || ret_type == T_ARRAY) {
2518      Label L;
2519      __ testptr(rax, rax);
2520      __ jcc(Assembler::zero, L);
2521      __ movptr(rax, Address(rax, 0));
2522      __ bind(L);
2523      __ verify_oop(rax);
2524  }
2525
2526  if (!is_critical_native) {
2527    // reset handle block
2528    __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2529    __ movl(Address(rcx, JNIHandleBlock::top_offset_in_bytes()), (int32_t)NULL_WORD);
2530  }
2531
2532  // pop our frame
2533
2534  __ leave();
2535
2536  if (!is_critical_native) {
2537    // Any exception pending?
2538    __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2539    __ jcc(Assembler::notEqual, exception_pending);
2540  }
2541
2542  // Return
2543
2544  __ ret(0);
2545
2546  // Unexpected paths are out of line and go here
2547
2548  if (!is_critical_native) {
2549    // forward the exception
2550    __ bind(exception_pending);
2551
2552    // and forward the exception
2553    __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2554  }
2555
2556  // Slow path locking & unlocking
2557  if (method->is_synchronized()) {
2558
2559    // BEGIN Slow path lock
2560    __ bind(slow_path_lock);
2561
2562    // has last_Java_frame setup. No exceptions, so do a vanilla call, not call_VM
2563    // args are (oop obj, BasicLock* lock, JavaThread* thread)
2564
2565    // protect the args we've loaded
2566    save_args(masm, total_c_args, c_arg, out_regs);
2567
2568    __ mov(c_rarg0, obj_reg);
2569    __ mov(c_rarg1, lock_reg);
2570    __ mov(c_rarg2, r15_thread);
2571
2572    // Not a leaf but we have last_Java_frame setup as we want
2573    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2574    restore_args(masm, total_c_args, c_arg, out_regs);
2575
2576#ifdef ASSERT
2577    { Label L;
2578    __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2579    __ jcc(Assembler::equal, L);
2580    __ stop("no pending exception allowed on exit from monitorenter");
2581    __ bind(L);
2582    }
2583#endif
2584    __ jmp(lock_done);
2585
2586    // END Slow path lock
2587
2588    // BEGIN Slow path unlock
2589    __ bind(slow_path_unlock);
2590
2591    // If we haven't already saved the native result we must save it now as xmm registers
2592    // are still exposed.
2593
2594    if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2595      save_native_result(masm, ret_type, stack_slots);
2596    }
2597
2598    __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2599
2600    __ mov(c_rarg0, obj_reg);
2601    __ mov(c_rarg2, r15_thread);
2602    __ mov(r12, rsp); // remember sp
2603    __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2604    __ andptr(rsp, -16); // align stack as required by ABI
2605
2606    // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2607    // NOTE that obj_reg == rbx currently
2608    __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2609    __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2610
2611    // args are (oop obj, BasicLock* lock, JavaThread* thread)
2612    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2613    __ mov(rsp, r12); // restore sp
2614    __ reinit_heapbase();
2615#ifdef ASSERT
2616    {
2617      Label L;
2618      __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int)NULL_WORD);
2619      __ jcc(Assembler::equal, L);
2620      __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2621      __ bind(L);
2622    }
2623#endif /* ASSERT */
2624
2625    __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2626
2627    if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2628      restore_native_result(masm, ret_type, stack_slots);
2629    }
2630    __ jmp(unlock_done);
2631
2632    // END Slow path unlock
2633
2634  } // synchronized
2635
2636  // SLOW PATH Reguard the stack if needed
2637
2638  __ bind(reguard);
2639  save_native_result(masm, ret_type, stack_slots);
2640  __ mov(r12, rsp); // remember sp
2641  __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2642  __ andptr(rsp, -16); // align stack as required by ABI
2643  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2644  __ mov(rsp, r12); // restore sp
2645  __ reinit_heapbase();
2646  restore_native_result(masm, ret_type, stack_slots);
2647  // and continue
2648  __ jmp(reguard_done);
2649
2650
2651
2652  __ flush();
2653
2654  nmethod *nm = nmethod::new_native_nmethod(method,
2655                                            compile_id,
2656                                            masm->code(),
2657                                            vep_offset,
2658                                            frame_complete,
2659                                            stack_slots / VMRegImpl::slots_per_word,
2660                                            (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2661                                            in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2662                                            oop_maps);
2663
2664  if (is_critical_native) {
2665    nm->set_lazy_critical_native(true);
2666  }
2667
2668  return nm;
2669
2670}
2671
2672// This function returns the adjustment (in number of words) to apply to a c2i adapter
2673// activation for use during deoptimization.
2674int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2675  return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2676}
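// For example, a callee with 5 parameters and 9 locals yields an adjustment of
// (9 - 5) * Interpreter::stackElementWords words.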
2677
2678
2679uint SharedRuntime::out_preserve_stack_slots() {
2680  return 0;
2681}
2682
2683//------------------------------generate_deopt_blob----------------------------
2684void SharedRuntime::generate_deopt_blob() {
2685  // Allocate space for the code
2686  ResourceMark rm;
2687  // Setup code generation tools
2688  CodeBuffer buffer("deopt_blob", 2048, 1024);
2689  MacroAssembler* masm = new MacroAssembler(&buffer);
2690  int frame_size_in_words;
2691  OopMap* map = NULL;
2692  OopMapSet *oop_maps = new OopMapSet();
2693
2694  // -------------
2695  // This code enters when returning to a de-optimized nmethod.  A return
2696  // address has been pushed on the stack, and return values are in
2697  // registers.
2698  // If we are doing a normal deopt then we were called from the patched
2699  // nmethod from the point we returned to the nmethod. So the return
2700  // address on the stack is wrong by NativeCall::instruction_size
2701  // We will adjust the value so it looks like we have the original return
2702  // address on the stack (like when we eagerly deoptimized).
2703  // In the case of an exception pending when deoptimizing, we enter
2704  // with a return address on the stack that points after the call we patched
2705  // into the exception handler. We have the following register state from,
2706  // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2707  //    rax: exception oop
2708  //    rbx: exception handler
2709  //    rdx: throwing pc
2710  // So in this case we simply jam rdx into the useless return address and
2711  // the stack looks just like we want.
2712  //
2713  // At this point we need to de-opt.  We save the argument return
2714  // registers.  We call the first C routine, fetch_unroll_info().  This
2715  // routine captures the return values and returns a structure which
2716  // describes the current frame size and the sizes of all replacement frames.
2717  // The current frame is compiled code and may contain many inlined
2718  // functions, each with their own JVM state.  We pop the current frame, then
2719  // push all the new frames.  Then we call the C routine unpack_frames() to
2720  // populate these frames.  Finally unpack_frames() returns us the new target
2721  // address.  Notice that callee-save registers are BLOWN here; they have
2722  // already been captured in the vframeArray at the time the return PC was
2723  // patched.
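  // The blob generated below therefore has several entry points, all of which
  // save the live registers, load an exec mode into r14 and meet at `cont`:
  //   - `start`                   : normal deopt                    (Unpack_deopt)
  //   - `reexecute_offset`        : re-execute at the bci           (Unpack_reexecute)
  //   - `exception_offset` /
  //     `exception_in_tls_offset` : deopt with a pending exception  (Unpack_exception)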
2724  address start = __ pc();
2725  Label cont;
2726
2727  // Prolog for the non-exception case!
2728
2729  // Save everything in sight.
2730  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
2731
2732  // Normal deoptimization.  Save exec mode for unpack_frames.
2733  __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2734  __ jmp(cont);
2735
2736  int reexecute_offset = __ pc() - start;
2737
2738  // Reexecute case
2739  // The return address is the pc that describes what bci to re-execute at
2740
2741  // No need to update map as each call to save_live_registers will produce identical oopmap
2742  (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
2743
2744  __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2745  __ jmp(cont);
2746
2747  int exception_offset = __ pc() - start;
2748
2749  // Prolog for exception case
2750
2751  // All registers are dead at this entry point, except for rax and
2752  // rdx, which contain the exception oop and exception pc
2753  // respectively.  Set them in TLS and fall through to the
2754  // unpack_with_exception_in_tls entry point.
2755
2756  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2757  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2758
2759  int exception_in_tls_offset = __ pc() - start;
2760
2761  // new implementation because exception oop is now passed in JavaThread
2762
2763  // Prolog for exception case
2764  // All registers must be preserved because they might be used by LinearScan
2765  // Exception oop and throwing PC are passed in JavaThread
2766  // tos: stack at point of call to method that threw the exception (i.e. only
2767  // args are on the stack, no return address)
2768
2769  // make room on stack for the return address
2770  // It will be patched later with the throwing pc. The correct value is not
2771  // available now because loading it from memory would destroy registers.
2772  __ push(0);
2773
2774  // Save everything in sight.
2775  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
2776
2777  // Now it is safe to overwrite any register
2778
2779  // Deopt during an exception.  Save exec mode for unpack_frames.
2780  __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2781
2782  // load throwing pc from JavaThread and patch it as the return address
2783  // of the current frame. Then clear the field in JavaThread
2784
2785  __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2786  __ movptr(Address(rbp, wordSize), rdx);
2787  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2788
2789#ifdef ASSERT
2790  // verify that there is really an exception oop in JavaThread
2791  __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2792  __ verify_oop(rax);
2793
2794  // verify that there is no pending exception
2795  Label no_pending_exception;
2796  __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2797  __ testptr(rax, rax);
2798  __ jcc(Assembler::zero, no_pending_exception);
2799  __ stop("must not have pending exception here");
2800  __ bind(no_pending_exception);
2801#endif
2802
2803  __ bind(cont);
2804
2805  // Call C code.  Need thread and this frame, but NOT official VM entry
2806  // crud.  We cannot block on this call, no GC can happen.
2807  //
2808  // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2809
2810  // fetch_unroll_info needs to call last_java_frame().
2811
2812  __ set_last_Java_frame(noreg, noreg, NULL);
2813#ifdef ASSERT
2814  { Label L;
2815    __ cmpptr(Address(r15_thread,
2816                    JavaThread::last_Java_fp_offset()),
2817            (int32_t)0);
2818    __ jcc(Assembler::equal, L);
2819    __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2820    __ bind(L);
2821  }
2822#endif // ASSERT
2823  __ mov(c_rarg0, r15_thread);
2824  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2825
2826  // Need to have an oopmap that tells fetch_unroll_info where to
2827  // find any register it might need.
2828  oop_maps->add_gc_map(__ pc() - start, map);
2829
2830  __ reset_last_Java_frame(false, false);
2831
2832  // Load UnrollBlock* into rdi
2833  __ mov(rdi, rax);
2834
2835  Label noException;
2836  __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2837  __ jcc(Assembler::notEqual, noException);
2838  __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2839  // QQQ this is useless it was NULL above
2840  __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2841  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int32_t)NULL_WORD);
2842  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2843
2844  __ verify_oop(rax);
2845
2846  // Overwrite the result registers with the exception results.
2847  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2848  // I think this is useless
2849  __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2850
2851  __ bind(noException);
2852
2853  // Only register save data is on the stack.
2854  // Now restore the result registers.  Everything else is either dead
2855  // or captured in the vframeArray.
2856  RegisterSaver::restore_result_registers(masm);
2857
2858  // All of the register save area has been popped off the stack. Only the
2859  // return address remains.
2860
2861  // Pop all the frames we must move/replace.
2862  //
2863  // Frame picture (youngest to oldest)
2864  // 1: self-frame (no frame link)
2865  // 2: deopting frame  (no frame link)
2866  // 3: caller of deopting frame (could be compiled/interpreted).
2867  //
2868  // Note: by leaving the return address of self-frame on the stack
2869  // and using the size of frame 2 to adjust the stack
2870  // when we are done the return to frame 3 will still be on the stack.
2871
2872  // Pop deoptimized frame
2873  __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes()));
2874  __ addptr(rsp, rcx);
2875
2876  // rsp should be pointing at the return address to the caller (3)
2877
2878  // Pick up the initial fp we should save
2879  // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2880  __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
2881
2882#ifdef ASSERT
2883  // Compilers generate code that bangs the stack by as much as the
2884  // interpreter would need. So this stack banging should never
2885  // trigger a fault. Verify that it does not on non-product builds.
2886  if (UseStackBanging) {
2887    __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
2888    __ bang_stack_size(rbx, rcx);
2889  }
2890#endif
2891
2892  // Load address of array of frame pcs into rcx
2893  __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
2894
2895  // Trash the old pc
2896  __ addptr(rsp, wordSize);
2897
2898  // Load address of array of frame sizes into rsi
2899  __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
2900
2901  // Load counter into rdx
2902  __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes()));
2903
2904  // Now adjust the caller's stack to make up for the extra locals
2905  // but record the original sp so that we can save it in the skeletal interpreter
2906  // frame, and so the stack walking of interpreter_sender will get the unextended sp
2907  // value and not the "real" sp value.
2908
2909  const Register sender_sp = r8;
2910
2911  __ mov(sender_sp, rsp);
2912  __ movl(rbx, Address(rdi,
2913                       Deoptimization::UnrollBlock::
2914                       caller_adjustment_offset_in_bytes()));
2915  __ subptr(rsp, rbx);
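  // caller_adjustment is, roughly, the number of bytes by which the caller's
  // frame must be extended so the oldest skeletal interpreter frame has room
  // for its locals beyond the arguments the caller already pushed
  // (cf. last_frame_adjust() above).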
2916
2917  // Push interpreter frames in a loop
2918  Label loop;
2919  __ bind(loop);
2920  __ movptr(rbx, Address(rsi, 0));      // Load frame size
2921#ifdef CC_INTERP
2922  __ subptr(rbx, 4*wordSize);           // we'll push pc and ebp by hand and
2923#ifdef ASSERT
2924  __ push(0xDEADDEAD);                  // Make a recognizable pattern
2925  __ push(0xDEADDEAD);
2926#else /* ASSERT */
2927  __ subptr(rsp, 2*wordSize);           // skip the "static long no_param"
2928#endif /* ASSERT */
2929#else
2930  __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
2931#endif // CC_INTERP
2932  __ pushptr(Address(rcx, 0));          // Save return address
2933  __ enter();                           // Save old & set new ebp
2934  __ subptr(rsp, rbx);                  // Prolog
2935#ifdef CC_INTERP
2936  __ movptr(Address(rbp,
2937                  -(sizeof(BytecodeInterpreter)) + in_bytes(byte_offset_of(BytecodeInterpreter, _sender_sp))),
2938            sender_sp); // Make it walkable
2939#else /* CC_INTERP */
2940  // This value is corrected by layout_activation_impl
2941  __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
2942  __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2943#endif /* CC_INTERP */
2944  __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
2945  __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
2946  __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
2947  __ decrementl(rdx);                   // Decrement counter
2948  __ jcc(Assembler::notZero, loop);
2949  __ pushptr(Address(rcx, 0));          // Save final return address
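  // The pc pushed here becomes the return address of the self-frame re-pushed
  // below: it is where this blob's final ret(0) will land, resuming execution
  // in the youngest of the skeletal interpreter frames just built.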
2950
2951  // Re-push self-frame
2952  __ enter();                           // Save old & set new ebp
2953
2954  // Allocate a full sized register save area.
2955  // Return address and rbp are in place, so we allocate two fewer words.
2956  __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2957
2958  // Restore frame locals after moving the frame
2959  __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2960  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2961
2962  // Call C code.  Need thread but NOT official VM entry
2963  // crud.  We cannot block on this call, no GC can happen.  Call should
2964  // restore return values to their stack-slots with the new SP.
2965  //
2966  // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2967
2968  // Use rbp because the frames look interpreted now
2969  // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2970  // Don't need the precise return PC here, just precise enough to point into this code blob.
2971  address the_pc = __ pc();
2972  __ set_last_Java_frame(noreg, rbp, the_pc);
2973
2974  __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
2975  __ mov(c_rarg0, r15_thread);
2976  __ movl(c_rarg1, r14); // second arg: exec_mode
2977  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2978  // Revert SP alignment after call since we're going to do some SP relative addressing below
2979  __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2980
2981  // Set an oopmap for the call site
2982  // Use the same PC we used for the last java frame
2983  oop_maps->add_gc_map(the_pc - start,
2984                       new OopMap( frame_size_in_words, 0 ));
2985
2986  // Clear fp AND pc
2987  __ reset_last_Java_frame(true, true);
2988
2989  // Collect return values
2990  __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2991  __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2992  // I think this is useless (throwing pc?)
2993  __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2994
2995  // Pop self-frame.
2996  __ leave();                           // Epilog
2997
2998  // Jump to interpreter
2999  __ ret(0);
3000
3001  // Make sure all code is generated
3002  masm->flush();
3003
3004  _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
3005  _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
3006}
3007
3008#ifdef COMPILER2
3009//------------------------------generate_uncommon_trap_blob--------------------
3010void SharedRuntime::generate_uncommon_trap_blob() {
3011  // Allocate space for the code
3012  ResourceMark rm;
3013  // Setup code generation tools
3014  CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
3015  MacroAssembler* masm = new MacroAssembler(&buffer);
3016
3017  assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3018
3019  address start = __ pc();
3020
3021  if (UseRTMLocking) {
3022    // Abort RTM transaction before possible nmethod deoptimization.
3023    __ xabort(0);
3024  }
3025
3026  // Push self-frame.  We get here with a return address on the
3027  // stack, so rsp is 8-byte aligned until we allocate our frame.
3028  __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog!
3029
3030  // No callee saved registers. rbp is assumed implicitly saved
3031  __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3032
3033  // The compiler left unloaded_class_index in j_rarg0; move it to where the
3034  // runtime expects it.
3035  __ movl(c_rarg1, j_rarg0);
3036
3037  __ set_last_Java_frame(noreg, noreg, NULL);
3038
3039  // Call C code.  Need thread but NOT official VM entry
3040  // crud.  We cannot block on this call, no GC can happen.  Call should
3041  // capture callee-saved registers as well as return values.
3042  // Thread is in rdi already.
3043  //
3044  // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index);
3045
3046  __ mov(c_rarg0, r15_thread);
3047  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
3048
3049  // Set an oopmap for the call site
3050  OopMapSet* oop_maps = new OopMapSet();
3051  OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
3052
3053  // location of rbp is known implicitly by the frame sender code
3054
3055  oop_maps->add_gc_map(__ pc() - start, map);
3056
3057  __ reset_last_Java_frame(false, false);
3058
3059  // Load UnrollBlock* into rdi
3060  __ mov(rdi, rax);
3061
3062  // Pop all the frames we must move/replace.
3063  //
3064  // Frame picture (youngest to oldest)
3065  // 1: self-frame (no frame link)
3066  // 2: deopting frame  (no frame link)
3067  // 3: caller of deopting frame (could be compiled/interpreted).
3068
3069  // Pop self-frame.  We have no frame, and must rely only on rax and rsp.
3070  __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
3071
3072  // Pop deoptimized frame (int)
3073  __ movl(rcx, Address(rdi,
3074                       Deoptimization::UnrollBlock::
3075                       size_of_deoptimized_frame_offset_in_bytes()));
3076  __ addptr(rsp, rcx);
3077
3078  // rsp should be pointing at the return address to the caller (3)
3079
3080  // Pick up the initial fp we should save
3081  // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
3082  __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
3083
3084#ifdef ASSERT
3085  // Compilers generate code that bangs the stack by as much as the
3086  // interpreter would need. So this stack banging should never
3087  // trigger a fault. Verify that it does not on non-product builds.
3088  if (UseStackBanging) {
3089    __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
3090    __ bang_stack_size(rbx, rcx);
3091  }
3092#endif
3093
3094  // Load address of array of frame pcs into rcx (address*)
3095  __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
3096
3097  // Trash the return pc
3098  __ addptr(rsp, wordSize);
3099
3100  // Load address of array of frame sizes into rsi (intptr_t*)
3101  __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
3102
3103  // Counter
3104  __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes())); // (int)
3105
3106  // Now adjust the caller's stack to make up for the extra locals but
3107  // record the original sp so that we can save it in the skeletal
3108  // interpreter frame and the stack walking of interpreter_sender
3109  // will get the unextended sp value and not the "real" sp value.
3110
3111  const Register sender_sp = r8;
3112
3113  __ mov(sender_sp, rsp);
3114  __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::caller_adjustment_offset_in_bytes())); // (int)
3115  __ subptr(rsp, rbx);
3116
3117  // Push interpreter frames in a loop
3118  Label loop;
3119  __ bind(loop);
3120  __ movptr(rbx, Address(rsi, 0)); // Load frame size
3121  __ subptr(rbx, 2 * wordSize);    // We'll push pc and rbp by hand
3122  __ pushptr(Address(rcx, 0));     // Save return address
3123  __ enter();                      // Save old & set new rbp
3124  __ subptr(rsp, rbx);             // Prolog
3125#ifdef CC_INTERP
3126  __ movptr(Address(rbp,
3127                  -(sizeof(BytecodeInterpreter)) + in_bytes(byte_offset_of(BytecodeInterpreter, _sender_sp))),
3128            sender_sp); // Make it walkable
3129#else // CC_INTERP
3130  __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
3131            sender_sp);            // Make it walkable
3132  // This value is corrected by layout_activation_impl
3133  __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
3134#endif // CC_INTERP
3135  __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
3136  __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
3137  __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
3138  __ decrementl(rdx);              // Decrement counter
3139  __ jcc(Assembler::notZero, loop);
3140  __ pushptr(Address(rcx, 0));     // Save final return address
3141
3142  // Re-push self-frame
3143  __ enter();                 // Save old & set new rbp
3144  __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
3145                              // Prolog
3146
3147  // Use rbp because the frames look interpreted now
3148  // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3149  // Don't need the precise return PC here, just precise enough to point into this code blob.
3150  address the_pc = __ pc();
3151  __ set_last_Java_frame(noreg, rbp, the_pc);
3152
3153  // Call C code.  Need thread but NOT official VM entry
3154  // crud.  We cannot block on this call, no GC can happen.  Call should
3155  // restore return values to their stack-slots with the new SP.
3156  // Thread is in rdi already.
3157  //
3158  // BasicType unpack_frames(JavaThread* thread, int exec_mode);
3159
3160  __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
3161  __ mov(c_rarg0, r15_thread);
3162  __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
3163  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3164
3165  // Set an oopmap for the call site
3166  // Use the same PC we used for the last java frame
3167  oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3168
3169  // Clear fp AND pc
3170  __ reset_last_Java_frame(true, true);
3171
3172  // Pop self-frame.
3173  __ leave();                 // Epilog
3174
3175  // Jump to interpreter
3176  __ ret(0);
3177
3178  // Make sure all code is generated
3179  masm->flush();
3180
3181  _uncommon_trap_blob =  UncommonTrapBlob::create(&buffer, oop_maps,
3182                                                 SimpleRuntimeFrame::framesize >> 1);
3183}
3184#endif // COMPILER2
3185
3186
3187//------------------------------generate_handler_blob------
3188//
3189// Generate a special Compile2Runtime blob that saves all registers
3190// and sets up an oopmap.
3191//
3192SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
3193  assert(StubRoutines::forward_exception_entry() != NULL,
3194         "must be generated before");
3195
3196  ResourceMark rm;
3197  OopMapSet *oop_maps = new OopMapSet();
3198  OopMap* map;
3199
3200  // Allocate space for the code.  Setup code generation tools.
3201  CodeBuffer buffer("handler_blob", 2048, 1024);
3202  MacroAssembler* masm = new MacroAssembler(&buffer);
3203
3204  address start   = __ pc();
3205  address call_pc = NULL;
3206  int frame_size_in_words;
3207  bool cause_return = (poll_type == POLL_AT_RETURN);
3208  bool save_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
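  // cause_return:  the poll was taken at a return instruction, so the return
  //                address already on the stack is the correct continuation pc.
  // save_vectors:  the poll sits in a vectorized loop, so the wide vector
  //                registers must be preserved across the call as well.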
3209
3210  if (UseRTMLocking) {
3211    // Abort RTM transaction before calling runtime
3212    // because the critical section will be large and will be
3213    // aborted anyway. Also nmethod could be deoptimized.
3214    __ xabort(0);
3215  }
3216
3217  // Make room for return address (or push it again)
3218  if (!cause_return) {
3219    __ push(rbx);
3220  }
3221
3222  // Save registers, fpu state, and flags
3223  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_vectors);
3224
3225  // The following is basically a call_VM.  However, we need the precise
3226  // address of the call in order to generate an oopmap. Hence, we do all the
3227  // work ourselves.
3228
3229  __ set_last_Java_frame(noreg, noreg, NULL);
3230
3231  // The return address must always be correct so that the frame constructor never
3232  // sees an invalid pc.
3233
3234  if (!cause_return) {
3235    // overwrite the dummy value we pushed on entry
3236    __ movptr(c_rarg0, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3237    __ movptr(Address(rbp, wordSize), c_rarg0);
3238  }
3239
3240  // Do the call
3241  __ mov(c_rarg0, r15_thread);
3242  __ call(RuntimeAddress(call_ptr));
3243
3244  // Set an oopmap for the call site.  This oopmap will map all
3245  // oop-registers and debug-info registers as callee-saved.  This
3246  // will allow deoptimization at this safepoint to find all possible
3247  // debug-info recordings, as well as let GC find all oops.
3248
3249  oop_maps->add_gc_map( __ pc() - start, map);
3250
3251  Label noException;
3252
3253  __ reset_last_Java_frame(false, false);
3254
3255  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3256  __ jcc(Assembler::equal, noException);
3257
3258  // Exception pending
3259
3260  RegisterSaver::restore_live_registers(masm, save_vectors);
3261
3262  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3263
3264  // No exception case
3265  __ bind(noException);
3266
3267  // Normal exit, restore registers and exit.
3268  RegisterSaver::restore_live_registers(masm, save_vectors);
3269
3270  __ ret(0);
3271
3272  // Make sure all code is generated
3273  masm->flush();
3274
3275  // Fill-out other meta info
3276  return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3277}
3278
3279//
3280// generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3281//
3282// Generate a stub that calls into the VM to find out the proper destination
3283// of a java call. All the argument registers are live at this point,
3284// but since this is generic code we don't know what they are, and the caller
3285// must do any gc of the args.
3286//
3287RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
3288  assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
3289
3290  // allocate space for the code
3291  ResourceMark rm;
3292
3293  CodeBuffer buffer(name, 1000, 512);
3294  MacroAssembler* masm                = new MacroAssembler(&buffer);
3295
3296  int frame_size_in_words;
3297
3298  OopMapSet *oop_maps = new OopMapSet();
3299  OopMap* map = NULL;
3300
3301  int start = __ offset();
3302
3303  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
3304
3305  int frame_complete = __ offset();
3306
3307  __ set_last_Java_frame(noreg, noreg, NULL);
3308
3309  __ mov(c_rarg0, r15_thread);
3310
3311  __ call(RuntimeAddress(destination));
3312
3313
3314  // Set an oopmap for the call site.
3315  // We need this not only for callee-saved registers, but also for volatile
3316  // registers that the compiler might be keeping live across a safepoint.
3317
3318  oop_maps->add_gc_map( __ offset() - start, map);
3319
3320  // rax contains the address we are going to jump to assuming no exception got installed
3321
3322  // clear last_Java_sp
3323  __ reset_last_Java_frame(false, false);
3324  // check for pending exceptions
3325  Label pending;
3326  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3327  __ jcc(Assembler::notEqual, pending);
3328
3329  // get the returned Method*
3330  __ get_vm_result_2(rbx, r15_thread);
3331  __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3332
3333  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3334
3335  RegisterSaver::restore_live_registers(masm);
3336
3337  // We are back to the original state on entry and ready to go.
3338
3339  __ jmp(rax);
3340
3341  // Pending exception after the safepoint
3342
3343  __ bind(pending);
3344
3345  RegisterSaver::restore_live_registers(masm);
3346
3347  // exception pending => remove activation and forward to exception handler
3348
3349  __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), (int)NULL_WORD);
3350
3351  __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3352  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3353
3354  // -------------
3355  // make sure all code is generated
3356  masm->flush();
3357
3358  // return the blob
3359  // (the frame size passed here is in words: frame_size_in_words)
3360  return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3361}
3362
3363
3364//------------------------------Montgomery multiplication------------------------
3365//
3366
3367#ifndef _WINDOWS
3368
3369#define ASM_SUBTRACT
3370
3371#ifdef ASM_SUBTRACT
3372// Subtract 0:b from carry:a.  Return carry.
3373static unsigned long
3374sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) {
3375  long i = 0, cnt = len;
3376  unsigned long tmp;
3377  asm volatile("clc; "
3378               "0: ; "
3379               "mov (%[b], %[i], 8), %[tmp]; "
3380               "sbb %[tmp], (%[a], %[i], 8); "
3381               "inc %[i]; dec %[cnt]; "
3382               "jne 0b; "
3383               "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3384               : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3385               : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3386               : "memory");
3387  return tmp;
3388}
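// Note: inc/dec do not modify the carry flag, so the borrow from each sbb
// propagates across loop iterations; the trailing "sbb $0" then folds the
// final borrow into the incoming carry, yielding the high word of
// (carry:a) - (0:b).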
3389#else // ASM_SUBTRACT
3390typedef int __attribute__((mode(TI))) int128;
3391
3392// Subtract 0:b from carry:a.  Return carry.
3393static unsigned long
3394sub(unsigned long a[], unsigned long b[], unsigned long carry, int len) {
3395  int128 tmp = 0;
3396  int i;
3397  for (i = 0; i < len; i++) {
3398    tmp += a[i];
3399    tmp -= b[i];
3400    a[i] = tmp;
3401    tmp >>= 64;
3402    assert(-1 <= tmp && tmp <= 0, "invariant");
3403  }
3404  return tmp + carry;
3405}
3406#endif // ! ASM_SUBTRACT
3407
3408// Multiply (unsigned) Long A by Long B, accumulating the double-
3409// length result into the accumulator formed of T0, T1, and T2.
3410#define MACC(A, B, T0, T1, T2)                                  \
3411do {                                                            \
3412  unsigned long hi, lo;                                         \
3413  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3414           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3415           : "r"(A), "a"(B) : "cc");                            \
3416 } while(0)
3417
3418// As above, but add twice the double-length result into the
3419// accumulator.
3420#define MACC2(A, B, T0, T1, T2)                                 \
3421do {                                                            \
3422  unsigned long hi, lo;                                         \
3423  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3424           "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3425           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3426           : "r"(A), "a"(B) : "cc");                            \
3427 } while(0)
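// A sketch of the semantics: the widening mul leaves A*B in rdx:rax, so MACC
// performs (T2:T1:T0) += A*B on the 192-bit accumulator (T0 least
// significant), and MACC2 performs (T2:T1:T0) += 2*A*B by adding the product
// twice.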
3428
3429// Fast Montgomery multiplication.  The derivation of the algorithm is
3430// in  A Cryptographic Library for the Motorola DSP56000,
3431// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
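//
// Sketch of the idea: the loops below accumulate the product column by column
// into the triple-precision accumulator t2:t1:t0.  In column i the quotient
// digit m[i] = t0 * inv (with inv == -n[0]^-1 mod 2^64, checked by the assert
// below) is chosen so that adding m[i]*n[0] clears the low word t0, letting
// the accumulator shift down one 64-bit word per column.  The second loop
// finishes the high columns and writes the result back into m, which then
// holds a value congruent to a*b*2^(-64*len) mod n; the trailing while loop
// performs the final reduction by n.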
3432
3433static void __attribute__((noinline))
3434montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[],
3435                    unsigned long m[], unsigned long inv, int len) {
3436  unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3437  int i;
3438
3439  assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
3440
3441  for (i = 0; i < len; i++) {
3442    int j;
3443    for (j = 0; j < i; j++) {
3444      MACC(a[j], b[i-j], t0, t1, t2);
3445      MACC(m[j], n[i-j], t0, t1, t2);
3446    }
3447    MACC(a[i], b[0], t0, t1, t2);
3448    m[i] = t0 * inv;
3449    MACC(m[i], n[0], t0, t1, t2);
3450
3451    assert(t0 == 0, "broken Montgomery multiply");
3452
3453    t0 = t1; t1 = t2; t2 = 0;
3454  }
3455
3456  for (i = len; i < 2*len; i++) {
3457    int j;
3458    for (j = i-len+1; j < len; j++) {
3459      MACC(a[j], b[i-j], t0, t1, t2);
3460      MACC(m[j], n[i-j], t0, t1, t2);
3461    }
3462    m[i-len] = t0;
3463    t0 = t1; t1 = t2; t2 = 0;
3464  }
3465
3466  while (t0)
3467    t0 = sub(m, n, t0, len);
3468}
3469
3470// Fast Montgomery squaring.  This uses asymptotically 25% fewer
3471// multiplies so it should be up to 25% faster than Montgomery
3472// multiplication.  However, its loop control is more complex and it
3473// may actually run slower on some machines.
3474
3475static void __attribute__((noinline))
3476montgomery_square(unsigned long a[], unsigned long n[],
3477                  unsigned long m[], unsigned long inv, int len) {
3478  unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3479  int i;
3480
3481  assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
3482
3483  for (i = 0; i < len; i++) {
3484    int j;
3485    int end = (i+1)/2;
3486    for (j = 0; j < end; j++) {
3487      MACC2(a[j], a[i-j], t0, t1, t2);
3488      MACC(m[j], n[i-j], t0, t1, t2);
3489    }
3490    if ((i & 1) == 0) {
3491      MACC(a[j], a[j], t0, t1, t2);
3492    }
3493    for (; j < i; j++) {
3494      MACC(m[j], n[i-j], t0, t1, t2);
3495    }
3496    m[i] = t0 * inv;
3497    MACC(m[i], n[0], t0, t1, t2);
3498
3499    assert(t0 == 0, "broken Montgomery square");
3500
3501    t0 = t1; t1 = t2; t2 = 0;
3502  }
3503
3504  for (i = len; i < 2*len; i++) {
3505    int start = i-len+1;
3506    int end = start + (len - start)/2;
3507    int j;
3508    for (j = start; j < end; j++) {
3509      MACC2(a[j], a[i-j], t0, t1, t2);
3510      MACC(m[j], n[i-j], t0, t1, t2);
3511    }
3512    if ((i & 1) == 0) {
3513      MACC(a[j], a[j], t0, t1, t2);
3514    }
3515    for (; j < len; j++) {
3516      MACC(m[j], n[i-j], t0, t1, t2);
3517    }
3518    m[i-len] = t0;
3519    t0 = t1; t1 = t2; t2 = 0;
3520  }
3521
3522  while (t0)
3523    t0 = sub(m, n, t0, len);
3524}
3525
3526// Swap words in a longword.
3527static unsigned long swap(unsigned long x) {
3528  return (x << 32) | (x >> 32);
3529}
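// For example, swap(0x1122334455667788UL) == 0x5566778811223344UL.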
3530
3531// Copy len longwords from s to d, word-swapping as we go.  The
3532// destination array is reversed.
3533static void reverse_words(unsigned long *s, unsigned long *d, int len) {
3534  d += len;
3535  while(len-- > 0) {
3536    d--;
3537    *d = swap(*s);
3538    s++;
3539  }
3540}
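// For example, with len == 2 this stores d[0] = swap(s[1]) and
// d[1] = swap(s[0]); the destination is the word-swapped reverse of the
// source.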
3541
3542// The threshold at which squaring is advantageous was determined
3543// experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
3544#define MONTGOMERY_SQUARING_THRESHOLD 64
3545
3546void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3547                                        jint len, jlong inv,
3548                                        jint *m_ints) {
3549  assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3550  int longwords = len/2;
3551
3552  // Make very sure we don't use so much space that the stack might
3553  // overflow.  512 jints corresponds to a 16384-bit integer and
3554  // will use here a total of 8k bytes of stack space.
3555  int total_allocation = longwords * sizeof (unsigned long) * 4;
3556  guarantee(total_allocation <= 8192, "must be");
3557  unsigned long *scratch = (unsigned long *)alloca(total_allocation);
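  // For example, at the maximum len of 512 jints: longwords == 256, and
  // 4 * 256 * sizeof(unsigned long) == 8192 bytes, exactly the limit
  // guaranteed above.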
3558
3559  // Local scratch arrays
3560  unsigned long
3561    *a = scratch + 0 * longwords,
3562    *b = scratch + 1 * longwords,
3563    *n = scratch + 2 * longwords,
3564    *m = scratch + 3 * longwords;
3565
3566  reverse_words((unsigned long *)a_ints, a, longwords);
3567  reverse_words((unsigned long *)b_ints, b, longwords);
3568  reverse_words((unsigned long *)n_ints, n, longwords);
3569
3570  ::montgomery_multiply(a, b, n, m, (unsigned long)inv, longwords);
3571
3572  reverse_words(m, (unsigned long *)m_ints, longwords);
3573}
3574
3575void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3576                                      jint len, jlong inv,
3577                                      jint *m_ints) {
3578  assert(len % 2 == 0, "array length in montgomery_square must be even");
3579  int longwords = len/2;
3580
3581  // Make very sure we don't use so much space that the stack might
3582  // overflow.  512 jints corresponds to an 16384-bit integer and
3583  // will use here a total of 6k bytes of stack space.
3584  int total_allocation = longwords * sizeof (unsigned long) * 3;
3585  guarantee(total_allocation <= 8192, "must be");
3586  unsigned long *scratch = (unsigned long *)alloca(total_allocation);
3587
3588  // Local scratch arrays
3589  unsigned long
3590    *a = scratch + 0 * longwords,
3591    *n = scratch + 1 * longwords,
3592    *m = scratch + 2 * longwords;
3593
3594  reverse_words((unsigned long *)a_ints, a, longwords);
3595  reverse_words((unsigned long *)n_ints, n, longwords);
3596
3597  if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3598    ::montgomery_square(a, n, m, (unsigned long)inv, longwords);
3599  } else {
3600    ::montgomery_multiply(a, a, n, m, (unsigned long)inv, longwords);
3601  }
3602
3603  reverse_words(m, (unsigned long *)m_ints, longwords);
3604}
3605
3606#endif // !_WINDOWS
3607
3608#ifdef COMPILER2
3609// This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3610//
3611//------------------------------generate_exception_blob---------------------------
3612// creates exception blob at the end
3613// Using exception blob, this code is jumped from a compiled method.
3614// (see emit_exception_handler in x86_64.ad file)
3615//
3616// Given an exception pc at a call we call into the runtime for the
3617// handler in this method. This handler might merely restore state
3618// (i.e., callee-save registers), unwind the frame, and jump to the
3619// exception handler for the nmethod if there is no Java level handler
3620// for the nmethod.
3621//
3622// This code is entered with a jmp.
3623//
3624// Arguments:
3625//   rax: exception oop
3626//   rdx: exception pc
3627//
3628// Results:
3629//   rax: exception oop
3630//   rdx: exception pc in caller or ???
3631//   destination: exception handler of caller
3632//
3633// Note: the exception pc MUST be at a call (precise debug information)
3634//       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
3635//
3636
3637void OptoRuntime::generate_exception_blob() {
3638  assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
3639  assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
3640  assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
3641
3642  assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3643
3644  // Allocate space for the code
3645  ResourceMark rm;
3646  // Setup code generation tools
3647  CodeBuffer buffer("exception_blob", 2048, 1024);
3648  MacroAssembler* masm = new MacroAssembler(&buffer);
3649
3650
3651  address start = __ pc();
3652
3653  // Exception pc is 'return address' for stack walker
3654  __ push(rdx);
3655  __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3656
3657  // Save callee-saved registers.  See x86_64.ad.
3658
3659  // rbp is an implicitly saved callee saved register (i.e., the calling
3660  // convention will save/restore it in the prolog/epilog). Other than that
3661  // there are no callee save registers now that adapter frames are gone.
3662
3663  __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3664
3665  // Store exception in Thread object. We cannot pass any arguments to the
3666  // handle_exception call, since we do not want to make any assumption
3667  // about the size of the frame where the exception happened in.
3668  // c_rarg0 is either rdi (Linux) or rcx (Windows).
3669  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()),rax);
3670  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3671
3672  // This call does all the hard work.  It checks if an exception handler
3673  // exists in the method.
3674  // If so, it returns the handler address.
3675  // If not, it prepares for stack-unwinding, restoring the callee-save
3676  // registers of the frame being removed.
3677  //
3678  // address OptoRuntime::handle_exception_C(JavaThread* thread)
3679
3680  // At a method handle call, the stack may not be properly aligned
3681  // when returning with an exception.
3682  address the_pc = __ pc();
3683  __ set_last_Java_frame(noreg, noreg, the_pc);
3684  __ mov(c_rarg0, r15_thread);
3685  __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3686  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
3687
3688  // Set an oopmap for the call site.  This oopmap will only be used if we
3689  // are unwinding the stack.  Hence, all locations will be dead.
3690  // Callee-saved registers will be the same as the frame above (i.e.,
3691  // handle_exception_stub), since they were restored when we got the
3692  // exception.
3693
3694  OopMapSet* oop_maps = new OopMapSet();
3695
3696  oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3697
3698  __ reset_last_Java_frame(false, true);
3699
3700  // Restore callee-saved registers
3701
3702  // rbp is an implicitly saved callee-saved register (i.e., the calling
3703  // convention will save/restore it in the prolog/epilog). Other than that
3704  // there are no callee save registers now that adapter frames are gone.
3705
3706  __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
3707
3708  __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
3709  __ pop(rdx);                  // No need for exception pc anymore
3710
3711  // rax: exception handler
3712
3713  // We have a handler in rax (could be deopt blob).
3714  __ mov(r8, rax);
3715
3716  // Get the exception oop
3717  __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3718  // Get the exception pc in case we are deoptimized
3719  __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3720#ifdef ASSERT
3721  __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), (int)NULL_WORD);
3722  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int)NULL_WORD);
3723#endif
3724  // Clear the exception oop so GC no longer processes it as a root.
3725  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int)NULL_WORD);
3726
3727  // rax: exception oop
3728  // r8:  exception handler
3729  // rdx: exception pc
3730  // Jump to handler
3731
3732  __ jmp(r8);
3733
3734  // Make sure all code is generated
3735  masm->flush();
3736
3737  // Set exception blob
3738  _exception_blob =  ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
3739}
3740#endif // COMPILER2
3741