sharedRuntime_x86_64.cpp revision 304:dc7f315e41f7
/*
 * Copyright 2003-2008 Sun Microsystems, Inc.  All Rights Reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 *
 */

#include "incls/_precompiled.incl"
#include "incls/_sharedRuntime_x86_64.cpp.incl"

DeoptimizationBlob *SharedRuntime::_deopt_blob;
#ifdef COMPILER2
UncommonTrapBlob   *SharedRuntime::_uncommon_trap_blob;
ExceptionBlob      *OptoRuntime::_exception_blob;
#endif // COMPILER2

SafepointBlob      *SharedRuntime::_polling_page_safepoint_handler_blob;
SafepointBlob      *SharedRuntime::_polling_page_return_handler_blob;
RuntimeStub*       SharedRuntime::_wrong_method_blob;
RuntimeStub*       SharedRuntime::_ic_miss_blob;
RuntimeStub*       SharedRuntime::_resolve_opt_virtual_call_blob;
RuntimeStub*       SharedRuntime::_resolve_virtual_call_blob;
RuntimeStub*       SharedRuntime::_resolve_static_call_blob;

#define __ masm->

class SimpleRuntimeFrame {

  public:

  // Most of the runtime stubs have this simple frame layout.
  // This class exists to make the layout shared in one place.
  // Offsets are for compiler stack slots, which are jints.
  enum layout {
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
    rbp_off2,
    return_off, return_off2,
    framesize
  };
};

class RegisterSaver {
  // Capture info about frame layout.  Layout offsets are in jint
  // units because compiler frame slots are jints.
#define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
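// For example, DEF_XMM_OFFS(3) expands to
//   xmm3_off = xmm_off + (3)*16/BytesPerInt,  // == xmm_off + 12
//   xmm3H_off                                 // adjacent slot for the high half
// matching the 16-byte spacing of the xmm slots in the fxsave image.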
  enum layout {
    fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
    xmm_off       = fpu_state_off + 160/BytesPerInt,            // offset in fxsave save area
    DEF_XMM_OFFS(0),
    DEF_XMM_OFFS(1),
    DEF_XMM_OFFS(2),
    DEF_XMM_OFFS(3),
    DEF_XMM_OFFS(4),
    DEF_XMM_OFFS(5),
    DEF_XMM_OFFS(6),
    DEF_XMM_OFFS(7),
    DEF_XMM_OFFS(8),
    DEF_XMM_OFFS(9),
    DEF_XMM_OFFS(10),
    DEF_XMM_OFFS(11),
    DEF_XMM_OFFS(12),
    DEF_XMM_OFFS(13),
    DEF_XMM_OFFS(14),
    DEF_XMM_OFFS(15),
    fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
    fpu_stateH_end,
    r15_off, r15H_off,
    r14_off, r14H_off,
    r13_off, r13H_off,
    r12_off, r12H_off,
    r11_off, r11H_off,
    r10_off, r10H_off,
    r9_off,  r9H_off,
    r8_off,  r8H_off,
    rdi_off, rdiH_off,
    rsi_off, rsiH_off,
    ignore_off, ignoreH_off,  // extra copy of rbp
    rsp_off, rspH_off,
    rbx_off, rbxH_off,
    rdx_off, rdxH_off,
    rcx_off, rcxH_off,
    rax_off, raxH_off,
    // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
    align_off, alignH_off,
    flags_off, flagsH_off,
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off, rbpH_off,        // copy of rbp we will restore
    return_off, returnH_off,  // slot for return address
    reg_save_size             // size in compiler stack slots
  };

 public:
  static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words);
  static void restore_live_registers(MacroAssembler* masm);

  // Offsets into the register save area
  // Used by deoptimization when it is managing result register
  // values on its own

  static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
  static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
  static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
  static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
  static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }

  // During deoptimization only the result registers need to be restored,
  // all the other values have already been extracted.
  static void restore_result_registers(MacroAssembler* masm);
};

OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words) {

  // Always make the frame size 16-byte aligned
  int frame_size_in_bytes = round_to(additional_frame_words*wordSize +
                                     reg_save_size*BytesPerInt, 16);
  // OopMap frame size is in compiler stack slots (jint's) not bytes or words
  int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
  // The caller will allocate additional_frame_words
  int additional_frame_slots = additional_frame_words*wordSize / BytesPerInt;
  // CodeBlob frame size is in words.
  int frame_size_in_words = frame_size_in_bytes / wordSize;
  *total_frame_words = frame_size_in_words;

  // Save registers, fpu state, and flags.
  // We assume the caller has already pushed the return address onto the
  // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return address like a normal enter.

  __ enter();          // rsp becomes 16-byte aligned here
  __ push_CPU_state(); // Push a multiple of 16 bytes
  if (frame::arg_reg_save_area_bytes != 0) {
    // Allocate argument register save area
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }

  // Set an oopmap for the call site.  This oopmap will map all
  // oop-registers and debug-info registers as callee-saved.  This
  // will allow deoptimization at this safepoint to find all possible
  // debug-info recordings, as well as let GC find all oops.

  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = new OopMap(frame_size_in_slots, 0);
  map->set_callee_saved(VMRegImpl::stack2reg( rax_off  + additional_frame_slots), rax->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg( rcx_off  + additional_frame_slots), rcx->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg( rdx_off  + additional_frame_slots), rdx->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg( rbx_off  + additional_frame_slots), rbx->as_VMReg());
  // rbp location is known implicitly by the frame sender code, needs no oopmap
  // and the location where rbp was saved is ignored
  map->set_callee_saved(VMRegImpl::stack2reg( rsi_off  + additional_frame_slots), rsi->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg( rdi_off  + additional_frame_slots), rdi->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg( r8_off   + additional_frame_slots), r8->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg( r9_off   + additional_frame_slots), r9->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg( r10_off  + additional_frame_slots), r10->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg( r11_off  + additional_frame_slots), r11->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg( r12_off  + additional_frame_slots), r12->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg( r13_off  + additional_frame_slots), r13->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg( r14_off  + additional_frame_slots), r14->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg( r15_off  + additional_frame_slots), r15->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(xmm0_off  + additional_frame_slots), xmm0->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(xmm1_off  + additional_frame_slots), xmm1->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(xmm2_off  + additional_frame_slots), xmm2->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(xmm3_off  + additional_frame_slots), xmm3->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(xmm4_off  + additional_frame_slots), xmm4->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(xmm5_off  + additional_frame_slots), xmm5->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(xmm6_off  + additional_frame_slots), xmm6->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(xmm7_off  + additional_frame_slots), xmm7->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(xmm8_off  + additional_frame_slots), xmm8->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(xmm9_off  + additional_frame_slots), xmm9->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(xmm10_off + additional_frame_slots), xmm10->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(xmm11_off + additional_frame_slots), xmm11->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(xmm12_off + additional_frame_slots), xmm12->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(xmm13_off + additional_frame_slots), xmm13->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(xmm14_off + additional_frame_slots), xmm14->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(xmm15_off + additional_frame_slots), xmm15->as_VMReg());

  // %%% These should all be a waste but we'll keep things as they were for now
  if (true) {
    map->set_callee_saved(VMRegImpl::stack2reg( raxH_off  + additional_frame_slots),
                          rax->as_VMReg()->next());
    map->set_callee_saved(VMRegImpl::stack2reg( rcxH_off  + additional_frame_slots),
                          rcx->as_VMReg()->next());
    map->set_callee_saved(VMRegImpl::stack2reg( rdxH_off  + additional_frame_slots),
                          rdx->as_VMReg()->next());
    map->set_callee_saved(VMRegImpl::stack2reg( rbxH_off  + additional_frame_slots),
                          rbx->as_VMReg()->next());
    // rbp location is known implicitly by the frame sender code, needs no oopmap
    map->set_callee_saved(VMRegImpl::stack2reg( rsiH_off  + additional_frame_slots),
                          rsi->as_VMReg()->next());
    map->set_callee_saved(VMRegImpl::stack2reg( rdiH_off  + additional_frame_slots),
                          rdi->as_VMReg()->next());
    map->set_callee_saved(VMRegImpl::stack2reg( r8H_off   + additional_frame_slots),
                          r8->as_VMReg()->next());
    map->set_callee_saved(VMRegImpl::stack2reg( r9H_off   + additional_frame_slots),
                          r9->as_VMReg()->next());
    map->set_callee_saved(VMRegImpl::stack2reg( r10H_off  + additional_frame_slots),
                          r10->as_VMReg()->next());
    map->set_callee_saved(VMRegImpl::stack2reg( r11H_off  + additional_frame_slots),
                          r11->as_VMReg()->next());
    map->set_callee_saved(VMRegImpl::stack2reg( r12H_off  + additional_frame_slots),
                          r12->as_VMReg()->next());
    map->set_callee_saved(VMRegImpl::stack2reg( r13H_off  + additional_frame_slots),
                          r13->as_VMReg()->next());
    map->set_callee_saved(VMRegImpl::stack2reg( r14H_off  + additional_frame_slots),
                          r14->as_VMReg()->next());
    map->set_callee_saved(VMRegImpl::stack2reg( r15H_off  + additional_frame_slots),
                          r15->as_VMReg()->next());
    map->set_callee_saved(VMRegImpl::stack2reg(xmm0H_off  + additional_frame_slots),
                          xmm0->as_VMReg()->next());
    map->set_callee_saved(VMRegImpl::stack2reg(xmm1H_off  + additional_frame_slots),
                          xmm1->as_VMReg()->next());
    map->set_callee_saved(VMRegImpl::stack2reg(xmm2H_off  + additional_frame_slots),
                          xmm2->as_VMReg()->next());
    map->set_callee_saved(VMRegImpl::stack2reg(xmm3H_off  + additional_frame_slots),
                          xmm3->as_VMReg()->next());
    map->set_callee_saved(VMRegImpl::stack2reg(xmm4H_off  + additional_frame_slots),
                          xmm4->as_VMReg()->next());
    map->set_callee_saved(VMRegImpl::stack2reg(xmm5H_off  + additional_frame_slots),
                          xmm5->as_VMReg()->next());
    map->set_callee_saved(VMRegImpl::stack2reg(xmm6H_off  + additional_frame_slots),
                          xmm6->as_VMReg()->next());
    map->set_callee_saved(VMRegImpl::stack2reg(xmm7H_off  + additional_frame_slots),
                          xmm7->as_VMReg()->next());
    map->set_callee_saved(VMRegImpl::stack2reg(xmm8H_off  + additional_frame_slots),
                          xmm8->as_VMReg()->next());
    map->set_callee_saved(VMRegImpl::stack2reg(xmm9H_off  + additional_frame_slots),
                          xmm9->as_VMReg()->next());
    map->set_callee_saved(VMRegImpl::stack2reg(xmm10H_off + additional_frame_slots),
                          xmm10->as_VMReg()->next());
    map->set_callee_saved(VMRegImpl::stack2reg(xmm11H_off + additional_frame_slots),
                          xmm11->as_VMReg()->next());
    map->set_callee_saved(VMRegImpl::stack2reg(xmm12H_off + additional_frame_slots),
                          xmm12->as_VMReg()->next());
    map->set_callee_saved(VMRegImpl::stack2reg(xmm13H_off + additional_frame_slots),
                          xmm13->as_VMReg()->next());
    map->set_callee_saved(VMRegImpl::stack2reg(xmm14H_off + additional_frame_slots),
                          xmm14->as_VMReg()->next());
    map->set_callee_saved(VMRegImpl::stack2reg(xmm15H_off + additional_frame_slots),
                          xmm15->as_VMReg()->next());
  }

  return map;
}

void RegisterSaver::restore_live_registers(MacroAssembler* masm) {
  if (frame::arg_reg_save_area_bytes != 0) {
    // Pop arg register save area
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }
  // Recover CPU state
  __ pop_CPU_state();
  // Get the rbp described implicitly by the calling convention (no oopMap)
  __ pop(rbp);
}

void RegisterSaver::restore_result_registers(MacroAssembler* masm) {

  // Just restore the result registers. Only used by deoptimization. By
  // now any callee save register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration, so only result registers need to be restored here.

  // Restore fp result register
  __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
  // Restore integer result registers
  __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
  __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));

  // Pop all of the register save area off the stack except the return address
  __ addptr(rsp, return_offset_in_bytes());
}
// The java_calling_convention describes stack locations as ideal slots on
// a frame with no abi restrictions. Since we must observe abi restrictions
// (like the placement of the register window) the slots must be biased by
// the following value.
static int reg2offset_in(VMReg r) {
  // Account for saved rbp and return address
  // This should really be in_preserve_stack_slots
  return (r->reg2stack() + 4) * VMRegImpl::stack_slot_size;
}
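
// Example: an incoming arg in stack slot 0 resolves to (0 + 4) * 4 = 16(rbp),
// i.e. just past the saved rbp and the return address (two words = 16 bytes),
// which is what the "+ 4" jint slots account for.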

static int reg2offset_out(VMReg r) {
  return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
}

// ---------------------------------------------------------------------------
// Read the array of BasicTypes from a signature, and compute where the
// arguments should go.  Values in the VMRegPair regs array refer to 4-byte
// quantities.  Values less than VMRegImpl::stack0 are registers, those above
// refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
// as framesizes are fixed.
// VMRegImpl::stack0 refers to the first slot 0(sp), and VMRegImpl::stack0+1
// refers to the memory word 4 bytes higher.  Registers (up to
// RegisterImpl::number_of_registers) are the 64-bit integer registers.

// Note: the INPUTS in sig_bt are in units of Java argument words, which are
// either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
// units regardless of build. Of course for i486 there is no 64 bit build.

// The Java calling convention is a "shifted" version of the C ABI.
// By skipping the first C ABI register we can call non-static jni methods
// with small numbers of arguments without having to shuffle the arguments
// at all. Since we control the java ABI we ought to at least get some
// advantage out of it.

int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
                                           VMRegPair *regs,
                                           int total_args_passed,
                                           int is_outgoing) {

  // Create the mapping between argument positions and
  // registers.
  static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
    j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
    j_farg0, j_farg1, j_farg2, j_farg3,
    j_farg4, j_farg5, j_farg6, j_farg7
  };


  uint int_args = 0;
  uint fp_args = 0;
  uint stk_args = 0; // inc by 2 each time

  for (int i = 0; i < total_args_passed; i++) {
    switch (sig_bt[i]) {
    case T_BOOLEAN:
    case T_CHAR:
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      if (int_args < Argument::n_int_register_parameters_j) {
        regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_VOID:
      // halves of T_LONG or T_DOUBLE
      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
      regs[i].set_bad();
      break;
    case T_LONG:
      assert(sig_bt[i + 1] == T_VOID, "expecting half");
      // fall through
    case T_OBJECT:
    case T_ARRAY:
    case T_ADDRESS:
      if (int_args < Argument::n_int_register_parameters_j) {
        regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_FLOAT:
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_DOUBLE:
      assert(sig_bt[i + 1] == T_VOID, "expecting half");
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    default:
      ShouldNotReachHere();
      break;
    }
  }

  return round_to(stk_args, 2);
}
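
// Worked example (hypothetical signature): for
//   sig_bt = { T_OBJECT, T_LONG, T_VOID, T_INT, T_DOUBLE, T_VOID }
// the loop above assigns T_OBJECT -> j_rarg0, T_LONG -> j_rarg1 (its T_VOID
// half gets set_bad()), T_INT -> j_rarg2 and T_DOUBLE -> j_farg0, then
// returns 0 because nothing spilled to the stack.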

// Patch the caller's callsite with entry to compiled code if it exists.
static void patch_callers_callsite(MacroAssembler *masm) {
  Label L;
  __ verify_oop(rbx);
  __ cmpptr(Address(rbx, in_bytes(methodOopDesc::code_offset())), (int32_t)NULL_WORD);
  __ jcc(Assembler::equal, L);

  // Save the current stack pointer
  __ mov(r13, rsp);
  // Schedule the branch target address early.
  // Call into the VM to patch the caller, then jump to compiled callee
  // rax isn't live so capture return address while we easily can
  __ movptr(rax, Address(rsp, 0));

  // align stack so push_CPU_state doesn't fault
  __ andptr(rsp, -(StackAlignmentInBytes));
  __ push_CPU_state();


  __ verify_oop(rbx);
  // VM needs caller's callsite
  // VM needs target method
  // This needs to be a long call since we will relocate this adapter to
  // the codeBuffer and it may not reach

  // Allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }
  __ mov(c_rarg0, rbx);
  __ mov(c_rarg1, rax);
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));

  // De-allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

  __ pop_CPU_state();
  // restore sp
  __ mov(rsp, r13);
  __ bind(L);
}

// Helper function to put tags in interpreter stack.
static void tag_stack(MacroAssembler *masm, const BasicType sig, int st_off) {
  if (TaggedStackInterpreter) {
    int tag_offset = st_off + Interpreter::expr_tag_offset_in_bytes(0);
    if (sig == T_OBJECT || sig == T_ARRAY) {
      __ movptr(Address(rsp, tag_offset), (int32_t) frame::TagReference);
    } else if (sig == T_LONG || sig == T_DOUBLE) {
      int next_tag_offset = st_off + Interpreter::expr_tag_offset_in_bytes(1);
      __ movptr(Address(rsp, next_tag_offset), (int32_t) frame::TagValue);
      __ movptr(Address(rsp, tag_offset), (int32_t) frame::TagValue);
    } else {
      __ movptr(Address(rsp, tag_offset), (int32_t) frame::TagValue);
    }
  }
}


static void gen_c2i_adapter(MacroAssembler *masm,
                            int total_args_passed,
                            int comp_args_on_stack,
                            const BasicType *sig_bt,
                            const VMRegPair *regs,
                            Label& skip_fixup) {
  // Before we get into the guts of the C2I adapter, see if we should be here
  // at all.  We've come from compiled code and are attempting to jump to the
  // interpreter, which means the caller made a static call to get here
  // (vcalls always get a compiled target if there is one).  Check for a
  // compiled target.  If there is one, we need to patch the caller's call.
  patch_callers_callsite(masm);

  __ bind(skip_fixup);

  // Since all args are passed on the stack, total_args_passed *
  // Interpreter::stackElementSize is the space we need. Plus one word,
  // because we also account for the return address location since
  // we store it first rather than hold it in rax across all the shuffling

  int extraspace = (total_args_passed * Interpreter::stackElementSize()) + wordSize;

  // stack is aligned, keep it that way
  extraspace = round_to(extraspace, 2*wordSize);

  // Get return address
  __ pop(rax);

  // set senderSP value
  __ mov(r13, rsp);

  __ subptr(rsp, extraspace);

  // Store the return address in the expected location
  __ movptr(Address(rsp, 0), rax);

  // Now write the args into the outgoing interpreter space
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // offset to start parameters
    int st_off   = (total_args_passed - i) * Interpreter::stackElementSize() +
                   Interpreter::value_offset_in_bytes();
    int next_off = st_off - Interpreter::stackElementSize();

    // Say 4 args:
    // i   st_off
    // 0   32 T_LONG
    // 1   24 T_VOID
    // 2   16 T_OBJECT
    // 3    8 T_BOOL
    // -    0 return address
    //
    // However, to make things extra confusing: because we can fit a long/double
    // in a single slot on a 64-bit VM and it would be silly to break them up,
    // the interpreter leaves one slot empty and only stores to a single slot.
    // In this case the slot that is occupied is the T_VOID slot. See, I said
    // it was confusing.

    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // memory to memory, use rax
      int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
      if (!r_2->is_valid()) {
        // sign extend??
        __ movl(rax, Address(rsp, ld_off));
        __ movptr(Address(rsp, st_off), rax);
        tag_stack(masm, sig_bt[i], st_off);

      } else {

        __ movq(rax, Address(rsp, ld_off));

        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // ld_off == LSW, ld_off+wordSize == MSW
          // st_off == MSW, next_off == LSW
          __ movq(Address(rsp, next_off), rax);
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
          tag_stack(masm, sig_bt[i], next_off);
        } else {
          __ movq(Address(rsp, st_off), rax);
          tag_stack(masm, sig_bt[i], st_off);
        }
      }
    } else if (r_1->is_Register()) {
      Register r = r_1->as_Register();
      if (!r_2->is_valid()) {
        // must be only an int (or less), so move only 32 bits to the slot
        // why not sign extend??
        __ movl(Address(rsp, st_off), r);
        tag_stack(masm, sig_bt[i], st_off);
      } else {
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // long/double in gpr
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaab));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
          __ movq(Address(rsp, next_off), r);
          tag_stack(masm, sig_bt[i], next_off);
        } else {
          __ movptr(Address(rsp, st_off), r);
          tag_stack(masm, sig_bt[i], st_off);
        }
      }
    } else {
      assert(r_1->is_XMMRegister(), "");
      if (!r_2->is_valid()) {
        // only a float; use just part of the slot
        __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
        tag_stack(masm, sig_bt[i], st_off);
      } else {
#ifdef ASSERT
        // Overwrite the unused slot with known junk
        __ mov64(rax, CONST64(0xdeadffffdeadaaac));
        __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
        tag_stack(masm, sig_bt[i], next_off);
      }
    }
  }

  // Schedule the branch target address early.
  __ movptr(rcx, Address(rbx, in_bytes(methodOopDesc::interpreter_entry_offset())));
  __ jmp(rcx);
}

static void gen_i2c_adapter(MacroAssembler *masm,
                            int total_args_passed,
                            int comp_args_on_stack,
                            const BasicType *sig_bt,
                            const VMRegPair *regs) {

  //
  // We will only enter here from an interpreted frame and never from after
  // passing thru a c2i. Azul allowed this but we do not. If we lose the
  // race and use a c2i we will remain interpreted for the race loser(s).
  // This removes all sorts of headaches on the x86 side and also eliminates
  // the possibility of having c2i -> i2c -> c2i -> ... endless transitions.


  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do an i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args, because
  // we must align the stack to 16 bytes on an i2c entry; else we lose
  // the alignment we expect in all compiled code, and the register save
  // code can segv when fxsave instructions find an improperly aligned
  // stack pointer.

  __ movptr(rax, Address(rsp, 0));

  // Cut-out for having no stack args.  Since up to 6 int/oop args are passed
  // in registers, we will occasionally have no stack args.
  int comp_words_on_stack = 0;
  if (comp_args_on_stack) {
    // Sig words on the stack are greater-than VMRegImpl::stack0.  Those in
    // registers are below.  By subtracting stack0, we either get a negative
    // number (all values in registers) or the maximum stack slot accessed.

    // Convert 4-byte c2 stack slots to words.
    comp_words_on_stack = round_to(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
    // Round up to minimum stack alignment, in wordSize
    comp_words_on_stack = round_to(comp_words_on_stack, 2);
    __ subptr(rsp, comp_words_on_stack * wordSize);
  }


  // Ensure compiled code always sees stack at proper alignment
  __ andptr(rsp, -16);

  // push the return address and misalign the stack so that the youngest
  // frame sees what it expects as far as the placement of the call
  // instruction is concerned
  __ push(rax);

  // Will jump to the compiled code just as if compiled code was doing it.
  // Pre-load the register-jump target early, to schedule it better.
  __ movptr(r11, Address(rbx, in_bytes(methodOopDesc::from_compiled_offset())));

  // Now generate the shuffle code.  Pick up all register args and move the
  // rest through the floating point stack top.
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      // Longs and doubles are passed in native word order, but misaligned
      // in the 32-bit build.
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // Pick up 0, 1 or 2 words from SP+offset.

    assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
            "scrambled load targets?");
    // Load in argument order going down.
    // int ld_off = (total_args_passed + comp_words_on_stack -i)*wordSize;
    // base ld_off on r13 (sender_sp) as the stack alignment makes offsets from rsp
    // unpredictable
    int ld_off = ((total_args_passed - 1) - i)*Interpreter::stackElementSize();

    // Point to interpreter value (vs. tag)
    int next_off = ld_off - Interpreter::stackElementSize();
    //
    //
    //
    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // Convert stack slot to an SP offset (+ wordSize to account for return address )
      int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
      if (!r_2->is_valid()) {
        // sign extend???
        __ movl(rax, Address(r13, ld_off));
        __ movptr(Address(rsp, st_off), rax);
      } else {
        //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case.
        // So we must adjust where to pick up the data to match the interpreter.
        //
        // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
        // are accessed with negative offsets, so the LSW is at the LOW address

        // ld_off is MSW so get LSW
        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
                           next_off : ld_off;
        __ movq(rax, Address(r13, offset));
        // st_off is LSW (i.e. reg.first())
        __ movq(Address(rsp, st_off), rax);
      }
    } else if (r_1->is_Register()) {  // Register argument
      Register r = r_1->as_Register();
      assert(r != rax, "must be different");
      if (r_2->is_valid()) {
        //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case.
        // So we must adjust where to pick up the data to match the interpreter.

        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
                           next_off : ld_off;

        // this can be a misaligned move
        __ movq(r, Address(r13, offset));
      } else {
        // sign extend and use a full word?
        __ movl(r, Address(r13, ld_off));
      }
    } else {
      if (!r_2->is_valid()) {
        __ movflt(r_1->as_XMMRegister(), Address(r13, ld_off));
      } else {
        __ movdbl(r_1->as_XMMRegister(), Address(r13, next_off));
      }
    }
  }

  // 6243940 We might end up in handle_wrong_method if
  // the callee is deoptimized as we race thru here. If that
  // happens we don't want to take a safepoint because the
  // caller frame will look interpreted and arguments are now
  // "compiled" so it is much better to make this transition
  // invisible to the stack walking code. Unfortunately if
  // we try and find the callee by normal means a safepoint
  // is possible. So we stash the desired callee in the thread
  // and the vm will find it there should this case occur.

  __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);

  // put methodOop where a c2i would expect it should we end up there;
  // only needed because c2 resolve stubs return methodOop as a result in
  // rax
  __ mov(rax, rbx);
  __ jmp(r11);
}

// ---------------------------------------------------------------
AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
                                                            int total_args_passed,
                                                            int comp_args_on_stack,
                                                            const BasicType *sig_bt,
                                                            const VMRegPair *regs) {
  address i2c_entry = __ pc();

  gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);

  // -------------------------------------------------------------------------
  // Generate a C2I adapter.  On entry we know rbx holds the methodOop during calls
  // to the interpreter.  The args start out packed in the compiled layout.  They
  // need to be unpacked into the interpreter layout.  This will almost always
  // require some stack space.  We grow the current (compiled) stack, then repack
  // the args.  We finally end in a jump to the generic interpreter entry point.
  // On exit from the interpreter, the interpreter will restore our SP (lest the
  // compiled code, which relies solely on SP and not RBP, get sick).

  address c2i_unverified_entry = __ pc();
  Label skip_fixup;
  Label ok;

  Register holder = rax;
  Register receiver = j_rarg0;
  Register temp = rbx;

  {
    __ verify_oop(holder);
    __ load_klass(temp, receiver);
    __ verify_oop(temp);

    __ cmpptr(temp, Address(holder, compiledICHolderOopDesc::holder_klass_offset()));
    __ movptr(rbx, Address(holder, compiledICHolderOopDesc::holder_method_offset()));
    __ jcc(Assembler::equal, ok);
    __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));

    __ bind(ok);
    // Method might have been compiled since the call site was patched to
    // interpreted; if that is the case, treat it as a miss so we can get
    // the call site corrected.
    __ cmpptr(Address(rbx, in_bytes(methodOopDesc::code_offset())), (int32_t)NULL_WORD);
    __ jcc(Assembler::equal, skip_fixup);
    __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
  }

  address c2i_entry = __ pc();

  gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);

  __ flush();
  return new AdapterHandlerEntry(i2c_entry, c2i_entry, c2i_unverified_entry);
}

int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
                                         VMRegPair *regs,
                                         int total_args_passed) {
// We return the number of VMRegImpl stack slots we need to reserve for all
// the arguments NOT counting out_preserve_stack_slots.

// NOTE: These arrays will have to change when c1 is ported
#ifdef _WIN64
    static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
      c_rarg0, c_rarg1, c_rarg2, c_rarg3
    };
    static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
      c_farg0, c_farg1, c_farg2, c_farg3
    };
#else
    static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
      c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
    };
    static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
      c_farg0, c_farg1, c_farg2, c_farg3,
      c_farg4, c_farg5, c_farg6, c_farg7
    };
#endif // _WIN64
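
    // Win64 note: the integer and floating point argument registers share
    // positional slots -- argument N consumes slot N whichever register file
    // it lands in -- which is why the _WIN64 cases below bump the *other*
    // counter (fp_args or int_args) whenever a register is consumed, and
    // always reserve backing stack slots for register args.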


    uint int_args = 0;
    uint fp_args = 0;
    uint stk_args = 0; // inc by 2 each time

    for (int i = 0; i < total_args_passed; i++) {
      switch (sig_bt[i]) {
      case T_BOOLEAN:
      case T_CHAR:
      case T_BYTE:
      case T_SHORT:
      case T_INT:
        if (int_args < Argument::n_int_register_parameters_c) {
          regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
          fp_args++;
          // Allocate slots for the callee to stuff register args on the stack.
          stk_args += 2;
#endif
        } else {
          regs[i].set1(VMRegImpl::stack2reg(stk_args));
          stk_args += 2;
        }
        break;
      case T_LONG:
        assert(sig_bt[i + 1] == T_VOID, "expecting half");
        // fall through
      case T_OBJECT:
      case T_ARRAY:
      case T_ADDRESS:
        if (int_args < Argument::n_int_register_parameters_c) {
          regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
          fp_args++;
          stk_args += 2;
#endif
        } else {
          regs[i].set2(VMRegImpl::stack2reg(stk_args));
          stk_args += 2;
        }
        break;
      case T_FLOAT:
        if (fp_args < Argument::n_float_register_parameters_c) {
          regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
          int_args++;
          // Allocate slots for the callee to stuff register args on the stack.
          stk_args += 2;
#endif
        } else {
          regs[i].set1(VMRegImpl::stack2reg(stk_args));
          stk_args += 2;
        }
        break;
      case T_DOUBLE:
        assert(sig_bt[i + 1] == T_VOID, "expecting half");
        if (fp_args < Argument::n_float_register_parameters_c) {
          regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
          int_args++;
          // Allocate slots for the callee to stuff register args on the stack.
          stk_args += 2;
#endif
        } else {
          regs[i].set2(VMRegImpl::stack2reg(stk_args));
          stk_args += 2;
        }
        break;
      case T_VOID: // Halves of longs and doubles
        assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
        regs[i].set_bad();
        break;
      default:
        ShouldNotReachHere();
        break;
      }
    }
#ifdef _WIN64
  // windows abi requires that we always allocate enough stack space
  // for 4 64bit registers to be stored down.
  if (stk_args < 8) {
    stk_args = 8;
  }
#endif // _WIN64

  return stk_args;
}
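
// Worked example (hypothetical, non-Windows signature): for
//   sig_bt = { T_INT, T_DOUBLE, T_VOID, T_OBJECT }
// the loop assigns c_rarg0, c_farg0, then c_rarg1 -- the int and fp counters
// advance independently on the SysV ABI -- and returns 0 stack slots.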

// On 64-bit we will store integer-like items to the stack as
// 64-bit items (sparc abi) even though java would only store
// 32 bits for a parameter. On 32-bit it will simply be 32 bits,
// so this routine will do 32->32 on 32-bit and 32->64 on 64-bit.
static void move32_64(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
  if (src.first()->is_stack()) {
    if (dst.first()->is_stack()) {
      // stack to stack
      __ movslq(rax, Address(rbp, reg2offset_in(src.first())));
      __ movq(Address(rsp, reg2offset_out(dst.first())), rax);
    } else {
      // stack to reg
      __ movslq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first())));
    }
  } else if (dst.first()->is_stack()) {
    // reg to stack
    // Do we really have to sign extend???
    // __ movslq(src.first()->as_Register(), src.first()->as_Register());
    __ movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
  } else {
    // Do we really have to sign extend???
    // __ movslq(dst.first()->as_Register(), src.first()->as_Register());
    if (dst.first() != src.first()) {
      __ movq(dst.first()->as_Register(), src.first()->as_Register());
    }
  }
}


// An oop arg. Must pass a handle not the oop itself
static void object_move(MacroAssembler* masm,
                        OopMap* map,
                        int oop_handle_offset,
                        int framesize_in_slots,
                        VMRegPair src,
                        VMRegPair dst,
                        bool is_receiver,
                        int* receiver_offset) {

  // must pass a handle. First figure out the location we use as a handle

  Register rHandle = dst.first()->is_stack() ? rax : dst.first()->as_Register();

  // See if oop is NULL; if it is we need no handle

  if (src.first()->is_stack()) {

    // Oop is already on the stack as an argument
    int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
    map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
    if (is_receiver) {
      *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
    }

    __ cmpptr(Address(rbp, reg2offset_in(src.first())), (int32_t)NULL_WORD);
    __ lea(rHandle, Address(rbp, reg2offset_in(src.first())));
    // conditionally move a NULL
    __ cmovptr(Assembler::equal, rHandle, Address(rbp, reg2offset_in(src.first())));
  } else {

    // Oop is in a register; we must store it to the space we reserve
    // on the stack for oop_handles and pass a handle if oop is non-NULL

    const Register rOop = src.first()->as_Register();
    int oop_slot;
    if (rOop == j_rarg0)
      oop_slot = 0;
    else if (rOop == j_rarg1)
      oop_slot = 1;
    else if (rOop == j_rarg2)
      oop_slot = 2;
    else if (rOop == j_rarg3)
      oop_slot = 3;
    else if (rOop == j_rarg4)
      oop_slot = 4;
    else {
      assert(rOop == j_rarg5, "wrong register");
      oop_slot = 5;
    }

    oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
    int offset = oop_slot*VMRegImpl::stack_slot_size;
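
    // Example: for rOop == j_rarg1 this yields
    //   oop_slot = 1 * VMRegImpl::slots_per_word + oop_handle_offset
    // (two jint slots per 64-bit word), and a byte offset of oop_slot * 4
    // from rsp into the oop handle area.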

    map->set_oop(VMRegImpl::stack2reg(oop_slot));
    // Store oop in handle area, may be NULL
    __ movptr(Address(rsp, offset), rOop);
    if (is_receiver) {
      *receiver_offset = offset;
    }

    __ cmpptr(rOop, (int32_t)NULL_WORD);
    __ lea(rHandle, Address(rsp, offset));
    // conditionally move a NULL from the handle area where it was just stored
    __ cmovptr(Assembler::equal, rHandle, Address(rsp, offset));
  }

  // If arg is on the stack then place it; otherwise it is already in the correct reg.
  if (dst.first()->is_stack()) {
    __ movptr(Address(rsp, reg2offset_out(dst.first())), rHandle);
  }
}

// A float arg may have to do float reg -> int reg conversion
static void float_move(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
  assert(!src.second()->is_valid() && !dst.second()->is_valid(), "bad float_move");

  // The calling convention assures us that each VMRegPair is either
  // all really one physical register or adjacent stack slots.
  // This greatly simplifies the cases here compared to sparc.

  if (src.first()->is_stack()) {
    if (dst.first()->is_stack()) {
      __ movl(rax, Address(rbp, reg2offset_in(src.first())));
      __ movptr(Address(rsp, reg2offset_out(dst.first())), rax);
    } else {
      // stack to reg
      assert(dst.first()->is_XMMRegister(), "only expect xmm registers as parameters");
      __ movflt(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first())));
    }
  } else if (dst.first()->is_stack()) {
    // reg to stack
    assert(src.first()->is_XMMRegister(), "only expect xmm registers as parameters");
    __ movflt(Address(rsp, reg2offset_out(dst.first())), src.first()->as_XMMRegister());
  } else {
    // reg to reg
    // In theory these overlap but the ordering is such that this is likely a nop
    if ( src.first() != dst.first()) {
      __ movdbl(dst.first()->as_XMMRegister(),  src.first()->as_XMMRegister());
    }
  }
}

// A long move
static void long_move(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {

  // The calling convention assures us that each VMRegPair is either
  // all really one physical register or adjacent stack slots.
  // This greatly simplifies the cases here compared to sparc.

  if (src.is_single_phys_reg() ) {
    if (dst.is_single_phys_reg()) {
      if (dst.first() != src.first()) {
        __ mov(dst.first()->as_Register(), src.first()->as_Register());
      }
    } else {
      assert(dst.is_single_reg(), "not a stack pair");
      __ movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
    }
  } else if (dst.is_single_phys_reg()) {
    assert(src.is_single_reg(),  "not a stack pair");
    __ movq(dst.first()->as_Register(), Address(rbp, reg2offset_out(src.first())));
  } else {
    assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
    __ movq(rax, Address(rbp, reg2offset_in(src.first())));
    __ movq(Address(rsp, reg2offset_out(dst.first())), rax);
  }
}

// A double move
static void double_move(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {

  // The calling convention assures us that each VMRegPair is either
  // all really one physical register or adjacent stack slots.
  // This greatly simplifies the cases here compared to sparc.

  if (src.is_single_phys_reg() ) {
    if (dst.is_single_phys_reg()) {
      // In theory these overlap but the ordering is such that this is likely a nop
      if ( src.first() != dst.first()) {
        __ movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
      }
    } else {
      assert(dst.is_single_reg(), "not a stack pair");
      __ movdbl(Address(rsp, reg2offset_out(dst.first())), src.first()->as_XMMRegister());
    }
  } else if (dst.is_single_phys_reg()) {
    assert(src.is_single_reg(),  "not a stack pair");
    __ movdbl(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_out(src.first())));
  } else {
    assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
    __ movq(rax, Address(rbp, reg2offset_in(src.first())));
    __ movq(Address(rsp, reg2offset_out(dst.first())), rax);
  }
}


void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below frame pointer
  // which by this time is free to use
  switch (ret_type) {
  case T_FLOAT:
    __ movflt(Address(rbp, -wordSize), xmm0);
    break;
  case T_DOUBLE:
    __ movdbl(Address(rbp, -wordSize), xmm0);
    break;
  case T_VOID:  break;
  default: {
    __ movptr(Address(rbp, -wordSize), rax);
    }
  }
}

void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below frame pointer
  // which by this time is free to use
  switch (ret_type) {
  case T_FLOAT:
    __ movflt(xmm0, Address(rbp, -wordSize));
    break;
  case T_DOUBLE:
    __ movdbl(xmm0, Address(rbp, -wordSize));
    break;
  case T_VOID:  break;
  default: {
    __ movptr(rax, Address(rbp, -wordSize));
    }
  }
}

static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
    for ( int i = first_arg ; i < arg_count ; i++ ) {
      if (args[i].first()->is_Register()) {
        __ push(args[i].first()->as_Register());
      } else if (args[i].first()->is_XMMRegister()) {
        __ subptr(rsp, 2*wordSize);
        __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
      }
    }
}

static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
    for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
      if (args[i].first()->is_Register()) {
        __ pop(args[i].first()->as_Register());
      } else if (args[i].first()->is_XMMRegister()) {
        __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
        __ addptr(rsp, 2*wordSize);
      }
    }
}
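
// Note: restore_args walks in the reverse order of save_args since the
// saved values come back off the stack LIFO.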

// ---------------------------------------------------------------------------
// Generate a native wrapper for a given method.  The method takes arguments
// in the Java compiled code convention, marshals them to the native
// convention (handlizes oops, etc), transitions to native, makes the call,
// returns to java state (possibly blocking), unhandlizes any result and
// returns.
nmethod *SharedRuntime::generate_native_wrapper(MacroAssembler *masm,
                                                methodHandle method,
                                                int total_in_args,
                                                int comp_args_on_stack,
                                                BasicType *in_sig_bt,
                                                VMRegPair *in_regs,
                                                BasicType ret_type) {
  // Native nmethod wrappers never take possession of the oop arguments.
  // So the caller will gc the arguments. The only thing we need an
  // oopMap for is if the call is static
  //
  // An OopMap for lock (and class if static)
  OopMapSet *oop_maps = new OopMapSet();
  intptr_t start = (intptr_t)__ pc();

  // We have received a description of where all the java args are located
  // on entry to the wrapper. We need to convert these args to where
  // the jni function will expect them. To figure out where they go
  // we convert the java signature to a C signature by inserting
  // the hidden arguments as arg[0] and possibly arg[1] (static method)

  int total_c_args = total_in_args + 1;
  if (method->is_static()) {
    total_c_args++;
  }

  BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
  VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair,   total_c_args);

  int argc = 0;
  out_sig_bt[argc++] = T_ADDRESS;
  if (method->is_static()) {
    out_sig_bt[argc++] = T_OBJECT;
  }

  for (int i = 0; i < total_in_args ; i++ ) {
    out_sig_bt[argc++] = in_sig_bt[i];
  }
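
  // Example (hypothetical static native method taking (Object, long)): the
  // outgoing C signature built above would be
  //   { T_ADDRESS /* JNIEnv* */, T_OBJECT /* class */, T_OBJECT, T_LONG, T_VOID }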

  // Now figure out where the args must be stored and how much stack space
  // they require.
  //
  int out_arg_slots;
  out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);

  // Compute framesize for the wrapper.  We need to handlize all oops in
  // incoming registers

  // Calculate the total number of stack slots we will need.

  // First count the abi requirement plus all of the outgoing args
  int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;

  // Now the space for the inbound oop handle area

  int oop_handle_offset = stack_slots;
  stack_slots += 6*VMRegImpl::slots_per_word;

  // Now any space we need for handlizing a klass if static method

  int oop_temp_slot_offset = 0;
  int klass_slot_offset = 0;
  int klass_offset = -1;
  int lock_slot_offset = 0;
  bool is_static = false;

  if (method->is_static()) {
    klass_slot_offset = stack_slots;
    stack_slots += VMRegImpl::slots_per_word;
    klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
    is_static = true;
  }

  // Plus a lock if needed

  if (method->is_synchronized()) {
    lock_slot_offset = stack_slots;
    stack_slots += VMRegImpl::slots_per_word;
  }

  // Now a place (+2) to save return values or temp during shuffling
  // + 4 for return address (which we own) and saved rbp
  stack_slots += 6;

  // OK, the space we have allocated will look like:
  //
  //
  // FP-> |                     |
  //      |---------------------|
  //      | 2 slots for moves   |
  //      |---------------------|
  //      | lock box (if sync)  |
  //      |---------------------| <- lock_slot_offset
  //      | klass (if static)   |
  //      |---------------------| <- klass_slot_offset
  //      | oopHandle area      |
  //      |---------------------| <- oop_handle_offset (6 java arg registers)
  //      | outbound memory     |
  //      | based arguments     |
  //      |                     |
  //      |---------------------|
  //      |                     |
  // SP-> | out_preserved_slots |
  //
  //


  // Now compute the actual number of stack words we need, rounding to make
  // the stack properly aligned.
  stack_slots = round_to(stack_slots, 4 * VMRegImpl::slots_per_word);
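
  // On a 64-bit VM slots_per_word == 2, so this rounds stack_slots up to a
  // multiple of 8 jint slots (32 bytes), which keeps the frame a multiple of
  // the required 16-byte alignment.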

  int stack_size = stack_slots * VMRegImpl::stack_slot_size;


  // First thing: make an ic check to see if we should even be here

  // We are free to use all registers as temps without saving them and
  // restoring them except rbp. rbp is the only callee save register
  // as far as the interpreter and the compiler(s) are concerned.


  const Register ic_reg = rax;
  const Register receiver = j_rarg0;
  const Register tmp = rdx;

  Label ok;
  Label exception_pending;

  __ verify_oop(receiver);
  __ push(tmp); // spill (any other registers free here???)
  __ load_klass(tmp, receiver);
  __ cmpq(ic_reg, tmp);
  __ jcc(Assembler::equal, ok);

  __ pop(tmp);
  __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));

  __ bind(ok);
  __ pop(tmp);

  // Verified entry point must be aligned
  __ align(8);

  int vep_offset = ((intptr_t)__ pc()) - start;

  // The instruction at the verified entry point must be 5 bytes or longer
  // because it can be patched on the fly by make_non_entrant. The stack bang
  // instruction fits that requirement.

  // Generate stack overflow check

  if (UseStackBanging) {
    __ bang_stack_with_offset(StackShadowPages*os::vm_page_size());
  } else {
    // need a 5 byte instruction to allow MT safe patching to non-entrant
    __ fat_nop();
  }
1338  // Generate a new frame for the wrapper.
1339  __ enter();
1340  // -2 because return address is already present and so is saved rbp
1341  __ subptr(rsp, stack_size - 2*wordSize);
1342
1343  // Frame is now completed as far as size and linkage.
1344
1345  int frame_complete = ((intptr_t)__ pc()) - start;
1346
1347#ifdef ASSERT
1348  {
1349    Label L;
1350    __ mov(rax, rsp);
1351    __ andptr(rax, -16); // must be a 16-byte boundary (see amd64 ABI)
1352    __ cmpptr(rax, rsp);
1353    __ jcc(Assembler::equal, L);
1354    __ stop("improperly aligned stack");
1355    __ bind(L);
1356  }
1357#endif /* ASSERT */
1358
1359
1360  // We use r14 as the oop handle for the receiver/klass
1361  // It is callee save so it survives the call to native
1362
1363  const Register oop_handle_reg = r14;
1364
1365
1366
1367  //
1368  // We immediately shuffle the arguments so that any vm call we have to
1369  // make from here on out (sync slow path, jvmti, etc.) we will have
1370  // captured the oops from our caller and have a valid oopMap for
1371  // them.
1372
1373  // -----------------
1374  // The Grand Shuffle
1375
1376  // The Java calling convention is either equal to (linux) or denser than
1377  // (win64) the C calling convention. However, because of the jni_env
1378  // argument, the C calling convention always has at least one more (and
1379  // two for static) arguments than Java. Therefore if we move the args
1380  // from java -> c backwards we will never have a register->register
1381  // conflict, and we don't have to build a dependency graph and break cycles.
1382  //
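      // For example (linux register assignments, purely illustrative): a static
      // method's java arg0/arg1 arrive in j_rarg0/j_rarg1 (rsi/rdx) but must
      // end up in c_rarg2/c_rarg3 (rdx/rcx). Moving backwards does arg1 first
      // (rdx -> rcx) and only then arg0 (rsi -> rdx), so no move ever reads a
      // register that an earlier move has already overwritten.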
1383
1384  // Record esp-based slot for receiver on stack for non-static methods
1385  int receiver_offset = -1;
1386
1387  // This is a trick. We double the stack slots so we can claim
1388  // the oops in the caller's frame. Since we are sure to have
1389  // more args than the caller, doubling is enough to make
1390  // sure we can capture all the incoming oop args from the
1391  // caller.
1392  //
1393  OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1394
1395  // Mark location of rbp (someday)
1396  // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
1397
1398  // Use eax, ebx as temporaries during any memory-memory moves we have to do
1399  // All inbound args are referenced based on rbp and all outbound args via rsp.
1400
1401
1402#ifdef ASSERT
1403  bool reg_destroyed[RegisterImpl::number_of_registers];
1404  bool freg_destroyed[XMMRegisterImpl::number_of_registers];
1405  for ( int r = 0 ; r < RegisterImpl::number_of_registers ; r++ ) {
1406    reg_destroyed[r] = false;
1407  }
1408  for ( int f = 0 ; f < XMMRegisterImpl::number_of_registers ; f++ ) {
1409    freg_destroyed[f] = false;
1410  }
1411
1412#endif /* ASSERT */
1413
1414
1415  int c_arg = total_c_args - 1;
1416  for ( int i = total_in_args - 1; i >= 0 ; i--, c_arg-- ) {
1417#ifdef ASSERT
1418    if (in_regs[i].first()->is_Register()) {
1419      assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
1420    } else if (in_regs[i].first()->is_XMMRegister()) {
1421      assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
1422    }
1423    if (out_regs[c_arg].first()->is_Register()) {
1424      reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
1425    } else if (out_regs[c_arg].first()->is_XMMRegister()) {
1426      freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
1427    }
1428#endif /* ASSERT */
1429    switch (in_sig_bt[i]) {
1430      case T_ARRAY:
1431      case T_OBJECT:
1432        object_move(masm, map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
1433                    ((i == 0) && (!is_static)),
1434                    &receiver_offset);
1435        break;
1436      case T_VOID:
1437        break;
1438
1439      case T_FLOAT:
1440        float_move(masm, in_regs[i], out_regs[c_arg]);
1441        break;
1442
1443      case T_DOUBLE:
1444        assert( i + 1 < total_in_args &&
1445                in_sig_bt[i + 1] == T_VOID &&
1446                out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
1447        double_move(masm, in_regs[i], out_regs[c_arg]);
1448        break;
1449
1450      case T_LONG :
1451        long_move(masm, in_regs[i], out_regs[c_arg]);
1452        break;
1453
1454      case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
1455
1456      default:
1457        move32_64(masm, in_regs[i], out_regs[c_arg]);
1458    }
1459  }
1460
1461  // point c_arg at the first arg that is already loaded in case we
1462  // need to spill before we call out
1463  c_arg++;
1464
1465  // Pre-load a static method's oop into r14.  Used both by locking code and
1466  // the normal JNI call code.
1467  if (method->is_static()) {
1468
1469    //  load oop into a register
1470    __ movoop(oop_handle_reg, JNIHandles::make_local(Klass::cast(method->method_holder())->java_mirror()));
1471
1472    // Now handlize the static class mirror; it's known to be non-null.
1473    __ movptr(Address(rsp, klass_offset), oop_handle_reg);
1474    map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
1475
1476    // Now get the handle
1477    __ lea(oop_handle_reg, Address(rsp, klass_offset));
1478    // store the klass handle as second argument
1479    __ movptr(c_rarg1, oop_handle_reg);
1480    // and protect the arg if we must spill
1481    c_arg--;
1482  }
1483
1484  // Change state to native (we save the return address in the thread, since it might not
1485  // be pushed on the stack when we do a stack traversal). It is enough that the pc()
1486  // points into the right code segment. It does not have to be the correct return pc.
1487  // We use the same pc/oopMap repeatedly when we call out
1488
1489  intptr_t the_pc = (intptr_t) __ pc();
1490  oop_maps->add_gc_map(the_pc - start, map);
1491
1492  __ set_last_Java_frame(rsp, noreg, (address)the_pc);
1493
1494
1495  // We have all of the arguments set up at this point. We must not touch any of the
1496  // argument registers from here on (if we were to save/restore them, no oopMap would cover any oops they hold).
1497
1498  {
1499    SkipIfEqual skip(masm, &DTraceMethodProbes, false);
1500    // protect the args we've loaded
1501    save_args(masm, total_c_args, c_arg, out_regs);
1502    __ movoop(c_rarg1, JNIHandles::make_local(method()));
1503    __ call_VM_leaf(
1504      CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
1505      r15_thread, c_rarg1);
1506    restore_args(masm, total_c_args, c_arg, out_regs);
1507  }
1508
1509  // Lock a synchronized method
1510
1511  // Register definitions used by locking and unlocking
1512
1513  const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
1514  const Register obj_reg  = rbx;  // Will contain the oop
1515  const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
1516  const Register old_hdr  = r13;  // value of old header at unlock time
1517
1518  Label slow_path_lock;
1519  Label lock_done;
1520
1521  if (method->is_synchronized()) {
1522
1523
1524    const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
1525
1526    // Get the handle (the 2nd argument)
1527    __ mov(oop_handle_reg, c_rarg1);
1528
1529    // Get address of the box
1530
1531    __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
1532
1533    // Load the oop from the handle
1534    __ movptr(obj_reg, Address(oop_handle_reg, 0));
1535
1536    if (UseBiasedLocking) {
1537      __ biased_locking_enter(lock_reg, obj_reg, swap_reg, rscratch1, false, lock_done, &slow_path_lock);
1538    }
1539
1540    // Load immediate 1 into swap_reg %rax
1541    __ movl(swap_reg, 1);
1542
1543    // Load (object->mark() | 1) into swap_reg %rax
1544    __ orptr(swap_reg, Address(obj_reg, 0));
1545
1546    // Save (object->mark() | 1) into BasicLock's displaced header
1547    __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
1548
1549    if (os::is_MP()) {
1550      __ lock();
1551    }
1552
1553    // src -> dest iff dest == rax else rax <- dest
1554    __ cmpxchgptr(lock_reg, Address(obj_reg, 0));
1555    __ jcc(Assembler::equal, lock_done);
1556
1557    // Hmm should this move to the slow path code area???
1558
1559    // Test if the oopMark is an obvious stack pointer, i.e.,
1560    //  1) (mark & 3) == 0, and
1561    //  2) rsp <= mark < rsp + os::pagesize()
1562    // These 3 tests can be done by evaluating the following
1563    // expression: ((mark - rsp) & (3 - os::vm_page_size())),
1564    // assuming both stack pointer and pagesize have their
1565    // least significant 2 bits clear.
1566    // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
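        // To see why this works, take a 4096-byte page: 3 - 4096 == -4093,
        // i.e. ...fffff003, so the mask keeps the two low (tag) bits and every
        // bit at or above the page size; (mark - rsp) & mask is then zero
        // exactly when the mark has a zero tag and lies in [rsp, rsp + page).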
1567
1568    __ subptr(swap_reg, rsp);
1569    __ andptr(swap_reg, 3 - os::vm_page_size());
1570
1571    // Save the test result, for recursive case, the result is zero
1572    __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
1573    __ jcc(Assembler::notEqual, slow_path_lock);
1574
1575    // Slow path will re-enter here
1576
1577    __ bind(lock_done);
1578  }
1579
1580
1581  // Finally just about ready to make the JNI call
1582
1583
1584  // get JNIEnv* which is first argument to native
1585
1586  __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
1587
1588  // Now set thread in native
1589  __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
1590
1591  __ call(RuntimeAddress(method->native_function()));
1592
1593  // Either restore the MXCSR register after returning from the JNI call
1594  // or verify that it wasn't changed.
1595  if (RestoreMXCSROnJNICalls) {
1596    __ ldmxcsr(ExternalAddress(StubRoutines::x86::mxcsr_std()));
1597  } else if (CheckJNICalls) {
1598    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::verify_mxcsr_entry())));
1599  }
1602
1603
1604  // Unpack native results.
1605  switch (ret_type) {
1606  case T_BOOLEAN: __ c2bool(rax);            break;
1607  case T_CHAR   : __ movzwl(rax, rax);       break;
1608  case T_BYTE   : __ sign_extend_byte (rax); break;
1609  case T_SHORT  : __ sign_extend_short(rax); break;
1610  case T_INT    : /* nothing to do */        break;
1611  case T_DOUBLE :
1612  case T_FLOAT  :
1613    // Result is in xmm0; we'll save it as needed
1614    break;
1615  case T_ARRAY  :               // Really a handle
1616  case T_OBJECT :               // Really a handle
1617    break;                      // can't de-handlize until after safepoint check
1618  case T_VOID   : break;
1619  case T_LONG   : break;
1620  default       : ShouldNotReachHere();
1621  }
1622
1623  // Switch thread to "native transition" state before reading the synchronization state.
1624  // This additional state is necessary because reading and testing the synchronization
1625  // state is not atomic w.r.t. GC, as this scenario demonstrates:
1626  //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
1627  //     VM thread changes sync state to synchronizing and suspends threads for GC.
1628  //     Thread A is resumed to finish this native method, but doesn't block here since it
1629  //     didn't see any synchronization in progress, and escapes.
1630  __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
1631
1632  if (os::is_MP()) {
1633    if (UseMembar) {
1634      // Force this write out before the read below
1635      __ membar(Assembler::Membar_mask_bits(
1636           Assembler::LoadLoad | Assembler::LoadStore |
1637           Assembler::StoreLoad | Assembler::StoreStore));
1638    } else {
1639      // Write serialization page so VM thread can do a pseudo remote membar.
1640      // We use the current thread pointer to calculate a thread specific
1641      // offset to write to within the page. This minimizes bus traffic
1642      // due to cache line collision.
1643      __ serialize_memory(r15_thread, rcx);
1644    }
1645  }
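      // (On x86 only the StoreLoad ordering above actually requires code; the
      // membar is typically emitted as a locked instruction or mfence, so the
      // UseMembar path is cheap.)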
1646
1647
1648  // check for safepoint operation in progress and/or pending suspend requests
1649  {
1650    Label Continue;
1651
1652    __ cmp32(ExternalAddress((address)SafepointSynchronize::address_of_state()),
1653             SafepointSynchronize::_not_synchronized);
1654
1655    Label L;
1656    __ jcc(Assembler::notEqual, L);
1657    __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
1658    __ jcc(Assembler::equal, Continue);
1659    __ bind(L);
1660
1661    // Don't use call_VM as it will see a possible pending exception and forward it
1662    // and never return here preventing us from clearing _last_native_pc down below.
1663    // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are
1664    // preserved and correspond to the bcp/locals pointers. So we do a runtime call
1665    // by hand.
1666    //
1667    save_native_result(masm, ret_type, stack_slots);
1668    __ mov(c_rarg0, r15_thread);
1669    __ mov(r12, rsp); // remember sp
1670    __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
1671    __ andptr(rsp, -16); // align stack as required by ABI
1672    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
1673    __ mov(rsp, r12); // restore sp
1674    __ reinit_heapbase();
1675    // Restore any method result value
1676    restore_native_result(masm, ret_type, stack_slots);
1677    __ bind(Continue);
1678  }
1679
1680  // change thread state
1681  __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
1682
1683  Label reguard;
1684  Label reguard_done;
1685  __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), JavaThread::stack_guard_yellow_disabled);
1686  __ jcc(Assembler::equal, reguard);
1687  __ bind(reguard_done);
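      // (If the native code grew the stack into the yellow zone the guard
      // pages were disabled; the out-of-line 'reguard' code below re-arms them
      // before we run Java code again.)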
1688
1689  // native result if any is live
1690
1691  // Unlock
1692  Label unlock_done;
1693  Label slow_path_unlock;
1694  if (method->is_synchronized()) {
1695
1696    // Get locked oop from the handle we passed to jni
1697    __ movptr(obj_reg, Address(oop_handle_reg, 0));
1698
1699    Label done;
1700
1701    if (UseBiasedLocking) {
1702      __ biased_locking_exit(obj_reg, old_hdr, done);
1703    }
1704
1705    // Simple recursive lock?
1706
1707    __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), (int32_t)NULL_WORD);
1708    __ jcc(Assembler::equal, done);
1709
1710    // Must save rax if it is live now because cmpxchg must use it
1711    if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
1712      save_native_result(masm, ret_type, stack_slots);
1713    }
1714
1715
1716    // get address of the stack lock
1717    __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
1718    //  get old displaced header
1719    __ movptr(old_hdr, Address(rax, 0));
1720
1721    // Atomic swap old header if oop still contains the stack lock
1722    if (os::is_MP()) {
1723      __ lock();
1724    }
1725    __ cmpxchgptr(old_hdr, Address(obj_reg, 0));
1726    __ jcc(Assembler::notEqual, slow_path_unlock);
1727
1728    // slow path re-enters here
1729    __ bind(unlock_done);
1730    if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
1731      restore_native_result(masm, ret_type, stack_slots);
1732    }
1733
1734    __ bind(done);
1735
1736  }
1737  {
1738    SkipIfEqual skip(masm, &DTraceMethodProbes, false);
1739    save_native_result(masm, ret_type, stack_slots);
1740    __ movoop(c_rarg1, JNIHandles::make_local(method()));
1741    __ call_VM_leaf(
1742         CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
1743         r15_thread, c_rarg1);
1744    restore_native_result(masm, ret_type, stack_slots);
1745  }
1746
1747  __ reset_last_Java_frame(false, true);
1748
1749  // Unpack oop result
1750  if (ret_type == T_OBJECT || ret_type == T_ARRAY) {
1751      Label L;
1752      __ testptr(rax, rax);
1753      __ jcc(Assembler::zero, L);
1754      __ movptr(rax, Address(rax, 0));
1755      __ bind(L);
1756      __ verify_oop(rax);
1757  }
1758
1759  // reset handle block
1760  __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
1761  __ movptr(Address(rcx, JNIHandleBlock::top_offset_in_bytes()), (int32_t)NULL_WORD);
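      // (Zeroing the block's top index in effect frees, in a single store, the
      // local JNI handles the native call allocated in this block.)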
1762
1763  // pop our frame
1764
1765  __ leave();
1766
1767  // Any exception pending?
1768  __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
1769  __ jcc(Assembler::notEqual, exception_pending);
1770
1771  // Return
1772
1773  __ ret(0);
1774
1775  // Unexpected paths are out of line and go here
1776
1777  // forward the exception
1778  __ bind(exception_pending);
1779
1781  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1782
1783
1784  // Slow path locking & unlocking
1785  if (method->is_synchronized()) {
1786
1787    // BEGIN Slow path lock
1788    __ bind(slow_path_lock);
1789
1790    // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
1791    // args are (oop obj, BasicLock* lock, JavaThread* thread)
1792
1793    // protect the args we've loaded
1794    save_args(masm, total_c_args, c_arg, out_regs);
1795
1796    __ mov(c_rarg0, obj_reg);
1797    __ mov(c_rarg1, lock_reg);
1798    __ mov(c_rarg2, r15_thread);
1799
1800    // Not a leaf but we have last_Java_frame setup as we want
1801    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
1802    restore_args(masm, total_c_args, c_arg, out_regs);
1803
1804#ifdef ASSERT
1805    { Label L;
1806      __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
1807      __ jcc(Assembler::equal, L);
1808      __ stop("no pending exception allowed on exit from monitorenter");
1809      __ bind(L);
1810    }
1811#endif
1812    __ jmp(lock_done);
1813
1814    // END Slow path lock
1815
1816    // BEGIN Slow path unlock
1817    __ bind(slow_path_unlock);
1818
1819    // If we haven't already saved the native result we must save it now as xmm registers
1820    // are still exposed.
1821
1822    if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
1823      save_native_result(masm, ret_type, stack_slots);
1824    }
1825
1826    __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
1827
1828    __ mov(c_rarg0, obj_reg);
1829    __ mov(r12, rsp); // remember sp
1830    __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
1831    __ andptr(rsp, -16); // align stack as required by ABI
1832
1833    // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
1834    // NOTE that obj_reg == rbx currently
1835    __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
1836    __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
1837
1838    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
1839    __ mov(rsp, r12); // restore sp
1840    __ reinit_heapbase();
1841#ifdef ASSERT
1842    {
1843      Label L;
1844      __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int)NULL_WORD);
1845      __ jcc(Assembler::equal, L);
1846      __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
1847      __ bind(L);
1848    }
1849#endif /* ASSERT */
1850
1851    __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
1852
1853    if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
1854      restore_native_result(masm, ret_type, stack_slots);
1855    }
1856    __ jmp(unlock_done);
1857
1858    // END Slow path unlock
1859
1860  } // synchronized
1861
1862  // SLOW PATH Reguard the stack if needed
1863
1864  __ bind(reguard);
1865  save_native_result(masm, ret_type, stack_slots);
1866  __ mov(r12, rsp); // remember sp
1867  __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
1868  __ andptr(rsp, -16); // align stack as required by ABI
1869  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
1870  __ mov(rsp, r12); // restore sp
1871  __ reinit_heapbase();
1872  restore_native_result(masm, ret_type, stack_slots);
1873  // and continue
1874  __ jmp(reguard_done);
1875
1876
1877
1878  __ flush();
1879
1880  nmethod *nm = nmethod::new_native_nmethod(method,
1881                                            masm->code(),
1882                                            vep_offset,
1883                                            frame_complete,
1884                                            stack_slots / VMRegImpl::slots_per_word,
1885                                            (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
1886                                            in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
1887                                            oop_maps);
1888  return nm;
1889
1890}
1891
1892#ifdef HAVE_DTRACE_H
1893// ---------------------------------------------------------------------------
1894// Generate a dtrace nmethod for a given signature.  The method takes arguments
1895// in the Java compiled code convention, marshals them to the native
1896// abi and then leaves nops at the position you would expect to call a native
1897// function. When the probe is enabled the nops are replaced with a trap
1898// instruction that dtrace inserts and the trace will cause a notification
1899// to dtrace.
1900//
1901// The probes are only able to take primitive types and java/lang/String as
1902// arguments.  No other java types are allowed. Strings are converted to utf8
1903  // strings so that from dtrace's point of view java strings are converted to C
1904  // strings. There is an arbitrary fixed limit on the total space that a method
1905  // can use for converting the strings (256 chars per string in the signature),
1906  // so any java string larger than this is truncated.
1907
1908static int  fp_offset[ConcreteRegisterImpl::number_of_registers] = { 0 };
1909static bool offsets_initialized = false;
1910
1911
1912nmethod *SharedRuntime::generate_dtrace_nmethod(MacroAssembler *masm,
1913                                                methodHandle method) {
1914
1915
1916  // generate_dtrace_nmethod is guarded by a mutex so we are sure to
1917  // be single threaded in this method.
1918  assert(AdapterHandlerLibrary_lock->owned_by_self(), "must be");
1919
1920  if (!offsets_initialized) {
1921    fp_offset[c_rarg0->as_VMReg()->value()] = -1 * wordSize;
1922    fp_offset[c_rarg1->as_VMReg()->value()] = -2 * wordSize;
1923    fp_offset[c_rarg2->as_VMReg()->value()] = -3 * wordSize;
1924    fp_offset[c_rarg3->as_VMReg()->value()] = -4 * wordSize;
1925    fp_offset[c_rarg4->as_VMReg()->value()] = -5 * wordSize;
1926    fp_offset[c_rarg5->as_VMReg()->value()] = -6 * wordSize;
1927
1928    fp_offset[c_farg0->as_VMReg()->value()] = -7 * wordSize;
1929    fp_offset[c_farg1->as_VMReg()->value()] = -8 * wordSize;
1930    fp_offset[c_farg2->as_VMReg()->value()] = -9 * wordSize;
1931    fp_offset[c_farg3->as_VMReg()->value()] = -10 * wordSize;
1932    fp_offset[c_farg4->as_VMReg()->value()] = -11 * wordSize;
1933    fp_offset[c_farg5->as_VMReg()->value()] = -12 * wordSize;
1934    fp_offset[c_farg6->as_VMReg()->value()] = -13 * wordSize;
1935    fp_offset[c_farg7->as_VMReg()->value()] = -14 * wordSize;
1936
1937    offsets_initialized = true;
1938  }
1939  // Fill in the signature array, for the calling-convention call.
1940  int total_args_passed = method->size_of_parameters();
1941
1942  BasicType* in_sig_bt  = NEW_RESOURCE_ARRAY(BasicType, total_args_passed);
1943  VMRegPair  *in_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_args_passed);
1944
1945  // The signature we are going to use for the trap that dtrace will see:
1946  // java/lang/String is converted to a utf8 string, we drop "this", and any
1947  // other object is converted to NULL.  (A one-slot java/lang/Long object
1948  // reference is converted to a two-slot long, which is why we double the allocation).
1949  BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_args_passed * 2);
1950  VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_args_passed * 2);
1951
1952  int i=0;
1953  int total_strings = 0;
1954  int first_arg_to_pass = 0;
1955  int total_c_args = 0;
1956
1957  // Skip the receiver as dtrace doesn't want to see it
1958  if( !method->is_static() ) {
1959    in_sig_bt[i++] = T_OBJECT;
1960    first_arg_to_pass = 1;
1961  }
1962
1963  // We need to convert the java args to where a native (non-jni) function
1964  // would expect them. To figure out where they go we convert the java
1965  // signature to a C signature.
1966
1967  SignatureStream ss(method->signature());
1968  for ( ; !ss.at_return_type(); ss.next()) {
1969    BasicType bt = ss.type();
1970    in_sig_bt[i++] = bt;  // Collect remaining bits of signature
1971    out_sig_bt[total_c_args++] = bt;
1972    if( bt == T_OBJECT) {
1973      symbolOop s = ss.as_symbol_or_null();
1974      if (s == vmSymbols::java_lang_String()) {
1975        total_strings++;
1976        out_sig_bt[total_c_args-1] = T_ADDRESS;
1977      } else if (s == vmSymbols::java_lang_Boolean() ||
1978                 s == vmSymbols::java_lang_Character() ||
1979                 s == vmSymbols::java_lang_Byte() ||
1980                 s == vmSymbols::java_lang_Short() ||
1981                 s == vmSymbols::java_lang_Integer() ||
1982                 s == vmSymbols::java_lang_Float()) {
1983        out_sig_bt[total_c_args-1] = T_INT;
1984      } else if (s == vmSymbols::java_lang_Long() ||
1985                 s == vmSymbols::java_lang_Double()) {
1986        out_sig_bt[total_c_args-1] = T_LONG;
1987        out_sig_bt[total_c_args++] = T_VOID;
1988      }
1989    } else if ( bt == T_LONG || bt == T_DOUBLE ) {
1990      in_sig_bt[i++] = T_VOID;   // Longs & doubles take 2 Java slots
1991      // We convert double to long
1992      out_sig_bt[total_c_args-1] = T_LONG;
1993      out_sig_bt[total_c_args++] = T_VOID;
1994    } else if ( bt == T_FLOAT) {
1995      // We convert float to int
1996      out_sig_bt[total_c_args-1] = T_INT;
1997    }
1998  }
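      // E.g. (hypothetical signature): a static 'void log(String, long,
      // Integer)' comes out of this loop as
      //   out_sig_bt = { T_ADDRESS, T_LONG, T_VOID, T_INT }
      // - the String becomes a utf8 pointer, the long keeps its T_VOID upper
      // half, and the boxed Integer will be unboxed to a plain int.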
1999
2000  assert(i==total_args_passed, "validly parsed signature");
2001
2002  // Now get the compiled-Java layout as input arguments
2003  int comp_args_on_stack;
2004  comp_args_on_stack = SharedRuntime::java_calling_convention(
2005      in_sig_bt, in_regs, total_args_passed, false);
2006
2007  // Now figure out where the args must be stored and how much stack space
2008  // they require (neglecting out_preserve_stack_slots but including space for
2009  // storing the 1st six register arguments). It's weird; see int_stk_helper().
2010
2011  int out_arg_slots;
2012  out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);
2013
2014  // Calculate the total number of stack slots we will need.
2015
2016  // First count the abi requirement plus all of the outgoing args
2017  int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
2018
2019  // Now space for the string(s) we must convert
2020  int* string_locs   = NEW_RESOURCE_ARRAY(int, total_strings + 1);
2021  for (i = 0; i < total_strings ; i++) {
2022    string_locs[i] = stack_slots;
2023    stack_slots += max_dtrace_string_size / VMRegImpl::stack_slot_size;
2024  }
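      // E.g., two String args reserve two conversion buffers of
      // max_dtrace_string_size bytes each, recorded in string_locs[0..1] just
      // above the outgoing argument area.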
2025
2026  // Plus the temps we might need to juggle register args
2027  // regs take two slots each
2028  stack_slots += (Argument::n_int_register_parameters_c +
2029                  Argument::n_float_register_parameters_c) * 2;
2030
2031
2032  // + 4 for return address (which we own) and saved rbp
2033
2034  stack_slots += 4;
2035
2036  // Ok The space we have allocated will look like:
2037  //
2038  //
2039  // FP-> |                     |
2040  //      |---------------------|
2041  //      | string[n]           |
2042  //      |---------------------| <- string_locs[n]
2043  //      | string[n-1]         |
2044  //      |---------------------| <- string_locs[n-1]
2045  //      | ...                 |
2046  //      | ...                 |
2047  //      |---------------------| <- string_locs[1]
2048  //      | string[0]           |
2049  //      |---------------------| <- string_locs[0]
2050  //      | outbound memory     |
2051  //      | based arguments     |
2052  //      |                     |
2053  //      |---------------------|
2054  //      |                     |
2055  // SP-> | out_preserved_slots |
2056  //
2057  //
2058
2059  // Now compute the actual number of stack words we need, rounding to make
2060  // the stack properly aligned.
2061  stack_slots = round_to(stack_slots, 4 * VMRegImpl::slots_per_word);
2062
2063  int stack_size = stack_slots * VMRegImpl::stack_slot_size;
2064
2065  intptr_t start = (intptr_t)__ pc();
2066
2067  // First thing make an ic check to see if we should even be here
2068
2069  // We are free to use all registers as temps without saving them and
2070  // restoring them except rbp. rbp is the only callee save register
2071  // as far as the interpreter and the compiler(s) are concerned.
2072
2073  const Register ic_reg = rax;
2074  const Register receiver = rcx;
2075  Label hit;
2076  Label exception_pending;
2077
2078
2079  __ verify_oop(receiver);
2080  __ cmpl(ic_reg, Address(receiver, oopDesc::klass_offset_in_bytes()));
2081  __ jcc(Assembler::equal, hit);
2082
2083  __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
2084
2085  // Verified entry must be aligned for code patching, and the first 5 bytes
2086  // must be in the same cache line; if we align at 8 then we can be sure the
2087  // 5 bytes are in the same line.
2088  __ align(8);
2089
2090  __ bind(hit);
2091
2092  int vep_offset = ((intptr_t)__ pc()) - start;
2093
2094
2095  // The instruction at the verified entry point must be 5 bytes or longer
2096  // because it can be patched on the fly by make_non_entrant. The stack bang
2097  // instruction fits that requirement.
2098
2099  // Generate stack overflow check
2100
2101  if (UseStackBanging) {
2102    if (stack_size <= StackShadowPages*os::vm_page_size()) {
2103      __ bang_stack_with_offset(StackShadowPages*os::vm_page_size());
2104    } else {
2105      __ movl(rax, stack_size);
2106      __ bang_stack_size(rax, rbx);
2107    }
2108  } else {
2109    // need a 5 byte instruction to allow MT safe patching to non-entrant
2110    __ fat_nop();
2111  }
2112
2113  assert(((uintptr_t)__ pc() - start - vep_offset) >= 5,
2114         "valid size for make_non_entrant");
2115
2116  // Generate a new frame for the wrapper.
2117  __ enter();
2118
2119  // -2 because return address is already present and so is saved rbp
2120  if (stack_size - 2*wordSize != 0) {
2121    __ subq(rsp, stack_size - 2*wordSize);
2122  }
2123
2124  // Frame is now completed as far as size and linkage.
2125
2126  int frame_complete = ((intptr_t)__ pc()) - start;
2127
2128  int c_arg, j_arg;
2129
2130  // State of input register args
2131
2132  bool  live[ConcreteRegisterImpl::number_of_registers];
2133
2134  live[j_rarg0->as_VMReg()->value()] = false;
2135  live[j_rarg1->as_VMReg()->value()] = false;
2136  live[j_rarg2->as_VMReg()->value()] = false;
2137  live[j_rarg3->as_VMReg()->value()] = false;
2138  live[j_rarg4->as_VMReg()->value()] = false;
2139  live[j_rarg5->as_VMReg()->value()] = false;
2140
2141  live[j_farg0->as_VMReg()->value()] = false;
2142  live[j_farg1->as_VMReg()->value()] = false;
2143  live[j_farg2->as_VMReg()->value()] = false;
2144  live[j_farg3->as_VMReg()->value()] = false;
2145  live[j_farg4->as_VMReg()->value()] = false;
2146  live[j_farg5->as_VMReg()->value()] = false;
2147  live[j_farg6->as_VMReg()->value()] = false;
2148  live[j_farg7->as_VMReg()->value()] = false;
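      // Only the java argument registers are tracked; any other entry of
      // live[] is either never read or is written by the loops below before it
      // is read.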
2149
2150
2151  bool rax_is_zero = false;
2152
2153  // All args (except strings) destined for the stack are moved first
2154  for (j_arg = first_arg_to_pass, c_arg = 0 ;
2155       j_arg < total_args_passed ; j_arg++, c_arg++ ) {
2156    VMRegPair src = in_regs[j_arg];
2157    VMRegPair dst = out_regs[c_arg];
2158
2159    // Get the real reg value or a dummy (rsp)
2160
2161    int src_reg = src.first()->is_reg() ?
2162                  src.first()->value() :
2163                  rsp->as_VMReg()->value();
2164
2165    bool useless =  in_sig_bt[j_arg] == T_ARRAY ||
2166                    (in_sig_bt[j_arg] == T_OBJECT &&
2167                     out_sig_bt[c_arg] != T_INT &&
2168                     out_sig_bt[c_arg] != T_ADDRESS &&
2169                     out_sig_bt[c_arg] != T_LONG);
2170
2171    live[src_reg] = !useless;
2172
2173    if (dst.first()->is_stack()) {
2174
2175      // Even though a string arg in a register is still live after this loop,
2176      // it will be dead after the string conversion loop (next), so we take
2177      // advantage of that now for simpler management of the live array.
2178
2179      live[src_reg] = false;
2180      switch (in_sig_bt[j_arg]) {
2181
2182        case T_ARRAY:
2183        case T_OBJECT:
2184          {
2185            Address stack_dst(rsp, reg2offset_out(dst.first()));
2186
2187            if (out_sig_bt[c_arg] == T_INT || out_sig_bt[c_arg] == T_LONG) {
2188              // need to unbox a one-word value
2189              Register in_reg = rax;
2190              if ( src.first()->is_reg() ) {
2191                in_reg = src.first()->as_Register();
2192              } else {
2193                __ movq(rax, Address(rbp, reg2offset_in(src.first())));
2194                rax_is_zero = false;
2195              }
2196              Label skipUnbox;
2197              __ movptr(Address(rsp, reg2offset_out(dst.first())),
2198                        (int32_t)NULL_WORD);
2199              __ testq(in_reg, in_reg);
2200              __ jcc(Assembler::zero, skipUnbox);
2201
2202              BasicType bt = out_sig_bt[c_arg];
2203              int box_offset = java_lang_boxing_object::value_offset_in_bytes(bt);
2204              Address src1(in_reg, box_offset);
2205              if ( bt == T_LONG ) {
2206                __ movq(in_reg,  src1);
2207                __ movq(stack_dst, in_reg);
2208                assert(out_sig_bt[c_arg+1] == T_VOID, "must be");
2209                ++c_arg; // skip over T_VOID to keep the loop indices in sync
2210              } else {
2211                __ movl(in_reg,  src1);
2212                __ movl(stack_dst, in_reg);
2213              }
2214
2215              __ bind(skipUnbox);
2216            } else if (out_sig_bt[c_arg] != T_ADDRESS) {
2217              // Convert the arg to NULL
2218              if (!rax_is_zero) {
2219                __ xorq(rax, rax);
2220                rax_is_zero = true;
2221              }
2222              __ movq(stack_dst, rax);
2223            }
2224          }
2225          break;
2226
2227        case T_VOID:
2228          break;
2229
2230        case T_FLOAT:
2231          // This does the right thing since we know it is destined for the
2232          // stack
2233          float_move(masm, src, dst);
2234          break;
2235
2236        case T_DOUBLE:
2237          // This does the right thing since we know it is destined for the
2238          // stack
2239          double_move(masm, src, dst);
2240          break;
2241
2242        case T_LONG :
2243          long_move(masm, src, dst);
2244          break;
2245
2246        case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2247
2248        default:
2249          move32_64(masm, src, dst);
2250      }
2251    }
2252
2253  }
2254
2255  // If we have any strings we must store any register based arg to the stack
2256  // This includes any still live xmm registers too.
2257
2258  int sid = 0;
2259
2260  if (total_strings > 0 ) {
2261    for (j_arg = first_arg_to_pass, c_arg = 0 ;
2262         j_arg < total_args_passed ; j_arg++, c_arg++ ) {
2263      VMRegPair src = in_regs[j_arg];
2264      VMRegPair dst = out_regs[c_arg];
2265
2266      if (src.first()->is_reg()) {
2267        Address src_tmp(rbp, fp_offset[src.first()->value()]);
2268
2269        // string oops were left untouched by the previous loop even if the
2270        // eventual (converted) arg is destined for the stack, so park them
2271        // away now (except for the first)
2272
2273        if (out_sig_bt[c_arg] == T_ADDRESS) {
2274          Address utf8_addr = Address(
2275              rsp, string_locs[sid++] * VMRegImpl::stack_slot_size);
2276          if (sid != 1) {
2277            // The first string arg won't be killed until after the utf8
2278            // conversion
2279            __ movq(utf8_addr, src.first()->as_Register());
2280          }
2281        } else if (dst.first()->is_reg()) {
2282          if (in_sig_bt[j_arg] == T_FLOAT || in_sig_bt[j_arg] == T_DOUBLE) {
2283
2284            // Convert the xmm register to an int and store it in the reserved
2285            // location for the eventual c register arg
2286            XMMRegister f = src.first()->as_XMMRegister();
2287            if (in_sig_bt[j_arg] == T_FLOAT) {
2288              __ movflt(src_tmp, f);
2289            } else {
2290              __ movdbl(src_tmp, f);
2291            }
2292          } else {
2293            // If the arg is an oop type we don't support, don't bother to store
2294            // it; remember, strings were handled above.
2295            bool useless =  in_sig_bt[j_arg] == T_ARRAY ||
2296                            (in_sig_bt[j_arg] == T_OBJECT &&
2297                             out_sig_bt[c_arg] != T_INT &&
2298                             out_sig_bt[c_arg] != T_LONG);
2299
2300            if (!useless) {
2301              __ movq(src_tmp, src.first()->as_Register());
2302            }
2303          }
2304        }
2305      }
2306      if (in_sig_bt[j_arg] == T_OBJECT && out_sig_bt[c_arg] == T_LONG) {
2307        assert(out_sig_bt[c_arg+1] == T_VOID, "must be");
2308        ++c_arg; // skip over T_VOID to keep the loop indices in sync
2309      }
2310    }
2311
2312    // Now that the volatile registers are safe, convert all the strings
2313    sid = 0;
2314
2315    for (j_arg = first_arg_to_pass, c_arg = 0 ;
2316         j_arg < total_args_passed ; j_arg++, c_arg++ ) {
2317      if (out_sig_bt[c_arg] == T_ADDRESS) {
2318        // It's a string
2319        Address utf8_addr = Address(
2320            rsp, string_locs[sid++] * VMRegImpl::stack_slot_size);
2321        // The first string we find might still be in the original java arg
2322        // register
2323
2324        VMReg src = in_regs[j_arg].first();
2325
2326        // We will need to eventually save the final argument to the trap
2327        // in the non-volatile location dedicated to src. This is the offset
2328        // from fp we will use.
2329        int src_off = src->is_reg() ?
2330            fp_offset[src->value()] : reg2offset_in(src);
2331
2332        // This is where the argument will eventually reside
2333        VMRegPair dst = out_regs[c_arg];
2334
2335        if (src->is_reg()) {
2336          if (sid == 1) {
2337            __ movq(c_rarg0, src->as_Register());
2338          } else {
2339            __ movq(c_rarg0, utf8_addr);
2340          }
2341        } else {
2342          // arg is still in the original location
2343          __ movq(c_rarg0, Address(rbp, reg2offset_in(src)));
2344        }
2345        Label done, convert;
2346
2347        // see if the oop is NULL
2348        __ testq(c_rarg0, c_rarg0);
2349        __ jcc(Assembler::notEqual, convert);
2350
2351        if (dst.first()->is_reg()) {
2352          // Save the ptr to the utf8 string in the original src loc or the tmp
2353          // dedicated to it
2354          __ movq(Address(rbp, src_off), c_rarg0);
2355        } else {
2356          __ movq(Address(rsp, reg2offset_out(dst.first())), c_rarg0);
2357        }
2358        __ jmp(done);
2359
2360        __ bind(convert);
2361
2362        __ lea(c_rarg1, utf8_addr);
2363        if (dst.first()->is_reg()) {
2364          __ movq(Address(rbp, src_off), c_rarg1);
2365        } else {
2366          __ movq(Address(rsp, reg2offset_out(dst.first())), c_rarg1);
2367        }
2368        // And do the conversion
2369        __ call(RuntimeAddress(
2370                CAST_FROM_FN_PTR(address, SharedRuntime::get_utf)));
2371
2372        __ bind(done);
2373      }
2374      if (in_sig_bt[j_arg] == T_OBJECT && out_sig_bt[c_arg] == T_LONG) {
2375        assert(out_sig_bt[c_arg+1] == T_VOID, "must be");
2376        ++c_arg; // skip over T_VOID to keep the loop indices in sync
2377      }
2378    }
2379    // The get_utf call killed all the c_arg registers
2380    live[c_rarg0->as_VMReg()->value()] = false;
2381    live[c_rarg1->as_VMReg()->value()] = false;
2382    live[c_rarg2->as_VMReg()->value()] = false;
2383    live[c_rarg3->as_VMReg()->value()] = false;
2384    live[c_rarg4->as_VMReg()->value()] = false;
2385    live[c_rarg5->as_VMReg()->value()] = false;
2386
2387    live[c_farg0->as_VMReg()->value()] = false;
2388    live[c_farg1->as_VMReg()->value()] = false;
2389    live[c_farg2->as_VMReg()->value()] = false;
2390    live[c_farg3->as_VMReg()->value()] = false;
2391    live[c_farg4->as_VMReg()->value()] = false;
2392    live[c_farg5->as_VMReg()->value()] = false;
2393    live[c_farg6->as_VMReg()->value()] = false;
2394    live[c_farg7->as_VMReg()->value()] = false;
2395  }
2396
2397  // Now we can finally move the register args to their desired locations
2398
2399  rax_is_zero = false;
2400
2401  for (j_arg = first_arg_to_pass, c_arg = 0 ;
2402       j_arg < total_args_passed ; j_arg++, c_arg++ ) {
2403
2404    VMRegPair src = in_regs[j_arg];
2405    VMRegPair dst = out_regs[c_arg];
2406
2407    // Only need to look for args destined for the integer registers (since we
2408    // convert float/double args to look like int/long outbound)
2409    if (dst.first()->is_reg()) {
2410      Register r =  dst.first()->as_Register();
2411
2412      // Check if the java arg is unsupported and therefore useless
2413      bool useless =  in_sig_bt[j_arg] == T_ARRAY ||
2414                      (in_sig_bt[j_arg] == T_OBJECT &&
2415                       out_sig_bt[c_arg] != T_INT &&
2416                       out_sig_bt[c_arg] != T_ADDRESS &&
2417                       out_sig_bt[c_arg] != T_LONG);
2418
2419
2420      // If we're going to kill an existing arg save it first
2421      if (live[dst.first()->value()]) {
2422        // you can't kill yourself
2423        if (src.first() != dst.first()) {
2424          __ movq(Address(rbp, fp_offset[dst.first()->value()]), r);
2425        }
2426      }
2427      if (src.first()->is_reg()) {
2428        if (live[src.first()->value()] ) {
2429          if (in_sig_bt[j_arg] == T_FLOAT) {
2430            __ movdl(r, src.first()->as_XMMRegister());
2431          } else if (in_sig_bt[j_arg] == T_DOUBLE) {
2432            __ movdq(r, src.first()->as_XMMRegister());
2433          } else if (r != src.first()->as_Register()) {
2434            if (!useless) {
2435              __ movq(r, src.first()->as_Register());
2436            }
2437          }
2438        } else {
2439          // If the arg is an oop type we don't support, don't bother to store
2440          // it
2441          if (!useless) {
2442            if (in_sig_bt[j_arg] == T_DOUBLE ||
2443                in_sig_bt[j_arg] == T_LONG  ||
2444                in_sig_bt[j_arg] == T_OBJECT ) {
2445              __ movq(r, Address(rbp, fp_offset[src.first()->value()]));
2446            } else {
2447              __ movl(r, Address(rbp, fp_offset[src.first()->value()]));
2448            }
2449          }
2450        }
2451        live[src.first()->value()] = false;
2452      } else if (!useless) {
2453        // full sized move even for int should be ok
2454        __ movq(r, Address(rbp, reg2offset_in(src.first())));
2455      }
2456
2457      // At this point r has the original java arg in the final location
2458      // (assuming it wasn't useless). If the java arg was an oop
2459      // we have a bit more to do
2460
2461      if (in_sig_bt[j_arg] == T_ARRAY || in_sig_bt[j_arg] == T_OBJECT ) {
2462        if (out_sig_bt[c_arg] == T_INT || out_sig_bt[c_arg] == T_LONG) {
2463          // need to unbox a one-word value
2464          Label skip;
2465          __ testq(r, r);
2466          __ jcc(Assembler::equal, skip);
2467          BasicType bt = out_sig_bt[c_arg];
2468          int box_offset = java_lang_boxing_object::value_offset_in_bytes(bt);
2469          Address src1(r, box_offset);
2470          if ( bt == T_LONG ) {
2471            __ movq(r, src1);
2472          } else {
2473            __ movl(r, src1);
2474          }
2475          __ bind(skip);
2476
2477        } else if (out_sig_bt[c_arg] != T_ADDRESS) {
2478          // Convert the arg to NULL
2479          __ xorq(r, r);
2480        }
2481      }
2482
2483      // dst can no longer be holding an input value
2484      live[dst.first()->value()] = false;
2485    }
2486    if (in_sig_bt[j_arg] == T_OBJECT && out_sig_bt[c_arg] == T_LONG) {
2487      assert(out_sig_bt[c_arg+1] == T_VOID, "must be");
2488      ++c_arg; // skip over T_VOID to keep the loop indices in sync
2489    }
2490  }
2491
2492
2493  // Ok now we are done. Need to place the nop that dtrace wants in order to
2494  // patch in the trap
2495  int patch_offset = ((intptr_t)__ pc()) - start;
2496
2497  __ nop();
2498
2499
2500  // Return
2501
2502  __ leave();
2503  __ ret(0);
2504
2505  __ flush();
2506
2507  nmethod *nm = nmethod::new_dtrace_nmethod(
2508      method, masm->code(), vep_offset, patch_offset, frame_complete,
2509      stack_slots / VMRegImpl::slots_per_word);
2510  return nm;
2511
2512}
2513
2514#endif // HAVE_DTRACE_H
2515
2516// this function returns the adjustment (in number of words) to a c2i adapter
2517// activation for use during deoptimization
2518int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2519  return (callee_locals - callee_parameters) * Interpreter::stackElementWords();
2520}
2521
2522
2523uint SharedRuntime::out_preserve_stack_slots() {
2524  return 0;
2525}
2526
2527
2528//------------------------------generate_deopt_blob----------------------------
2529void SharedRuntime::generate_deopt_blob() {
2530  // Allocate space for the code
2531  ResourceMark rm;
2532  // Setup code generation tools
2533  CodeBuffer buffer("deopt_blob", 2048, 1024);
2534  MacroAssembler* masm = new MacroAssembler(&buffer);
2535  int frame_size_in_words;
2536  OopMap* map = NULL;
2537  OopMapSet *oop_maps = new OopMapSet();
2538
2539  // -------------
2540  // This code enters when returning to a de-optimized nmethod.  A return
2541  // address has been pushed on the stack, and return values are in
2542  // registers.
2543  // If we are doing a normal deopt then we were called from the patched
2544  // nmethod from the point we returned to the nmethod. So the return
2545  // address on the stack is wrong by NativeCall::instruction_size.
2546  // We will adjust the value so it looks like we have the original return
2547  // address on the stack (like when we eagerly deoptimized).
2548  // In the case of an exception pending when deoptimizing, we enter
2549  // with a return address on the stack that points after the call we patched
2550  // into the exception handler. We have the following register state from,
2551  // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2552  //    rax: exception oop
2553  //    rbx: exception handler
2554  //    rdx: throwing pc
2555  // So in this case we simply jam rdx into the useless return address and
2556  // the stack looks just like we want.
2557  //
2558  // At this point we need to de-opt.  We save the argument return
2559  // registers.  We call the first C routine, fetch_unroll_info().  This
2560  // routine captures the return values and returns a structure which
2561  // describes the current frame size and the sizes of all replacement frames.
2562  // The current frame is compiled code and may contain many inlined
2563  // functions, each with their own JVM state.  We pop the current frame, then
2564  // push all the new frames.  Then we call the C routine unpack_frames() to
2565  // populate these frames.  Finally unpack_frames() returns us the new target
2566  // address.  Notice that callee-save registers are BLOWN here; they have
2567  // already been captured in the vframeArray at the time the return PC was
2568  // patched.
2569  address start = __ pc();
2570  Label cont;
2571
2572  // Prolog for non exception case!
2573
2574  // Save everything in sight.
2575  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
2576
2577  // Normal deoptimization.  Save exec mode for unpack_frames.
2578  __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2579  __ jmp(cont);
2580
2581  int reexecute_offset = __ pc() - start;
2582
2583  // Reexecute case
2584  // the return address is the pc that describes what bci to re-execute at
2585
2586  // No need to update map as each call to save_live_registers will produce identical oopmap
2587  (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
2588
2589  __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2590  __ jmp(cont);
2591
2592  int exception_offset = __ pc() - start;
2593
2594  // Prolog for exception case
2595
2596  // all registers are dead at this entry point, except for rax and
2597  // rdx, which contain the exception oop and exception pc
2598  // respectively.  Set them in TLS and fall thru to the
2599  // unpack_with_exception_in_tls entry point.
2600
2601  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2602  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2603
2604  int exception_in_tls_offset = __ pc() - start;
2605
2606  // new implementation because exception oop is now passed in JavaThread
2607
2608  // Prolog for exception case
2609  // All registers must be preserved because they might be used by LinearScan
2610  // Exception oop and throwing PC are passed in JavaThread
2611  // tos: stack at point of call to method that threw the exception (i.e. only
2612  // args are on the stack, no return address)
2613
2614  // make room on stack for the return address
2615  // It will be patched later with the throwing pc. The correct value is not
2616  // available now because loading it from memory would destroy registers.
2617  __ push(0);
2618
2619  // Save everything in sight.
2620  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
2621
2622  // Now it is safe to overwrite any register
2623
2624  // Deopt during an exception.  Save exec mode for unpack_frames.
2625  __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2626
2627  // load throwing pc from JavaThread and patch it as the return address
2628  // of the current frame. Then clear the field in JavaThread
2629
2630  __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2631  __ movptr(Address(rbp, wordSize), rdx);
2632  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2633
2634#ifdef ASSERT
2635  // verify that there is really an exception oop in JavaThread
2636  __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2637  __ verify_oop(rax);
2638
2639  // verify that there is no pending exception
2640  Label no_pending_exception;
2641  __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2642  __ testptr(rax, rax);
2643  __ jcc(Assembler::zero, no_pending_exception);
2644  __ stop("must not have pending exception here");
2645  __ bind(no_pending_exception);
2646#endif
2647
2648  __ bind(cont);
2649
2650  // Call C code.  Need thread and this frame, but NOT official VM entry
2651  // crud.  We cannot block on this call, no GC can happen.
2652  //
2653  // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2654
2655  // fetch_unroll_info needs to call last_java_frame().
2656
2657  __ set_last_Java_frame(noreg, noreg, NULL);
2658#ifdef ASSERT
2659  { Label L;
2660    __ cmpptr(Address(r15_thread,
2661                    JavaThread::last_Java_fp_offset()),
2662            (int32_t)0);
2663    __ jcc(Assembler::equal, L);
2664    __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2665    __ bind(L);
2666  }
2667#endif // ASSERT
2668  __ mov(c_rarg0, r15_thread);
2669  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2670
2671  // Need to have an oopmap that tells fetch_unroll_info where to
2672  // find any register it might need.
2673  oop_maps->add_gc_map(__ pc() - start, map);
2674
2675  __ reset_last_Java_frame(false, false);
2676
2677  // Load UnrollBlock* into rdi
2678  __ mov(rdi, rax);
2679
2680  Label noException;
2681  __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2682  __ jcc(Assembler::notEqual, noException);
2683  __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2684  // QQQ this is useless; it was NULL above
2685  __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2686  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int32_t)NULL_WORD);
2687  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2688
2689  __ verify_oop(rax);
2690
2691  // Overwrite the result registers with the exception results.
2692  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2693  // I think this is useless
2694  __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2695
2696  __ bind(noException);
2697
2698  // Only register save data is on the stack.
2699  // Now restore the result registers.  Everything else is either dead
2700  // or captured in the vframeArray.
2701  RegisterSaver::restore_result_registers(masm);
2702
2703  // All of the register save area has been popped off the stack. Only the
2704  // return address remains.
2705
2706  // Pop all the frames we must move/replace.
2707  //
2708  // Frame picture (youngest to oldest)
2709  // 1: self-frame (no frame link)
2710  // 2: deopting frame  (no frame link)
2711  // 3: caller of deopting frame (could be compiled/interpreted).
2712  //
2713  // Note: by leaving the return address of self-frame on the stack
2714  // and using the size of frame 2 to adjust the stack
2715  // when we are done the return to frame 3 will still be on the stack.
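      // E.g., if the deopting nmethod had inlined two callees, frame 2 is
      // replaced by three skeletal interpreter frames (plus the re-pushed
      // self-frame), one per JVM state captured in the vframeArray.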
2716
2717  // Pop deoptimized frame
2718  __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes()));
2719  __ addptr(rsp, rcx);
2720
2721  // rsp should be pointing at the return address to the caller (3)
2722
2723  // Stack bang to make sure there's enough room for these interpreter frames.
2724  if (UseStackBanging) {
2725    __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
2726    __ bang_stack_size(rbx, rcx);
2727  }
2728
2729  // Load address of array of frame pcs into rcx
2730  __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
2731
2732  // Trash the old pc
2733  __ addptr(rsp, wordSize);
2734
2735  // Load address of array of frame sizes into rsi
2736  __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
2737
2738  // Load counter into rdx
2739  __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes()));
2740
2741  // Pick up the initial fp we should save
2742  __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_fp_offset_in_bytes()));
2743
2744  // Now adjust the caller's stack to make up for the extra locals, but
2745  // record the original sp so that we can save it in the skeletal interpreter
2746  // frame; the stack walking of interpreter_sender will then get the unextended
2747  // sp value and not the "real" sp value.
2748
2749  const Register sender_sp = r8;
2750
2751  __ mov(sender_sp, rsp);
2752  __ movl(rbx, Address(rdi,
2753                       Deoptimization::UnrollBlock::
2754                       caller_adjustment_offset_in_bytes()));
2755  __ subptr(rsp, rbx);
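
  // caller_adjustment was computed by fetch_unroll_info; it is the number of
  // bytes by which the caller's frame must be extended so that the locals of
  // the bottom interpreter frame fit. Extending rsp here (conceptually
  // rsp = sender_sp - caller_adjustment) grows the caller's outgoing area
  // before the skeletal frames are pushed on top of it.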
2756
2757  // Push interpreter frames in a loop
2758  Label loop;
2759  __ bind(loop);
2760  __ movptr(rbx, Address(rsi, 0));      // Load frame size
2761#ifdef CC_INTERP
2762  __ subptr(rbx, 4*wordSize);           // we'll push pc and ebp by hand, plus 2 dummy words
2763#ifdef ASSERT
2764  __ push(0xDEADDEAD);                  // Make a recognizable pattern
2765  __ push(0xDEADDEAD);
2766#else /* ASSERT */
2767  __ subptr(rsp, 2*wordSize);           // skip the "static long no_param"
2768#endif /* ASSERT */
2769#else
2770  __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
2771#endif // CC_INTERP
2772  __ pushptr(Address(rcx, 0));          // Save return address
2773  __ enter();                           // Save old & set new ebp
2774  __ subptr(rsp, rbx);                  // Prolog
2775#ifdef CC_INTERP
2776  __ movptr(Address(rbp,
2777                  -(sizeof(BytecodeInterpreter)) + in_bytes(byte_offset_of(BytecodeInterpreter, _sender_sp))),
2778            sender_sp); // Make it walkable
2779#else /* CC_INTERP */
2780  // This value is corrected by layout_activation_impl
2781  __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD);
2782  __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2783#endif /* CC_INTERP */
2784  __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
2785  __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
2786  __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
2787  __ decrementl(rdx);                   // Decrement counter
2788  __ jcc(Assembler::notZero, loop);
2789  __ pushptr(Address(rcx, 0));          // Save final return address
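
  // In C-like pseudocode, the loop above does the following for each of the
  // frames being (re)built:
  //
  //   for (int i = 0; i < number_of_frames; i++) {
  //     push(frame_pcs[i]);                    // return address for frame i
  //     push(rbp); rbp = rsp;                  // enter(): new frame link
  //     rsp -= frame_sizes[i] - 2*wordSize;    // pc and rbp already pushed
  //     fp->interpreter_frame_sender_sp = sender_sp;   // walkable skeleton
  //     sender_sp = rsp;
  //   }
  //   push(frame_pcs[number_of_frames]);       // final return address
  //
  // The skeletal frames are completed later by layout_activation_impl,
  // during the unpack_frames call below.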
2790
2791  // Re-push self-frame
2792  __ enter();                           // Save old & set new ebp
2793
2794  // Allocate a full sized register save area.
2795  // Return address and rbp are in place, so we allocate two fewer words.
2796  __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2797
2798  // Restore frame locals after moving the frame
2799  __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2800  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2801
2802  // Call C code.  Need thread but NOT official VM entry
2803  // crud.  We cannot block on this call, no GC can happen.  Call should
2804  // restore return values to their stack-slots with the new SP.
2805  //
2806  // BasicType Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2807
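  // The exec_mode for this deopt has been kept live in r14; it tells
  // unpack_frames whether this is a regular deopt, a deopt with a pending
  // exception, or a reexecute (the Deoptimization::Unpack_* values).
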
2808  // Use rbp because the frames look interpreted now
2809  __ set_last_Java_frame(noreg, rbp, NULL);
2810
2811  __ mov(c_rarg0, r15_thread);
2812  __ movl(c_rarg1, r14); // second arg: exec_mode
2813  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2814
2815  // Set an oopmap for the call site
2816  oop_maps->add_gc_map(__ pc() - start,
2817                       new OopMap( frame_size_in_words, 0 ));
2818
2819  __ reset_last_Java_frame(true, false);
2820
2821  // Collect return values
2822  __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2823  __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2824  // I think this is useless (throwing pc?)
2825  __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2826
2827  // Pop self-frame.
2828  __ leave();                           // Epilog
2829
2830  // Jump to interpreter
2831  __ ret(0);
2832
2833  // Make sure all code is generated
2834  masm->flush();
2835
2836  _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2837  _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2838}
2839
2840#ifdef COMPILER2
2841//------------------------------generate_uncommon_trap_blob--------------------
2842void SharedRuntime::generate_uncommon_trap_blob() {
2843  // Allocate space for the code
2844  ResourceMark rm;
2845  // Setup code generation tools
2846  CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
2847  MacroAssembler* masm = new MacroAssembler(&buffer);
2848
2849  assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
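  // framesize is measured in 4-byte compiler slots, so framesize % 4 == 0
  // means the frame is a whole number of 16-byte units and rsp keeps the
  // 16-byte alignment the x86_64 ABI requires at call sites.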
2850
2851  address start = __ pc();
2852
2853  // Push self-frame.  We get here with a return address on the
2854  // stack, so rsp is 8-byte aligned until we allocate our frame.
2855  __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog!
2856
2857  // No callee saved registers. rbp is assumed implicitly saved
2858  __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
2859
2860  // The compiler left unloaded_class_index in j_rarg0; move it to where
2861  // the runtime expects it.
2862  __ movl(c_rarg1, j_rarg0);
2863
2864  __ set_last_Java_frame(noreg, noreg, NULL);
2865
2866  // Call C code.  Need thread but NOT official VM entry
2867  // crud.  We cannot block on this call, no GC can happen.  Call should
2868  // capture callee-saved registers as well as return values.
2869  // Thread is in rdi already.
2870  //
2871  // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index);
2872
2873  __ mov(c_rarg0, r15_thread);
2874  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
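
  // uncommon_trap uses the index in c_rarg1 to record why we trapped,
  // deoptimizes the top compiled frame, and returns an UnrollBlock* in rax
  // describing the interpreter frames to build -- the same contract as
  // fetch_unroll_info in the regular deopt blob above.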
2875
2876  // Set an oopmap for the call site
2877  OopMapSet* oop_maps = new OopMapSet();
2878  OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
2879
2880  // location of rbp is known implicitly by the frame sender code
2881
2882  oop_maps->add_gc_map(__ pc() - start, map);
2883
2884  __ reset_last_Java_frame(false, false);
2885
2886  // Load UnrollBlock* into rdi
2887  __ mov(rdi, rax);
2888
2889  // Pop all the frames we must move/replace.
2890  //
2891  // Frame picture (youngest to oldest)
2892  // 1: self-frame (no frame link)
2893  // 2: deopting frame  (no frame link)
2894  // 3: caller of deopting frame (could be compiled/interpreted).
2895
2896  // Pop self-frame.  We have no frame, and must rely only on rax and rsp.
2897  __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
2898
2899  // Pop deoptimized frame (int)
2900  __ movl(rcx, Address(rdi,
2901                       Deoptimization::UnrollBlock::
2902                       size_of_deoptimized_frame_offset_in_bytes()));
2903  __ addptr(rsp, rcx);
2904
2905  // rsp should be pointing at the return address to the caller (3)
2906
2907  // Stack bang to make sure there's enough room for these interpreter frames.
2908  if (UseStackBanging) {
2909    __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
2910    __ bang_stack_size(rbx, rcx);
2911  }
2912
2913  // Load address of array of frame pcs into rcx (address*)
2914  __ movptr(rcx,
2915            Address(rdi,
2916                    Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
2917
2918  // Trash the return pc
2919  __ addptr(rsp, wordSize);
2920
2921  // Load address of array of frame sizes into rsi (intptr_t*)
2922  __ movptr(rsi, Address(rdi,
2923                         Deoptimization::UnrollBlock::
2924                         frame_sizes_offset_in_bytes()));
2925
2926  // Counter
2927  __ movl(rdx, Address(rdi,
2928                       Deoptimization::UnrollBlock::
2929                       number_of_frames_offset_in_bytes())); // (int)
2930
2931  // Pick up the initial fp we should save
2932  __ movptr(rbp,
2933            Address(rdi,
2934                    Deoptimization::UnrollBlock::initial_fp_offset_in_bytes()));
2935
2936  // Now adjust the caller's stack to make up for the extra locals, but
2937  // record the original sp so that we can save it in the skeletal
2938  // interpreter frame; the stack walking of interpreter_sender will then
2939  // get the unextended sp value and not the "real" sp value.
2940
2941  const Register sender_sp = r8;
2942
2943  __ mov(sender_sp, rsp);
2944  __ movl(rbx, Address(rdi,
2945                       Deoptimization::UnrollBlock::
2946                       caller_adjustment_offset_in_bytes())); // (int)
2947  __ subptr(rsp, rbx);
2948
2949  // Push interpreter frames in a loop
2950  Label loop;
2951  __ bind(loop);
2952  __ movptr(rbx, Address(rsi, 0)); // Load frame size
2953  __ subptr(rbx, 2 * wordSize);    // We'll push pc and rbp by hand
2954  __ pushptr(Address(rcx, 0));     // Save return address
2955  __ enter();                      // Save old & set new rbp
2956  __ subptr(rsp, rbx);             // Prolog
2957  __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
2958            sender_sp);            // Make it walkable
2959  // This value is corrected by layout_activation_impl
2960  __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD);
2961  __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
2962  __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
2963  __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
2964  __ decrementl(rdx);              // Decrement counter
2965  __ jcc(Assembler::notZero, loop);
2966  __ pushptr(Address(rcx, 0));     // Save final return address
2967
2968  // Re-push self-frame
2969  __ enter();                 // Save old & set new rbp
2970  __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
2971                              // Prolog
2972
2973  // Use rbp because the frames look interpreted now
2974  __ set_last_Java_frame(noreg, rbp, NULL);
2975
2976  // Call C code.  Need thread but NOT official VM entry
2977  // crud.  We cannot block on this call, no GC can happen.  Call should
2978  // restore return values to their stack-slots with the new SP.
2979  // Thread is in rdi already.
2980  //
2981  // BasicType unpack_frames(JavaThread* thread, int exec_mode);
2982
2983  __ mov(c_rarg0, r15_thread);
2984  __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
2985  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2986
2987  // Set an oopmap for the call site
2988  oop_maps->add_gc_map(__ pc() - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
2989
2990  __ reset_last_Java_frame(true, false);
2991
2992  // Pop self-frame.
2993  __ leave();                 // Epilog
2994
2995  // Jump to interpreter
2996  __ ret(0);
2997
2998  // Make sure all code is generated
2999  masm->flush();
3000
3001  _uncommon_trap_blob = UncommonTrapBlob::create(&buffer, oop_maps,
3002                                                 SimpleRuntimeFrame::framesize >> 1);
3003}
3004#endif // COMPILER2
3005
3006
3007//------------------------------generate_handler_blob--------------------
3008//
3009// Generate a special Compile2Runtime blob that saves all registers
3010// and sets up the oopmap.
3011//
3012static SafepointBlob* generate_handler_blob(address call_ptr, bool cause_return) {
3013  assert(StubRoutines::forward_exception_entry() != NULL,
3014         "must be generated before");
3015
3016  ResourceMark rm;
3017  OopMapSet *oop_maps = new OopMapSet();
3018  OopMap* map;
3019
3020  // Allocate space for the code.  Setup code generation tools.
3021  CodeBuffer buffer("handler_blob", 2048, 1024);
3022  MacroAssembler* masm = new MacroAssembler(&buffer);
3023
3024  address start   = __ pc();
3025  address call_pc = NULL;
3026  int frame_size_in_words;
3027
3028  // Make room for return address (or push it again)
3029  if (!cause_return) {
3030    __ push(rbx);
3031  }
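
  // When cause_return is false we got here via a fault on the polling page,
  // so no return address was pushed for us; rbx above is just a dummy slot
  // that is overwritten with the saved exception pc further down.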
3032
3033  // Save registers, fpu state, and flags
3034  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
3035
3036  // The following is basically a call_VM.  However, we need the precise
3037  // address of the call in order to generate an oopmap. Hence, we do all the
3038  // work ourselves.
3039
3040  __ set_last_Java_frame(noreg, noreg, NULL);
3041
3042  // The return address must always be correct so that the frame constructor
3043  // never sees an invalid pc.
3044
3045  if (!cause_return) {
3046    // overwrite the dummy value we pushed on entry
3047    __ movptr(c_rarg0, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3048    __ movptr(Address(rbp, wordSize), c_rarg0);
3049  }
3050
3051  // Do the call
3052  __ mov(c_rarg0, r15_thread);
3053  __ call(RuntimeAddress(call_ptr));
3054
3055  // Set an oopmap for the call site.  This oopmap will map all
3056  // oop-registers and debug-info registers as callee-saved.  This
3057  // will allow deoptimization at this safepoint to find all possible
3058  // debug-info recordings, as well as let GC find all oops.
3059
3060  oop_maps->add_gc_map( __ pc() - start, map);
3061
3062  Label noException;
3063
3064  __ reset_last_Java_frame(false, false);
3065
3066  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3067  __ jcc(Assembler::equal, noException);
3068
3069  // Exception pending
3070
3071  RegisterSaver::restore_live_registers(masm);
3072
3073  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3074
3075  // No exception case
3076  __ bind(noException);
3077
3078  // Normal exit, restore registers and exit.
3079  RegisterSaver::restore_live_registers(masm);
3080
3081  __ ret(0);
3082
3083  // Make sure all code is generated
3084  masm->flush();
3085
3086  // Fill-out other meta info
3087  return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3088}
3089
3090//
3091// generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3092//
3093// Generate a stub that calls into the VM to find out the proper destination
3094// of a Java call. All the argument registers are live at this point,
3095// but since this is generic code we don't know what they are, and the caller
3096// must do any gc of the args.
3097//
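// In outline, the stub below does:
//
//   save all live registers            (RegisterSaver::save_live_registers)
//   call destination(thread)           // resolves and patches the call site
//   rbx = thread->vm_result()          // the resolved methodOop
//   store rax/rbx back into the register save area
//   restore the registers and jmp rax  // enter the resolved callee
//
// or, if an exception became pending during resolution, restore the
// registers and forward the exception instead.
//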
3098static RuntimeStub* generate_resolve_blob(address destination, const char* name) {
3099  assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
3100
3101  // allocate space for the code
3102  ResourceMark rm;
3103
3104  CodeBuffer buffer(name, 1000, 512);
3105  MacroAssembler* masm                = new MacroAssembler(&buffer);
3106
3107  int frame_size_in_words;
3108
3109  OopMapSet *oop_maps = new OopMapSet();
3110  OopMap* map = NULL;
3111
3112  int start = __ offset();
3113
3114  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
3115
3116  int frame_complete = __ offset();
3117
3118  __ set_last_Java_frame(noreg, noreg, NULL);
3119
3120  __ mov(c_rarg0, r15_thread);
3121
3122  __ call(RuntimeAddress(destination));
3123
3124
3125  // Set an oopmap for the call site.
3126  // We need this not only for callee-saved registers, but also for volatile
3127  // registers that the compiler might be keeping live across a safepoint.
3128
3129  oop_maps->add_gc_map( __ offset() - start, map);
3130
3131  // rax contains the address we are going to jump to, assuming no exception was installed
3132
3133  // clear last_Java_sp
3134  __ reset_last_Java_frame(false, false);
3135  // check for pending exceptions
3136  Label pending;
3137  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3138  __ jcc(Assembler::notEqual, pending);
3139
3140  // get the returned methodOop
3141  __ movptr(rbx, Address(r15_thread, JavaThread::vm_result_offset()));
3142  __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3143
3144  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3145
3146  RegisterSaver::restore_live_registers(masm);
3147
3148  // We are back to the original state on entry and ready to go.
3149
3150  __ jmp(rax);
3151
3152  // Pending exception after the safepoint
3153
3154  __ bind(pending);
3155
3156  RegisterSaver::restore_live_registers(masm);
3157
3158  // exception pending => remove activation and forward to exception handler
3159
3160  __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), (int)NULL_WORD);
3161
3162  __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3163  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3164
3165  // -------------
3166  // make sure all code is generated
3167  masm->flush();
3168
3169  // return the blob
3170  // the frame size passed to the RuntimeStub is in words
3171  return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3172}
3173
3174
3175void SharedRuntime::generate_stubs() {
3176
3177  _wrong_method_blob = generate_resolve_blob(CAST_FROM_FN_PTR(address, SharedRuntime::handle_wrong_method),
3178                                        "wrong_method_stub");
3179  _ic_miss_blob =      generate_resolve_blob(CAST_FROM_FN_PTR(address, SharedRuntime::handle_wrong_method_ic_miss),
3180                                        "ic_miss_stub");
3181  _resolve_opt_virtual_call_blob = generate_resolve_blob(CAST_FROM_FN_PTR(address, SharedRuntime::resolve_opt_virtual_call_C),
3182                                        "resolve_opt_virtual_call");
3183
3184  _resolve_virtual_call_blob = generate_resolve_blob(CAST_FROM_FN_PTR(address, SharedRuntime::resolve_virtual_call_C),
3185                                        "resolve_virtual_call");
3186
3187  _resolve_static_call_blob = generate_resolve_blob(CAST_FROM_FN_PTR(address, SharedRuntime::resolve_static_call_C),
3188                                        "resolve_static_call");
3189  _polling_page_safepoint_handler_blob =
3190    generate_handler_blob(CAST_FROM_FN_PTR(address,
3191                   SafepointSynchronize::handle_polling_page_exception), false);
3192
3193  _polling_page_return_handler_blob =
3194    generate_handler_blob(CAST_FROM_FN_PTR(address,
3195                   SafepointSynchronize::handle_polling_page_exception), true);
3196
3197  generate_deopt_blob();
3198
3199#ifdef COMPILER2
3200  generate_uncommon_trap_blob();
3201#endif // COMPILER2
3202}
3203
3204
3205#ifdef COMPILER2
3206// This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3207//
3208//------------------------------generate_exception_blob---------------------------
3209// Creates the exception blob at the end.
3210// Compiled methods jump to this blob when an exception is thrown
3211// (see emit_exception_handler in the x86_64.ad file).
3212//
3213// Given an exception pc at a call, we call into the runtime for the
3214// handler in this method. This handler might merely restore state
3215// (i.e. callee-save registers), unwind the frame, and jump to the
3216// exception handler for the nmethod if there is no Java-level handler
3217// for the nmethod.
3218//
3219// This code is entered with a jmp.
3220//
3221// Arguments:
3222//   rax: exception oop
3223//   rdx: exception pc
3224//
3225// Results:
3226//   rax: exception oop
3227//   rdx: exception pc in caller or ???
3228//   destination: exception handler of caller
3229//
3230// Note: the exception pc MUST be at a call (precise debug information)
3231//       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
3232//
3233
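// In outline, the path through this blob is:
//
//   compiled code jumps here with rax = exception oop, rdx = exception pc
//   stash rax/rdx in the thread and call handle_exception_C(thread)
//   r8 = the returned handler address (possibly the deopt blob)
//   reload rax/rdx from the thread and clear those thread fields
//   jmp r8
//
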
3234void OptoRuntime::generate_exception_blob() {
3235  assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
3236  assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
3237  assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
3238
3239  assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3240
3241  // Allocate space for the code
3242  ResourceMark rm;
3243  // Setup code generation tools
3244  CodeBuffer buffer("exception_blob", 2048, 1024);
3245  MacroAssembler* masm = new MacroAssembler(&buffer);
3246
3247
3248  address start = __ pc();
3249
3250  // Exception pc is 'return address' for stack walker
3251  __ push(rdx);
3252  __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3253
3254  // Save callee-saved registers.  See x86_64.ad.
3255
3256  // rbp is an implicitly saved callee-saved register (i.e. the calling
3257  // convention will save/restore it in the prolog/epilog). Other than that,
3258  // there are no callee-saved registers now that adapter frames are gone.
3259
3260  __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3261
3262  // Store the exception in the Thread object. We cannot pass any arguments
3263  // to the handle_exception call, since we do not want to make any assumptions
3264  // about the size of the frame in which the exception happened.
3265  // c_rarg0 is either rdi (Linux) or rcx (Windows).
3266  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
3267  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3268
3269  // This call does all the hard work.  It checks if an exception handler
3270  // exists in the method.
3271  // If so, it returns the handler address.
3272  // If not, it prepares for stack-unwinding, restoring the callee-save
3273  // registers of the frame being removed.
3274  //
3275  // address OptoRuntime::handle_exception_C(JavaThread* thread)
3276
3277  __ set_last_Java_frame(noreg, noreg, NULL);
3278  __ mov(c_rarg0, r15_thread);
3279  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
3280
3281  // Set an oopmap for the call site.  This oopmap will only be used if we
3282  // are unwinding the stack.  Hence, all locations will be dead.
3283  // Callee-saved registers will be the same as the frame above (i.e.,
3284  // handle_exception_stub), since they were restored when we got the
3285  // exception.
3286
3287  OopMapSet* oop_maps = new OopMapSet();
3288
3289  oop_maps->add_gc_map( __ pc()-start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3290
3291  __ reset_last_Java_frame(false, false);
3292
3293  // Restore callee-saved registers
3294
3295  // rbp is an implicitly saved callee-saved register (i.e. the calling
3296  // convention will save/restore it in the prolog/epilog). Other than that,
3297  // there are no callee-saved registers now that adapter frames are gone.
3298
3299  __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
3300
3301  __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
3302  __ pop(rdx);                  // No need for exception pc anymore
3303
3304  // rax: exception handler
3305
3306  // We have a handler in rax (could be deopt blob).
3307  __ mov(r8, rax);
3308
3309  // Get the exception oop
3310  __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3311  // Get the exception pc in case we are deoptimized
3312  __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3313#ifdef ASSERT
3314  __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), (int)NULL_WORD);
3315  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int)NULL_WORD);
3316#endif
3317  // Clear the exception oop so GC no longer processes it as a root.
3318  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int)NULL_WORD);
3319
3320  // rax: exception oop
3321  // r8:  exception handler
3322  // rdx: exception pc
3323  // Jump to handler
3324
3325  __ jmp(r8);
3326
3327  // Make sure all code is generated
3328  masm->flush();
3329
3330  // Set exception blob
3331  _exception_blob = ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
3332}
3333#endif // COMPILER2
3334