1/*
2 * Copyright (c) 1997, 2017, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2012, 2017 SAP SE. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26#include "precompiled.hpp"
27#include "asm/macroAssembler.inline.hpp"
28#include "code/debugInfoRec.hpp"
29#include "code/icBuffer.hpp"
30#include "code/vtableStubs.hpp"
31#include "frame_ppc.hpp"
32#include "interpreter/interpreter.hpp"
33#include "interpreter/interp_masm.hpp"
34#include "memory/resourceArea.hpp"
35#include "oops/compiledICHolder.hpp"
36#include "runtime/sharedRuntime.hpp"
37#include "runtime/vframeArray.hpp"
38#include "vmreg_ppc.inline.hpp"
39#ifdef COMPILER1
40#include "c1/c1_Runtime1.hpp"
41#endif
42#ifdef COMPILER2
43#include "opto/ad.hpp"
44#include "opto/runtime.hpp"
45#endif
46
47#include <alloca.h>
48
49#define __ masm->
50
51#ifdef PRODUCT
52#define BLOCK_COMMENT(str) // nothing
53#else
54#define BLOCK_COMMENT(str) __ block_comment(str)
55#endif
56
57#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
58
59
60class RegisterSaver {
61 // Used for saving volatile registers.
62 public:
63
64  // Support different return pc locations.
65  enum ReturnPCLocation {
66    return_pc_is_lr,
67    return_pc_is_pre_saved,
68    return_pc_is_thread_saved_exception_pc
69  };
70
71  static OopMap* push_frame_reg_args_and_save_live_registers(MacroAssembler* masm,
72                         int* out_frame_size_in_bytes,
73                         bool generate_oop_map,
74                         int return_pc_adjustment,
75                         ReturnPCLocation return_pc_location);
76  static void    restore_live_registers_and_pop_frame(MacroAssembler* masm,
77                         int frame_size_in_bytes,
78                         bool restore_ctr);
79
80  static void push_frame_and_save_argument_registers(MacroAssembler* masm,
81                         Register r_temp,
82                         int frame_size,
83                         int total_args,
84                         const VMRegPair *regs, const VMRegPair *regs2 = NULL);
85  static void restore_argument_registers_and_pop_frame(MacroAssembler*masm,
86                         int frame_size,
87                         int total_args,
88                         const VMRegPair *regs, const VMRegPair *regs2 = NULL);
89
90  // During deoptimization only the result registers need to be restored
91  // all the other values have already been extracted.
92  static void restore_result_registers(MacroAssembler* masm, int frame_size_in_bytes);
93
94  // Constants and data structures:
95
96  typedef enum {
97    int_reg           = 0,
98    float_reg         = 1,
99    special_reg       = 2
100  } RegisterType;
101
102  typedef enum {
103    reg_size          = 8,
104    half_reg_size     = reg_size / 2,
105  } RegisterConstants;
106
107  typedef struct {
108    RegisterType        reg_type;
109    int                 reg_num;
110    VMReg               vmreg;
111  } LiveRegType;
112};
113
114
115#define RegisterSaver_LiveSpecialReg(regname) \
116  { RegisterSaver::special_reg, regname->encoding(), regname->as_VMReg() }
117
118#define RegisterSaver_LiveIntReg(regname) \
119  { RegisterSaver::int_reg,     regname->encoding(), regname->as_VMReg() }
120
121#define RegisterSaver_LiveFloatReg(regname) \
122  { RegisterSaver::float_reg,   regname->encoding(), regname->as_VMReg() }
123
124static const RegisterSaver::LiveRegType RegisterSaver_LiveRegs[] = {
125  // Live registers which get spilled to the stack. Register
126  // positions in this array correspond directly to the stack layout.
127
128  //
129  // live special registers:
130  //
131  RegisterSaver_LiveSpecialReg(SR_CTR),
132  //
133  // live float registers:
134  //
135  RegisterSaver_LiveFloatReg( F0  ),
136  RegisterSaver_LiveFloatReg( F1  ),
137  RegisterSaver_LiveFloatReg( F2  ),
138  RegisterSaver_LiveFloatReg( F3  ),
139  RegisterSaver_LiveFloatReg( F4  ),
140  RegisterSaver_LiveFloatReg( F5  ),
141  RegisterSaver_LiveFloatReg( F6  ),
142  RegisterSaver_LiveFloatReg( F7  ),
143  RegisterSaver_LiveFloatReg( F8  ),
144  RegisterSaver_LiveFloatReg( F9  ),
145  RegisterSaver_LiveFloatReg( F10 ),
146  RegisterSaver_LiveFloatReg( F11 ),
147  RegisterSaver_LiveFloatReg( F12 ),
148  RegisterSaver_LiveFloatReg( F13 ),
149  RegisterSaver_LiveFloatReg( F14 ),
150  RegisterSaver_LiveFloatReg( F15 ),
151  RegisterSaver_LiveFloatReg( F16 ),
152  RegisterSaver_LiveFloatReg( F17 ),
153  RegisterSaver_LiveFloatReg( F18 ),
154  RegisterSaver_LiveFloatReg( F19 ),
155  RegisterSaver_LiveFloatReg( F20 ),
156  RegisterSaver_LiveFloatReg( F21 ),
157  RegisterSaver_LiveFloatReg( F22 ),
158  RegisterSaver_LiveFloatReg( F23 ),
159  RegisterSaver_LiveFloatReg( F24 ),
160  RegisterSaver_LiveFloatReg( F25 ),
161  RegisterSaver_LiveFloatReg( F26 ),
162  RegisterSaver_LiveFloatReg( F27 ),
163  RegisterSaver_LiveFloatReg( F28 ),
164  RegisterSaver_LiveFloatReg( F29 ),
165  RegisterSaver_LiveFloatReg( F30 ),
166  RegisterSaver_LiveFloatReg( F31 ),
167  //
168  // live integer registers:
169  //
170  RegisterSaver_LiveIntReg(   R0  ),
171  //RegisterSaver_LiveIntReg( R1  ), // stack pointer
172  RegisterSaver_LiveIntReg(   R2  ),
173  RegisterSaver_LiveIntReg(   R3  ),
174  RegisterSaver_LiveIntReg(   R4  ),
175  RegisterSaver_LiveIntReg(   R5  ),
176  RegisterSaver_LiveIntReg(   R6  ),
177  RegisterSaver_LiveIntReg(   R7  ),
178  RegisterSaver_LiveIntReg(   R8  ),
179  RegisterSaver_LiveIntReg(   R9  ),
180  RegisterSaver_LiveIntReg(   R10 ),
181  RegisterSaver_LiveIntReg(   R11 ),
182  RegisterSaver_LiveIntReg(   R12 ),
183  //RegisterSaver_LiveIntReg( R13 ), // system thread id
184  RegisterSaver_LiveIntReg(   R14 ),
185  RegisterSaver_LiveIntReg(   R15 ),
186  RegisterSaver_LiveIntReg(   R16 ),
187  RegisterSaver_LiveIntReg(   R17 ),
188  RegisterSaver_LiveIntReg(   R18 ),
189  RegisterSaver_LiveIntReg(   R19 ),
190  RegisterSaver_LiveIntReg(   R20 ),
191  RegisterSaver_LiveIntReg(   R21 ),
192  RegisterSaver_LiveIntReg(   R22 ),
193  RegisterSaver_LiveIntReg(   R23 ),
194  RegisterSaver_LiveIntReg(   R24 ),
195  RegisterSaver_LiveIntReg(   R25 ),
196  RegisterSaver_LiveIntReg(   R26 ),
197  RegisterSaver_LiveIntReg(   R27 ),
198  RegisterSaver_LiveIntReg(   R28 ),
199  RegisterSaver_LiveIntReg(   R29 ),
200  RegisterSaver_LiveIntReg(   R30 ),
201  RegisterSaver_LiveIntReg(   R31 ), // must be the last register (see save/restore functions below)
202};
203
204OopMap* RegisterSaver::push_frame_reg_args_and_save_live_registers(MacroAssembler* masm,
205                         int* out_frame_size_in_bytes,
206                         bool generate_oop_map,
207                         int return_pc_adjustment,
208                         ReturnPCLocation return_pc_location) {
209  // Push an abi_reg_args-frame and store all registers which may be live.
210  // If requested, create an OopMap: Record volatile registers as
211  // callee-save values in an OopMap so their save locations will be
212  // propagated to the RegisterMap of the caller frame during
213  // StackFrameStream construction (needed for deoptimization; see
214  // compiledVFrame::create_stack_value).
215  // If return_pc_adjustment != 0 adjust the return pc by return_pc_adjustment.
216
217  int i;
218  int offset;
219
220  // calcualte frame size
221  const int regstosave_num       = sizeof(RegisterSaver_LiveRegs) /
222                                   sizeof(RegisterSaver::LiveRegType);
223  const int register_save_size   = regstosave_num * reg_size;
224  const int frame_size_in_bytes  = round_to(register_save_size, frame::alignment_in_bytes)
225                                   + frame::abi_reg_args_size;
226  *out_frame_size_in_bytes       = frame_size_in_bytes;
227  const int frame_size_in_slots  = frame_size_in_bytes / sizeof(jint);
228  const int register_save_offset = frame_size_in_bytes - register_save_size;
229
230  // OopMap frame size is in c2 stack slots (sizeof(jint)) not bytes or words.
231  OopMap* map = generate_oop_map ? new OopMap(frame_size_in_slots, 0) : NULL;
232
233  BLOCK_COMMENT("push_frame_reg_args_and_save_live_registers {");
234
235  // Save r31 in the last slot of the not yet pushed frame so that we
236  // can use it as scratch reg.
237  __ std(R31, -reg_size, R1_SP);
238  assert(-reg_size == register_save_offset - frame_size_in_bytes + ((regstosave_num-1)*reg_size),
239         "consistency check");
240
241  // save the flags
242  // Do the save_LR_CR by hand and adjust the return pc if requested.
243  __ mfcr(R31);
244  __ std(R31, _abi(cr), R1_SP);
245  switch (return_pc_location) {
246    case return_pc_is_lr: __ mflr(R31); break;
247    case return_pc_is_pre_saved: assert(return_pc_adjustment == 0, "unsupported"); break;
248    case return_pc_is_thread_saved_exception_pc: __ ld(R31, thread_(saved_exception_pc)); break;
249    default: ShouldNotReachHere();
250  }
251  if (return_pc_location != return_pc_is_pre_saved) {
252    if (return_pc_adjustment != 0) {
253      __ addi(R31, R31, return_pc_adjustment);
254    }
255    __ std(R31, _abi(lr), R1_SP);
256  }
257
258  // push a new frame
259  __ push_frame(frame_size_in_bytes, R31);
260
261  // save all registers (ints and floats)
262  offset = register_save_offset;
263  for (int i = 0; i < regstosave_num; i++) {
264    int reg_num  = RegisterSaver_LiveRegs[i].reg_num;
265    int reg_type = RegisterSaver_LiveRegs[i].reg_type;
266
267    switch (reg_type) {
268      case RegisterSaver::int_reg: {
269        if (reg_num != 31) { // We spilled R31 right at the beginning.
270          __ std(as_Register(reg_num), offset, R1_SP);
271        }
272        break;
273      }
274      case RegisterSaver::float_reg: {
275        __ stfd(as_FloatRegister(reg_num), offset, R1_SP);
276        break;
277      }
278      case RegisterSaver::special_reg: {
279        if (reg_num == SR_CTR_SpecialRegisterEnumValue) {
280          __ mfctr(R31);
281          __ std(R31, offset, R1_SP);
282        } else {
283          Unimplemented();
284        }
285        break;
286      }
287      default:
288        ShouldNotReachHere();
289    }
290
291    if (generate_oop_map) {
292      map->set_callee_saved(VMRegImpl::stack2reg(offset>>2),
293                            RegisterSaver_LiveRegs[i].vmreg);
294      map->set_callee_saved(VMRegImpl::stack2reg((offset + half_reg_size)>>2),
295                            RegisterSaver_LiveRegs[i].vmreg->next());
296    }
297    offset += reg_size;
298  }
299
300  BLOCK_COMMENT("} push_frame_reg_args_and_save_live_registers");
301
302  // And we're done.
303  return map;
304}
305
306
307// Pop the current frame and restore all the registers that we
308// saved.
309void RegisterSaver::restore_live_registers_and_pop_frame(MacroAssembler* masm,
310                                                         int frame_size_in_bytes,
311                                                         bool restore_ctr) {
312  int i;
313  int offset;
314  const int regstosave_num       = sizeof(RegisterSaver_LiveRegs) /
315                                   sizeof(RegisterSaver::LiveRegType);
316  const int register_save_size   = regstosave_num * reg_size;
317  const int register_save_offset = frame_size_in_bytes - register_save_size;
318
319  BLOCK_COMMENT("restore_live_registers_and_pop_frame {");
320
321  // restore all registers (ints and floats)
322  offset = register_save_offset;
323  for (int i = 0; i < regstosave_num; i++) {
324    int reg_num  = RegisterSaver_LiveRegs[i].reg_num;
325    int reg_type = RegisterSaver_LiveRegs[i].reg_type;
326
327    switch (reg_type) {
328      case RegisterSaver::int_reg: {
329        if (reg_num != 31) // R31 restored at the end, it's the tmp reg!
330          __ ld(as_Register(reg_num), offset, R1_SP);
331        break;
332      }
333      case RegisterSaver::float_reg: {
334        __ lfd(as_FloatRegister(reg_num), offset, R1_SP);
335        break;
336      }
337      case RegisterSaver::special_reg: {
338        if (reg_num == SR_CTR_SpecialRegisterEnumValue) {
339          if (restore_ctr) { // Nothing to do here if ctr already contains the next address.
340            __ ld(R31, offset, R1_SP);
341            __ mtctr(R31);
342          }
343        } else {
344          Unimplemented();
345        }
346        break;
347      }
348      default:
349        ShouldNotReachHere();
350    }
351    offset += reg_size;
352  }
353
354  // pop the frame
355  __ pop_frame();
356
357  // restore the flags
358  __ restore_LR_CR(R31);
359
360  // restore scratch register's value
361  __ ld(R31, -reg_size, R1_SP);
362
363  BLOCK_COMMENT("} restore_live_registers_and_pop_frame");
364}
365
366void RegisterSaver::push_frame_and_save_argument_registers(MacroAssembler* masm, Register r_temp,
367                                                           int frame_size,int total_args, const VMRegPair *regs,
368                                                           const VMRegPair *regs2) {
369  __ push_frame(frame_size, r_temp);
370  int st_off = frame_size - wordSize;
371  for (int i = 0; i < total_args; i++) {
372    VMReg r_1 = regs[i].first();
373    VMReg r_2 = regs[i].second();
374    if (!r_1->is_valid()) {
375      assert(!r_2->is_valid(), "");
376      continue;
377    }
378    if (r_1->is_Register()) {
379      Register r = r_1->as_Register();
380      __ std(r, st_off, R1_SP);
381      st_off -= wordSize;
382    } else if (r_1->is_FloatRegister()) {
383      FloatRegister f = r_1->as_FloatRegister();
384      __ stfd(f, st_off, R1_SP);
385      st_off -= wordSize;
386    }
387  }
388  if (regs2 != NULL) {
389    for (int i = 0; i < total_args; i++) {
390      VMReg r_1 = regs2[i].first();
391      VMReg r_2 = regs2[i].second();
392      if (!r_1->is_valid()) {
393        assert(!r_2->is_valid(), "");
394        continue;
395      }
396      if (r_1->is_Register()) {
397        Register r = r_1->as_Register();
398        __ std(r, st_off, R1_SP);
399        st_off -= wordSize;
400      } else if (r_1->is_FloatRegister()) {
401        FloatRegister f = r_1->as_FloatRegister();
402        __ stfd(f, st_off, R1_SP);
403        st_off -= wordSize;
404      }
405    }
406  }
407}
408
409void RegisterSaver::restore_argument_registers_and_pop_frame(MacroAssembler*masm, int frame_size,
410                                                             int total_args, const VMRegPair *regs,
411                                                             const VMRegPair *regs2) {
412  int st_off = frame_size - wordSize;
413  for (int i = 0; i < total_args; i++) {
414    VMReg r_1 = regs[i].first();
415    VMReg r_2 = regs[i].second();
416    if (r_1->is_Register()) {
417      Register r = r_1->as_Register();
418      __ ld(r, st_off, R1_SP);
419      st_off -= wordSize;
420    } else if (r_1->is_FloatRegister()) {
421      FloatRegister f = r_1->as_FloatRegister();
422      __ lfd(f, st_off, R1_SP);
423      st_off -= wordSize;
424    }
425  }
426  if (regs2 != NULL)
427    for (int i = 0; i < total_args; i++) {
428      VMReg r_1 = regs2[i].first();
429      VMReg r_2 = regs2[i].second();
430      if (r_1->is_Register()) {
431        Register r = r_1->as_Register();
432        __ ld(r, st_off, R1_SP);
433        st_off -= wordSize;
434      } else if (r_1->is_FloatRegister()) {
435        FloatRegister f = r_1->as_FloatRegister();
436        __ lfd(f, st_off, R1_SP);
437        st_off -= wordSize;
438      }
439    }
440  __ pop_frame();
441}
442
443// Restore the registers that might be holding a result.
444void RegisterSaver::restore_result_registers(MacroAssembler* masm, int frame_size_in_bytes) {
445  int i;
446  int offset;
447  const int regstosave_num       = sizeof(RegisterSaver_LiveRegs) /
448                                   sizeof(RegisterSaver::LiveRegType);
449  const int register_save_size   = regstosave_num * reg_size;
450  const int register_save_offset = frame_size_in_bytes - register_save_size;
451
452  // restore all result registers (ints and floats)
453  offset = register_save_offset;
454  for (int i = 0; i < regstosave_num; i++) {
455    int reg_num  = RegisterSaver_LiveRegs[i].reg_num;
456    int reg_type = RegisterSaver_LiveRegs[i].reg_type;
457    switch (reg_type) {
458      case RegisterSaver::int_reg: {
459        if (as_Register(reg_num)==R3_RET) // int result_reg
460          __ ld(as_Register(reg_num), offset, R1_SP);
461        break;
462      }
463      case RegisterSaver::float_reg: {
464        if (as_FloatRegister(reg_num)==F1_RET) // float result_reg
465          __ lfd(as_FloatRegister(reg_num), offset, R1_SP);
466        break;
467      }
468      case RegisterSaver::special_reg: {
469        // Special registers don't hold a result.
470        break;
471      }
472      default:
473        ShouldNotReachHere();
474    }
475    offset += reg_size;
476  }
477}
478
479// Is vector's size (in bytes) bigger than a size saved by default?
480bool SharedRuntime::is_wide_vector(int size) {
481  // Note, MaxVectorSize == 8 on PPC64.
482  assert(size <= 8, "%d bytes vectors are not supported", size);
483  return size > 8;
484}
485
486size_t SharedRuntime::trampoline_size() {
487  return Assembler::load_const_size + 8;
488}
489
490void SharedRuntime::generate_trampoline(MacroAssembler *masm, address destination) {
491  Register Rtemp = R12;
492  __ load_const(Rtemp, destination);
493  __ mtctr(Rtemp);
494  __ bctr();
495}
496
497#ifdef COMPILER2
498static int reg2slot(VMReg r) {
499  return r->reg2stack() + SharedRuntime::out_preserve_stack_slots();
500}
501
502static int reg2offset(VMReg r) {
503  return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
504}
505#endif
506
507// ---------------------------------------------------------------------------
508// Read the array of BasicTypes from a signature, and compute where the
509// arguments should go. Values in the VMRegPair regs array refer to 4-byte
510// quantities. Values less than VMRegImpl::stack0 are registers, those above
511// refer to 4-byte stack slots. All stack slots are based off of the stack pointer
512// as framesizes are fixed.
513// VMRegImpl::stack0 refers to the first slot 0(sp).
514// and VMRegImpl::stack0+1 refers to the memory word 4-bytes higher. Register
515// up to RegisterImpl::number_of_registers) are the 64-bit
516// integer registers.
517
518// Note: the INPUTS in sig_bt are in units of Java argument words, which are
519// either 32-bit or 64-bit depending on the build. The OUTPUTS are in 32-bit
520// units regardless of build. Of course for i486 there is no 64 bit build
521
522// The Java calling convention is a "shifted" version of the C ABI.
523// By skipping the first C ABI register we can call non-static jni methods
524// with small numbers of arguments without having to shuffle the arguments
525// at all. Since we control the java ABI we ought to at least get some
526// advantage out of it.
527
528const VMReg java_iarg_reg[8] = {
529  R3->as_VMReg(),
530  R4->as_VMReg(),
531  R5->as_VMReg(),
532  R6->as_VMReg(),
533  R7->as_VMReg(),
534  R8->as_VMReg(),
535  R9->as_VMReg(),
536  R10->as_VMReg()
537};
538
539const VMReg java_farg_reg[13] = {
540  F1->as_VMReg(),
541  F2->as_VMReg(),
542  F3->as_VMReg(),
543  F4->as_VMReg(),
544  F5->as_VMReg(),
545  F6->as_VMReg(),
546  F7->as_VMReg(),
547  F8->as_VMReg(),
548  F9->as_VMReg(),
549  F10->as_VMReg(),
550  F11->as_VMReg(),
551  F12->as_VMReg(),
552  F13->as_VMReg()
553};
554
555const int num_java_iarg_registers = sizeof(java_iarg_reg) / sizeof(java_iarg_reg[0]);
556const int num_java_farg_registers = sizeof(java_farg_reg) / sizeof(java_farg_reg[0]);
557
558int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
559                                           VMRegPair *regs,
560                                           int total_args_passed,
561                                           int is_outgoing) {
562  // C2c calling conventions for compiled-compiled calls.
563  // Put 8 ints/longs into registers _AND_ 13 float/doubles into
564  // registers _AND_ put the rest on the stack.
565
566  const int inc_stk_for_intfloat   = 1; // 1 slots for ints and floats
567  const int inc_stk_for_longdouble = 2; // 2 slots for longs and doubles
568
569  int i;
570  VMReg reg;
571  int stk = 0;
572  int ireg = 0;
573  int freg = 0;
574
575  // We put the first 8 arguments into registers and the rest on the
576  // stack, float arguments are already in their argument registers
577  // due to c2c calling conventions (see calling_convention).
578  for (int i = 0; i < total_args_passed; ++i) {
579    switch(sig_bt[i]) {
580    case T_BOOLEAN:
581    case T_CHAR:
582    case T_BYTE:
583    case T_SHORT:
584    case T_INT:
585      if (ireg < num_java_iarg_registers) {
586        // Put int/ptr in register
587        reg = java_iarg_reg[ireg];
588        ++ireg;
589      } else {
590        // Put int/ptr on stack.
591        reg = VMRegImpl::stack2reg(stk);
592        stk += inc_stk_for_intfloat;
593      }
594      regs[i].set1(reg);
595      break;
596    case T_LONG:
597      assert((i + 1) < total_args_passed && sig_bt[i+1] == T_VOID, "expecting half");
598      if (ireg < num_java_iarg_registers) {
599        // Put long in register.
600        reg = java_iarg_reg[ireg];
601        ++ireg;
602      } else {
603        // Put long on stack. They must be aligned to 2 slots.
604        if (stk & 0x1) ++stk;
605        reg = VMRegImpl::stack2reg(stk);
606        stk += inc_stk_for_longdouble;
607      }
608      regs[i].set2(reg);
609      break;
610    case T_OBJECT:
611    case T_ARRAY:
612    case T_ADDRESS:
613      if (ireg < num_java_iarg_registers) {
614        // Put ptr in register.
615        reg = java_iarg_reg[ireg];
616        ++ireg;
617      } else {
618        // Put ptr on stack. Objects must be aligned to 2 slots too,
619        // because "64-bit pointers record oop-ishness on 2 aligned
620        // adjacent registers." (see OopFlow::build_oop_map).
621        if (stk & 0x1) ++stk;
622        reg = VMRegImpl::stack2reg(stk);
623        stk += inc_stk_for_longdouble;
624      }
625      regs[i].set2(reg);
626      break;
627    case T_FLOAT:
628      if (freg < num_java_farg_registers) {
629        // Put float in register.
630        reg = java_farg_reg[freg];
631        ++freg;
632      } else {
633        // Put float on stack.
634        reg = VMRegImpl::stack2reg(stk);
635        stk += inc_stk_for_intfloat;
636      }
637      regs[i].set1(reg);
638      break;
639    case T_DOUBLE:
640      assert((i + 1) < total_args_passed && sig_bt[i+1] == T_VOID, "expecting half");
641      if (freg < num_java_farg_registers) {
642        // Put double in register.
643        reg = java_farg_reg[freg];
644        ++freg;
645      } else {
646        // Put double on stack. They must be aligned to 2 slots.
647        if (stk & 0x1) ++stk;
648        reg = VMRegImpl::stack2reg(stk);
649        stk += inc_stk_for_longdouble;
650      }
651      regs[i].set2(reg);
652      break;
653    case T_VOID:
654      // Do not count halves.
655      regs[i].set_bad();
656      break;
657    default:
658      ShouldNotReachHere();
659    }
660  }
661  return round_to(stk, 2);
662}
663
664#if defined(COMPILER1) || defined(COMPILER2)
665// Calling convention for calling C code.
666int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
667                                        VMRegPair *regs,
668                                        VMRegPair *regs2,
669                                        int total_args_passed) {
670  // Calling conventions for C runtime calls and calls to JNI native methods.
671  //
672  // PPC64 convention: Hoist the first 8 int/ptr/long's in the first 8
673  // int regs, leaving int regs undefined if the arg is flt/dbl. Hoist
674  // the first 13 flt/dbl's in the first 13 fp regs but additionally
675  // copy flt/dbl to the stack if they are beyond the 8th argument.
676
677  const VMReg iarg_reg[8] = {
678    R3->as_VMReg(),
679    R4->as_VMReg(),
680    R5->as_VMReg(),
681    R6->as_VMReg(),
682    R7->as_VMReg(),
683    R8->as_VMReg(),
684    R9->as_VMReg(),
685    R10->as_VMReg()
686  };
687
688  const VMReg farg_reg[13] = {
689    F1->as_VMReg(),
690    F2->as_VMReg(),
691    F3->as_VMReg(),
692    F4->as_VMReg(),
693    F5->as_VMReg(),
694    F6->as_VMReg(),
695    F7->as_VMReg(),
696    F8->as_VMReg(),
697    F9->as_VMReg(),
698    F10->as_VMReg(),
699    F11->as_VMReg(),
700    F12->as_VMReg(),
701    F13->as_VMReg()
702  };
703
704  // Check calling conventions consistency.
705  assert(sizeof(iarg_reg) / sizeof(iarg_reg[0]) == Argument::n_int_register_parameters_c &&
706         sizeof(farg_reg) / sizeof(farg_reg[0]) == Argument::n_float_register_parameters_c,
707         "consistency");
708
709  // `Stk' counts stack slots. Due to alignment, 32 bit values occupy
710  // 2 such slots, like 64 bit values do.
711  const int inc_stk_for_intfloat   = 2; // 2 slots for ints and floats
712  const int inc_stk_for_longdouble = 2; // 2 slots for longs and doubles
713
714  int i;
715  VMReg reg;
716  // Leave room for C-compatible ABI_REG_ARGS.
717  int stk = (frame::abi_reg_args_size - frame::jit_out_preserve_size) / VMRegImpl::stack_slot_size;
718  int arg = 0;
719  int freg = 0;
720
721  // Avoid passing C arguments in the wrong stack slots.
722#if defined(ABI_ELFv2)
723  assert((SharedRuntime::out_preserve_stack_slots() + stk) * VMRegImpl::stack_slot_size == 96,
724         "passing C arguments in wrong stack slots");
725#else
726  assert((SharedRuntime::out_preserve_stack_slots() + stk) * VMRegImpl::stack_slot_size == 112,
727         "passing C arguments in wrong stack slots");
728#endif
729  // We fill-out regs AND regs2 if an argument must be passed in a
730  // register AND in a stack slot. If regs2 is NULL in such a
731  // situation, we bail-out with a fatal error.
732  for (int i = 0; i < total_args_passed; ++i, ++arg) {
733    // Initialize regs2 to BAD.
734    if (regs2 != NULL) regs2[i].set_bad();
735
736    switch(sig_bt[i]) {
737
738    //
739    // If arguments 0-7 are integers, they are passed in integer registers.
740    // Argument i is placed in iarg_reg[i].
741    //
742    case T_BOOLEAN:
743    case T_CHAR:
744    case T_BYTE:
745    case T_SHORT:
746    case T_INT:
747      // We must cast ints to longs and use full 64 bit stack slots
748      // here.  Thus fall through, handle as long.
749    case T_LONG:
750    case T_OBJECT:
751    case T_ARRAY:
752    case T_ADDRESS:
753    case T_METADATA:
754      // Oops are already boxed if required (JNI).
755      if (arg < Argument::n_int_register_parameters_c) {
756        reg = iarg_reg[arg];
757      } else {
758        reg = VMRegImpl::stack2reg(stk);
759        stk += inc_stk_for_longdouble;
760      }
761      regs[i].set2(reg);
762      break;
763
764    //
765    // Floats are treated differently from int regs:  The first 13 float arguments
766    // are passed in registers (not the float args among the first 13 args).
767    // Thus argument i is NOT passed in farg_reg[i] if it is float.  It is passed
768    // in farg_reg[j] if argument i is the j-th float argument of this call.
769    //
770    case T_FLOAT:
771#if defined(LINUX)
772      // Linux uses ELF ABI. Both original ELF and ELFv2 ABIs have float
773      // in the least significant word of an argument slot.
774#if defined(VM_LITTLE_ENDIAN)
775#define FLOAT_WORD_OFFSET_IN_SLOT 0
776#else
777#define FLOAT_WORD_OFFSET_IN_SLOT 1
778#endif
779#elif defined(AIX)
780      // Although AIX runs on big endian CPU, float is in the most
781      // significant word of an argument slot.
782#define FLOAT_WORD_OFFSET_IN_SLOT 0
783#else
784#error "unknown OS"
785#endif
786      if (freg < Argument::n_float_register_parameters_c) {
787        // Put float in register ...
788        reg = farg_reg[freg];
789        ++freg;
790
791        // Argument i for i > 8 is placed on the stack even if it's
792        // placed in a register (if it's a float arg). Aix disassembly
793        // shows that xlC places these float args on the stack AND in
794        // a register. This is not documented, but we follow this
795        // convention, too.
796        if (arg >= Argument::n_regs_not_on_stack_c) {
797          // ... and on the stack.
798          guarantee(regs2 != NULL, "must pass float in register and stack slot");
799          VMReg reg2 = VMRegImpl::stack2reg(stk + FLOAT_WORD_OFFSET_IN_SLOT);
800          regs2[i].set1(reg2);
801          stk += inc_stk_for_intfloat;
802        }
803
804      } else {
805        // Put float on stack.
806        reg = VMRegImpl::stack2reg(stk + FLOAT_WORD_OFFSET_IN_SLOT);
807        stk += inc_stk_for_intfloat;
808      }
809      regs[i].set1(reg);
810      break;
811    case T_DOUBLE:
812      assert((i + 1) < total_args_passed && sig_bt[i+1] == T_VOID, "expecting half");
813      if (freg < Argument::n_float_register_parameters_c) {
814        // Put double in register ...
815        reg = farg_reg[freg];
816        ++freg;
817
818        // Argument i for i > 8 is placed on the stack even if it's
819        // placed in a register (if it's a double arg). Aix disassembly
820        // shows that xlC places these float args on the stack AND in
821        // a register. This is not documented, but we follow this
822        // convention, too.
823        if (arg >= Argument::n_regs_not_on_stack_c) {
824          // ... and on the stack.
825          guarantee(regs2 != NULL, "must pass float in register and stack slot");
826          VMReg reg2 = VMRegImpl::stack2reg(stk);
827          regs2[i].set2(reg2);
828          stk += inc_stk_for_longdouble;
829        }
830      } else {
831        // Put double on stack.
832        reg = VMRegImpl::stack2reg(stk);
833        stk += inc_stk_for_longdouble;
834      }
835      regs[i].set2(reg);
836      break;
837
838    case T_VOID:
839      // Do not count halves.
840      regs[i].set_bad();
841      --arg;
842      break;
843    default:
844      ShouldNotReachHere();
845    }
846  }
847
848  return round_to(stk, 2);
849}
850#endif // COMPILER2
851
852static address gen_c2i_adapter(MacroAssembler *masm,
853                            int total_args_passed,
854                            int comp_args_on_stack,
855                            const BasicType *sig_bt,
856                            const VMRegPair *regs,
857                            Label& call_interpreter,
858                            const Register& ientry) {
859
860  address c2i_entrypoint;
861
862  const Register sender_SP = R21_sender_SP; // == R21_tmp1
863  const Register code      = R22_tmp2;
864  //const Register ientry  = R23_tmp3;
865  const Register value_regs[] = { R24_tmp4, R25_tmp5, R26_tmp6 };
866  const int num_value_regs = sizeof(value_regs) / sizeof(Register);
867  int value_regs_index = 0;
868
869  const Register return_pc = R27_tmp7;
870  const Register tmp       = R28_tmp8;
871
872  assert_different_registers(sender_SP, code, ientry, return_pc, tmp);
873
874  // Adapter needs TOP_IJAVA_FRAME_ABI.
875  const int adapter_size = frame::top_ijava_frame_abi_size +
876                           round_to(total_args_passed * wordSize, frame::alignment_in_bytes);
877
878  // regular (verified) c2i entry point
879  c2i_entrypoint = __ pc();
880
881  // Does compiled code exists? If yes, patch the caller's callsite.
882  __ ld(code, method_(code));
883  __ cmpdi(CCR0, code, 0);
884  __ ld(ientry, method_(interpreter_entry)); // preloaded
885  __ beq(CCR0, call_interpreter);
886
887
888  // Patch caller's callsite, method_(code) was not NULL which means that
889  // compiled code exists.
890  __ mflr(return_pc);
891  __ std(return_pc, _abi(lr), R1_SP);
892  RegisterSaver::push_frame_and_save_argument_registers(masm, tmp, adapter_size, total_args_passed, regs);
893
894  __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite), R19_method, return_pc);
895
896  RegisterSaver::restore_argument_registers_and_pop_frame(masm, adapter_size, total_args_passed, regs);
897  __ ld(return_pc, _abi(lr), R1_SP);
898  __ ld(ientry, method_(interpreter_entry)); // preloaded
899  __ mtlr(return_pc);
900
901
902  // Call the interpreter.
903  __ BIND(call_interpreter);
904  __ mtctr(ientry);
905
906  // Get a copy of the current SP for loading caller's arguments.
907  __ mr(sender_SP, R1_SP);
908
909  // Add space for the adapter.
910  __ resize_frame(-adapter_size, R12_scratch2);
911
912  int st_off = adapter_size - wordSize;
913
914  // Write the args into the outgoing interpreter space.
915  for (int i = 0; i < total_args_passed; i++) {
916    VMReg r_1 = regs[i].first();
917    VMReg r_2 = regs[i].second();
918    if (!r_1->is_valid()) {
919      assert(!r_2->is_valid(), "");
920      continue;
921    }
922    if (r_1->is_stack()) {
923      Register tmp_reg = value_regs[value_regs_index];
924      value_regs_index = (value_regs_index + 1) % num_value_regs;
925      // The calling convention produces OptoRegs that ignore the out
926      // preserve area (JIT's ABI). We must account for it here.
927      int ld_off = (r_1->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
928      if (!r_2->is_valid()) {
929        __ lwz(tmp_reg, ld_off, sender_SP);
930      } else {
931        __ ld(tmp_reg, ld_off, sender_SP);
932      }
933      // Pretend stack targets were loaded into tmp_reg.
934      r_1 = tmp_reg->as_VMReg();
935    }
936
937    if (r_1->is_Register()) {
938      Register r = r_1->as_Register();
939      if (!r_2->is_valid()) {
940        __ stw(r, st_off, R1_SP);
941        st_off-=wordSize;
942      } else {
943        // Longs are given 2 64-bit slots in the interpreter, but the
944        // data is passed in only 1 slot.
945        if (sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
946          DEBUG_ONLY( __ li(tmp, 0); __ std(tmp, st_off, R1_SP); )
947          st_off-=wordSize;
948        }
949        __ std(r, st_off, R1_SP);
950        st_off-=wordSize;
951      }
952    } else {
953      assert(r_1->is_FloatRegister(), "");
954      FloatRegister f = r_1->as_FloatRegister();
955      if (!r_2->is_valid()) {
956        __ stfs(f, st_off, R1_SP);
957        st_off-=wordSize;
958      } else {
959        // In 64bit, doubles are given 2 64-bit slots in the interpreter, but the
960        // data is passed in only 1 slot.
961        // One of these should get known junk...
962        DEBUG_ONLY( __ li(tmp, 0); __ std(tmp, st_off, R1_SP); )
963        st_off-=wordSize;
964        __ stfd(f, st_off, R1_SP);
965        st_off-=wordSize;
966      }
967    }
968  }
969
970  // Jump to the interpreter just as if interpreter was doing it.
971
972  __ load_const_optimized(R25_templateTableBase, (address)Interpreter::dispatch_table((TosState)0), R11_scratch1);
973
974  // load TOS
975  __ addi(R15_esp, R1_SP, st_off);
976
977  // Frame_manager expects initial_caller_sp (= SP without resize by c2i) in R21_tmp1.
978  assert(sender_SP == R21_sender_SP, "passing initial caller's SP in wrong register");
979  __ bctr();
980
981  return c2i_entrypoint;
982}
983
984void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
985                                    int total_args_passed,
986                                    int comp_args_on_stack,
987                                    const BasicType *sig_bt,
988                                    const VMRegPair *regs) {
989
990  // Load method's entry-point from method.
991  __ ld(R12_scratch2, in_bytes(Method::from_compiled_offset()), R19_method);
992  __ mtctr(R12_scratch2);
993
994  // We will only enter here from an interpreted frame and never from after
995  // passing thru a c2i. Azul allowed this but we do not. If we lose the
996  // race and use a c2i we will remain interpreted for the race loser(s).
997  // This removes all sorts of headaches on the x86 side and also eliminates
998  // the possibility of having c2i -> i2c -> c2i -> ... endless transitions.
999
1000  // Note: r13 contains the senderSP on entry. We must preserve it since
1001  // we may do a i2c -> c2i transition if we lose a race where compiled
1002  // code goes non-entrant while we get args ready.
1003  // In addition we use r13 to locate all the interpreter args as
1004  // we must align the stack to 16 bytes on an i2c entry else we
1005  // lose alignment we expect in all compiled code and register
1006  // save code can segv when fxsave instructions find improperly
1007  // aligned stack pointer.
1008
1009  const Register ld_ptr = R15_esp;
1010  const Register value_regs[] = { R22_tmp2, R23_tmp3, R24_tmp4, R25_tmp5, R26_tmp6 };
1011  const int num_value_regs = sizeof(value_regs) / sizeof(Register);
1012  int value_regs_index = 0;
1013
1014  int ld_offset = total_args_passed*wordSize;
1015
1016  // Cut-out for having no stack args. Since up to 2 int/oop args are passed
1017  // in registers, we will occasionally have no stack args.
1018  int comp_words_on_stack = 0;
1019  if (comp_args_on_stack) {
1020    // Sig words on the stack are greater-than VMRegImpl::stack0. Those in
1021    // registers are below. By subtracting stack0, we either get a negative
1022    // number (all values in registers) or the maximum stack slot accessed.
1023
1024    // Convert 4-byte c2 stack slots to words.
1025    comp_words_on_stack = round_to(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
1026    // Round up to miminum stack alignment, in wordSize.
1027    comp_words_on_stack = round_to(comp_words_on_stack, 2);
1028    __ resize_frame(-comp_words_on_stack * wordSize, R11_scratch1);
1029  }
1030
1031  // Now generate the shuffle code.  Pick up all register args and move the
1032  // rest through register value=Z_R12.
1033  BLOCK_COMMENT("Shuffle arguments");
1034  for (int i = 0; i < total_args_passed; i++) {
1035    if (sig_bt[i] == T_VOID) {
1036      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
1037      continue;
1038    }
1039
1040    // Pick up 0, 1 or 2 words from ld_ptr.
1041    assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
1042            "scrambled load targets?");
1043    VMReg r_1 = regs[i].first();
1044    VMReg r_2 = regs[i].second();
1045    if (!r_1->is_valid()) {
1046      assert(!r_2->is_valid(), "");
1047      continue;
1048    }
1049    if (r_1->is_FloatRegister()) {
1050      if (!r_2->is_valid()) {
1051        __ lfs(r_1->as_FloatRegister(), ld_offset, ld_ptr);
1052        ld_offset-=wordSize;
1053      } else {
1054        // Skip the unused interpreter slot.
1055        __ lfd(r_1->as_FloatRegister(), ld_offset-wordSize, ld_ptr);
1056        ld_offset-=2*wordSize;
1057      }
1058    } else {
1059      Register r;
1060      if (r_1->is_stack()) {
1061        // Must do a memory to memory move thru "value".
1062        r = value_regs[value_regs_index];
1063        value_regs_index = (value_regs_index + 1) % num_value_regs;
1064      } else {
1065        r = r_1->as_Register();
1066      }
1067      if (!r_2->is_valid()) {
1068        // Not sure we need to do this but it shouldn't hurt.
1069        if (sig_bt[i] == T_OBJECT || sig_bt[i] == T_ADDRESS || sig_bt[i] == T_ARRAY) {
1070          __ ld(r, ld_offset, ld_ptr);
1071          ld_offset-=wordSize;
1072        } else {
1073          __ lwz(r, ld_offset, ld_ptr);
1074          ld_offset-=wordSize;
1075        }
1076      } else {
1077        // In 64bit, longs are given 2 64-bit slots in the interpreter, but the
1078        // data is passed in only 1 slot.
1079        if (sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
1080          ld_offset-=wordSize;
1081        }
1082        __ ld(r, ld_offset, ld_ptr);
1083        ld_offset-=wordSize;
1084      }
1085
1086      if (r_1->is_stack()) {
1087        // Now store value where the compiler expects it
1088        int st_off = (r_1->reg2stack() + SharedRuntime::out_preserve_stack_slots())*VMRegImpl::stack_slot_size;
1089
1090        if (sig_bt[i] == T_INT   || sig_bt[i] == T_FLOAT ||sig_bt[i] == T_BOOLEAN ||
1091            sig_bt[i] == T_SHORT || sig_bt[i] == T_CHAR  || sig_bt[i] == T_BYTE) {
1092          __ stw(r, st_off, R1_SP);
1093        } else {
1094          __ std(r, st_off, R1_SP);
1095        }
1096      }
1097    }
1098  }
1099
1100  BLOCK_COMMENT("Store method");
1101  // Store method into thread->callee_target.
1102  // We might end up in handle_wrong_method if the callee is
1103  // deoptimized as we race thru here. If that happens we don't want
1104  // to take a safepoint because the caller frame will look
1105  // interpreted and arguments are now "compiled" so it is much better
1106  // to make this transition invisible to the stack walking
1107  // code. Unfortunately if we try and find the callee by normal means
1108  // a safepoint is possible. So we stash the desired callee in the
1109  // thread and the vm will find there should this case occur.
1110  __ std(R19_method, thread_(callee_target));
1111
1112  // Jump to the compiled code just as if compiled code was doing it.
1113  __ bctr();
1114}
1115
1116AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
1117                                                            int total_args_passed,
1118                                                            int comp_args_on_stack,
1119                                                            const BasicType *sig_bt,
1120                                                            const VMRegPair *regs,
1121                                                            AdapterFingerPrint* fingerprint) {
1122  address i2c_entry;
1123  address c2i_unverified_entry;
1124  address c2i_entry;
1125
1126
1127  // entry: i2c
1128
1129  __ align(CodeEntryAlignment);
1130  i2c_entry = __ pc();
1131  gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
1132
1133
1134  // entry: c2i unverified
1135
1136  __ align(CodeEntryAlignment);
1137  BLOCK_COMMENT("c2i unverified entry");
1138  c2i_unverified_entry = __ pc();
1139
1140  // inline_cache contains a compiledICHolder
1141  const Register ic             = R19_method;
1142  const Register ic_klass       = R11_scratch1;
1143  const Register receiver_klass = R12_scratch2;
1144  const Register code           = R21_tmp1;
1145  const Register ientry         = R23_tmp3;
1146
1147  assert_different_registers(ic, ic_klass, receiver_klass, R3_ARG1, code, ientry);
1148  assert(R11_scratch1 == R11, "need prologue scratch register");
1149
1150  Label call_interpreter;
1151
1152  assert(!MacroAssembler::needs_explicit_null_check(oopDesc::klass_offset_in_bytes()),
1153         "klass offset should reach into any page");
1154  // Check for NULL argument if we don't have implicit null checks.
1155  if (!ImplicitNullChecks || !os::zero_page_read_protected()) {
1156    if (TrapBasedNullChecks) {
1157      __ trap_null_check(R3_ARG1);
1158    } else {
1159      Label valid;
1160      __ cmpdi(CCR0, R3_ARG1, 0);
1161      __ bne_predict_taken(CCR0, valid);
1162      // We have a null argument, branch to ic_miss_stub.
1163      __ b64_patchable((address)SharedRuntime::get_ic_miss_stub(),
1164                       relocInfo::runtime_call_type);
1165      __ BIND(valid);
1166    }
1167  }
1168  // Assume argument is not NULL, load klass from receiver.
1169  __ load_klass(receiver_klass, R3_ARG1);
1170
1171  __ ld(ic_klass, CompiledICHolder::holder_klass_offset(), ic);
1172
1173  if (TrapBasedICMissChecks) {
1174    __ trap_ic_miss_check(receiver_klass, ic_klass);
1175  } else {
1176    Label valid;
1177    __ cmpd(CCR0, receiver_klass, ic_klass);
1178    __ beq_predict_taken(CCR0, valid);
1179    // We have an unexpected klass, branch to ic_miss_stub.
1180    __ b64_patchable((address)SharedRuntime::get_ic_miss_stub(),
1181                     relocInfo::runtime_call_type);
1182    __ BIND(valid);
1183  }
1184
1185  // Argument is valid and klass is as expected, continue.
1186
1187  // Extract method from inline cache, verified entry point needs it.
1188  __ ld(R19_method, CompiledICHolder::holder_method_offset(), ic);
1189  assert(R19_method == ic, "the inline cache register is dead here");
1190
1191  __ ld(code, method_(code));
1192  __ cmpdi(CCR0, code, 0);
1193  __ ld(ientry, method_(interpreter_entry)); // preloaded
1194  __ beq_predict_taken(CCR0, call_interpreter);
1195
1196  // Branch to ic_miss_stub.
1197  __ b64_patchable((address)SharedRuntime::get_ic_miss_stub(), relocInfo::runtime_call_type);
1198
1199  // entry: c2i
1200
1201  c2i_entry = gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, call_interpreter, ientry);
1202
1203  return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry);
1204}
1205
1206#ifdef COMPILER2
1207// An oop arg. Must pass a handle not the oop itself.
1208static void object_move(MacroAssembler* masm,
1209                        int frame_size_in_slots,
1210                        OopMap* oop_map, int oop_handle_offset,
1211                        bool is_receiver, int* receiver_offset,
1212                        VMRegPair src, VMRegPair dst,
1213                        Register r_caller_sp, Register r_temp_1, Register r_temp_2) {
1214  assert(!is_receiver || (is_receiver && (*receiver_offset == -1)),
1215         "receiver has already been moved");
1216
1217  // We must pass a handle. First figure out the location we use as a handle.
1218
1219  if (src.first()->is_stack()) {
1220    // stack to stack or reg
1221
1222    const Register r_handle = dst.first()->is_stack() ? r_temp_1 : dst.first()->as_Register();
1223    Label skip;
1224    const int oop_slot_in_callers_frame = reg2slot(src.first());
1225
1226    guarantee(!is_receiver, "expecting receiver in register");
1227    oop_map->set_oop(VMRegImpl::stack2reg(oop_slot_in_callers_frame + frame_size_in_slots));
1228
1229    __ addi(r_handle, r_caller_sp, reg2offset(src.first()));
1230    __ ld(  r_temp_2, reg2offset(src.first()), r_caller_sp);
1231    __ cmpdi(CCR0, r_temp_2, 0);
1232    __ bne(CCR0, skip);
1233    // Use a NULL handle if oop is NULL.
1234    __ li(r_handle, 0);
1235    __ bind(skip);
1236
1237    if (dst.first()->is_stack()) {
1238      // stack to stack
1239      __ std(r_handle, reg2offset(dst.first()), R1_SP);
1240    } else {
1241      // stack to reg
1242      // Nothing to do, r_handle is already the dst register.
1243    }
1244  } else {
1245    // reg to stack or reg
1246    const Register r_oop      = src.first()->as_Register();
1247    const Register r_handle   = dst.first()->is_stack() ? r_temp_1 : dst.first()->as_Register();
1248    const int oop_slot        = (r_oop->encoding()-R3_ARG1->encoding()) * VMRegImpl::slots_per_word
1249                                + oop_handle_offset; // in slots
1250    const int oop_offset = oop_slot * VMRegImpl::stack_slot_size;
1251    Label skip;
1252
1253    if (is_receiver) {
1254      *receiver_offset = oop_offset;
1255    }
1256    oop_map->set_oop(VMRegImpl::stack2reg(oop_slot));
1257
1258    __ std( r_oop,    oop_offset, R1_SP);
1259    __ addi(r_handle, R1_SP, oop_offset);
1260
1261    __ cmpdi(CCR0, r_oop, 0);
1262    __ bne(CCR0, skip);
1263    // Use a NULL handle if oop is NULL.
1264    __ li(r_handle, 0);
1265    __ bind(skip);
1266
1267    if (dst.first()->is_stack()) {
1268      // reg to stack
1269      __ std(r_handle, reg2offset(dst.first()), R1_SP);
1270    } else {
1271      // reg to reg
1272      // Nothing to do, r_handle is already the dst register.
1273    }
1274  }
1275}
1276
1277static void int_move(MacroAssembler*masm,
1278                     VMRegPair src, VMRegPair dst,
1279                     Register r_caller_sp, Register r_temp) {
1280  assert(src.first()->is_valid(), "incoming must be int");
1281  assert(dst.first()->is_valid() && dst.second() == dst.first()->next(), "outgoing must be long");
1282
1283  if (src.first()->is_stack()) {
1284    if (dst.first()->is_stack()) {
1285      // stack to stack
1286      __ lwa(r_temp, reg2offset(src.first()), r_caller_sp);
1287      __ std(r_temp, reg2offset(dst.first()), R1_SP);
1288    } else {
1289      // stack to reg
1290      __ lwa(dst.first()->as_Register(), reg2offset(src.first()), r_caller_sp);
1291    }
1292  } else if (dst.first()->is_stack()) {
1293    // reg to stack
1294    __ extsw(r_temp, src.first()->as_Register());
1295    __ std(r_temp, reg2offset(dst.first()), R1_SP);
1296  } else {
1297    // reg to reg
1298    __ extsw(dst.first()->as_Register(), src.first()->as_Register());
1299  }
1300}
1301
1302static void long_move(MacroAssembler*masm,
1303                      VMRegPair src, VMRegPair dst,
1304                      Register r_caller_sp, Register r_temp) {
1305  assert(src.first()->is_valid() && src.second() == src.first()->next(), "incoming must be long");
1306  assert(dst.first()->is_valid() && dst.second() == dst.first()->next(), "outgoing must be long");
1307
1308  if (src.first()->is_stack()) {
1309    if (dst.first()->is_stack()) {
1310      // stack to stack
1311      __ ld( r_temp, reg2offset(src.first()), r_caller_sp);
1312      __ std(r_temp, reg2offset(dst.first()), R1_SP);
1313    } else {
1314      // stack to reg
1315      __ ld(dst.first()->as_Register(), reg2offset(src.first()), r_caller_sp);
1316    }
1317  } else if (dst.first()->is_stack()) {
1318    // reg to stack
1319    __ std(src.first()->as_Register(), reg2offset(dst.first()), R1_SP);
1320  } else {
1321    // reg to reg
1322    if (dst.first()->as_Register() != src.first()->as_Register())
1323      __ mr(dst.first()->as_Register(), src.first()->as_Register());
1324  }
1325}
1326
1327static void float_move(MacroAssembler*masm,
1328                       VMRegPair src, VMRegPair dst,
1329                       Register r_caller_sp, Register r_temp) {
1330  assert(src.first()->is_valid() && !src.second()->is_valid(), "incoming must be float");
1331  assert(dst.first()->is_valid() && !dst.second()->is_valid(), "outgoing must be float");
1332
1333  if (src.first()->is_stack()) {
1334    if (dst.first()->is_stack()) {
1335      // stack to stack
1336      __ lwz(r_temp, reg2offset(src.first()), r_caller_sp);
1337      __ stw(r_temp, reg2offset(dst.first()), R1_SP);
1338    } else {
1339      // stack to reg
1340      __ lfs(dst.first()->as_FloatRegister(), reg2offset(src.first()), r_caller_sp);
1341    }
1342  } else if (dst.first()->is_stack()) {
1343    // reg to stack
1344    __ stfs(src.first()->as_FloatRegister(), reg2offset(dst.first()), R1_SP);
1345  } else {
1346    // reg to reg
1347    if (dst.first()->as_FloatRegister() != src.first()->as_FloatRegister())
1348      __ fmr(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
1349  }
1350}
1351
1352static void double_move(MacroAssembler*masm,
1353                        VMRegPair src, VMRegPair dst,
1354                        Register r_caller_sp, Register r_temp) {
1355  assert(src.first()->is_valid() && src.second() == src.first()->next(), "incoming must be double");
1356  assert(dst.first()->is_valid() && dst.second() == dst.first()->next(), "outgoing must be double");
1357
1358  if (src.first()->is_stack()) {
1359    if (dst.first()->is_stack()) {
1360      // stack to stack
1361      __ ld( r_temp, reg2offset(src.first()), r_caller_sp);
1362      __ std(r_temp, reg2offset(dst.first()), R1_SP);
1363    } else {
1364      // stack to reg
1365      __ lfd(dst.first()->as_FloatRegister(), reg2offset(src.first()), r_caller_sp);
1366    }
1367  } else if (dst.first()->is_stack()) {
1368    // reg to stack
1369    __ stfd(src.first()->as_FloatRegister(), reg2offset(dst.first()), R1_SP);
1370  } else {
1371    // reg to reg
1372    if (dst.first()->as_FloatRegister() != src.first()->as_FloatRegister())
1373      __ fmr(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
1374  }
1375}
1376
1377void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1378  switch (ret_type) {
1379    case T_BOOLEAN:
1380    case T_CHAR:
1381    case T_BYTE:
1382    case T_SHORT:
1383    case T_INT:
1384      __ stw (R3_RET,  frame_slots*VMRegImpl::stack_slot_size, R1_SP);
1385      break;
1386    case T_ARRAY:
1387    case T_OBJECT:
1388    case T_LONG:
1389      __ std (R3_RET,  frame_slots*VMRegImpl::stack_slot_size, R1_SP);
1390      break;
1391    case T_FLOAT:
1392      __ stfs(F1_RET, frame_slots*VMRegImpl::stack_slot_size, R1_SP);
1393      break;
1394    case T_DOUBLE:
1395      __ stfd(F1_RET, frame_slots*VMRegImpl::stack_slot_size, R1_SP);
1396      break;
1397    case T_VOID:
1398      break;
1399    default:
1400      ShouldNotReachHere();
1401      break;
1402  }
1403}
1404
1405void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1406  switch (ret_type) {
1407    case T_BOOLEAN:
1408    case T_CHAR:
1409    case T_BYTE:
1410    case T_SHORT:
1411    case T_INT:
1412      __ lwz(R3_RET,  frame_slots*VMRegImpl::stack_slot_size, R1_SP);
1413      break;
1414    case T_ARRAY:
1415    case T_OBJECT:
1416    case T_LONG:
1417      __ ld (R3_RET,  frame_slots*VMRegImpl::stack_slot_size, R1_SP);
1418      break;
1419    case T_FLOAT:
1420      __ lfs(F1_RET, frame_slots*VMRegImpl::stack_slot_size, R1_SP);
1421      break;
1422    case T_DOUBLE:
1423      __ lfd(F1_RET, frame_slots*VMRegImpl::stack_slot_size, R1_SP);
1424      break;
1425    case T_VOID:
1426      break;
1427    default:
1428      ShouldNotReachHere();
1429      break;
1430  }
1431}
1432
1433static void save_or_restore_arguments(MacroAssembler* masm,
1434                                      const int stack_slots,
1435                                      const int total_in_args,
1436                                      const int arg_save_area,
1437                                      OopMap* map,
1438                                      VMRegPair* in_regs,
1439                                      BasicType* in_sig_bt) {
1440  // If map is non-NULL then the code should store the values,
1441  // otherwise it should load them.
1442  int slot = arg_save_area;
1443  // Save down double word first.
1444  for (int i = 0; i < total_in_args; i++) {
1445    if (in_regs[i].first()->is_FloatRegister() && in_sig_bt[i] == T_DOUBLE) {
1446      int offset = slot * VMRegImpl::stack_slot_size;
1447      slot += VMRegImpl::slots_per_word;
1448      assert(slot <= stack_slots, "overflow (after DOUBLE stack slot)");
1449      if (map != NULL) {
1450        __ stfd(in_regs[i].first()->as_FloatRegister(), offset, R1_SP);
1451      } else {
1452        __ lfd(in_regs[i].first()->as_FloatRegister(), offset, R1_SP);
1453      }
1454    } else if (in_regs[i].first()->is_Register() &&
1455        (in_sig_bt[i] == T_LONG || in_sig_bt[i] == T_ARRAY)) {
1456      int offset = slot * VMRegImpl::stack_slot_size;
1457      if (map != NULL) {
1458        __ std(in_regs[i].first()->as_Register(), offset, R1_SP);
1459        if (in_sig_bt[i] == T_ARRAY) {
1460          map->set_oop(VMRegImpl::stack2reg(slot));
1461        }
1462      } else {
1463        __ ld(in_regs[i].first()->as_Register(), offset, R1_SP);
1464      }
1465      slot += VMRegImpl::slots_per_word;
1466      assert(slot <= stack_slots, "overflow (after LONG/ARRAY stack slot)");
1467    }
1468  }
1469  // Save or restore single word registers.
1470  for (int i = 0; i < total_in_args; i++) {
1471    // PPC64: pass ints as longs: must only deal with floats here.
1472    if (in_regs[i].first()->is_FloatRegister()) {
1473      if (in_sig_bt[i] == T_FLOAT) {
1474        int offset = slot * VMRegImpl::stack_slot_size;
1475        slot++;
1476        assert(slot <= stack_slots, "overflow (after FLOAT stack slot)");
1477        if (map != NULL) {
1478          __ stfs(in_regs[i].first()->as_FloatRegister(), offset, R1_SP);
1479        } else {
1480          __ lfs(in_regs[i].first()->as_FloatRegister(), offset, R1_SP);
1481        }
1482      }
1483    } else if (in_regs[i].first()->is_stack()) {
1484      if (in_sig_bt[i] == T_ARRAY && map != NULL) {
1485        int offset_in_older_frame = in_regs[i].first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
1486        map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + stack_slots));
1487      }
1488    }
1489  }
1490}
1491
1492// Check GCLocker::needs_gc and enter the runtime if it's true. This
1493// keeps a new JNI critical region from starting until a GC has been
1494// forced. Save down any oops in registers and describe them in an
1495// OopMap.
1496static void check_needs_gc_for_critical_native(MacroAssembler* masm,
1497                                               const int stack_slots,
1498                                               const int total_in_args,
1499                                               const int arg_save_area,
1500                                               OopMapSet* oop_maps,
1501                                               VMRegPair* in_regs,
1502                                               BasicType* in_sig_bt,
1503                                               Register tmp_reg ) {
1504  __ block_comment("check GCLocker::needs_gc");
1505  Label cont;
1506  __ lbz(tmp_reg, (RegisterOrConstant)(intptr_t)GCLocker::needs_gc_address());
1507  __ cmplwi(CCR0, tmp_reg, 0);
1508  __ beq(CCR0, cont);
1509
1510  // Save down any values that are live in registers and call into the
1511  // runtime to halt for a GC.
1512  OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1513  save_or_restore_arguments(masm, stack_slots, total_in_args,
1514                            arg_save_area, map, in_regs, in_sig_bt);
1515
1516  __ mr(R3_ARG1, R16_thread);
1517  __ set_last_Java_frame(R1_SP, noreg);
1518
1519  __ block_comment("block_for_jni_critical");
1520  address entry_point = CAST_FROM_FN_PTR(address, SharedRuntime::block_for_jni_critical);
1521#if defined(ABI_ELFv2)
1522  __ call_c(entry_point, relocInfo::runtime_call_type);
1523#else
1524  __ call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::runtime_call_type);
1525#endif
1526  address start           = __ pc() - __ offset(),
1527          calls_return_pc = __ last_calls_return_pc();
1528  oop_maps->add_gc_map(calls_return_pc - start, map);
1529
1530  __ reset_last_Java_frame();
1531
1532  // Reload all the register arguments.
1533  save_or_restore_arguments(masm, stack_slots, total_in_args,
1534                            arg_save_area, NULL, in_regs, in_sig_bt);
1535
1536  __ BIND(cont);
1537
1538#ifdef ASSERT
1539  if (StressCriticalJNINatives) {
1540    // Stress register saving.
1541    OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1542    save_or_restore_arguments(masm, stack_slots, total_in_args,
1543                              arg_save_area, map, in_regs, in_sig_bt);
1544    // Destroy argument registers.
1545    for (int i = 0; i < total_in_args; i++) {
1546      if (in_regs[i].first()->is_Register()) {
1547        const Register reg = in_regs[i].first()->as_Register();
1548        __ neg(reg, reg);
1549      } else if (in_regs[i].first()->is_FloatRegister()) {
1550        __ fneg(in_regs[i].first()->as_FloatRegister(), in_regs[i].first()->as_FloatRegister());
1551      }
1552    }
1553
1554    save_or_restore_arguments(masm, stack_slots, total_in_args,
1555                              arg_save_area, NULL, in_regs, in_sig_bt);
1556  }
1557#endif
1558}
1559
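// Copy a pointer-sized value between the locations described by src and dst; each
// may be a register or a stack slot (incoming slots are read relative to the
// caller's SP, outgoing slots are written relative to R1_SP).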
1560static void move_ptr(MacroAssembler* masm, VMRegPair src, VMRegPair dst, Register r_caller_sp, Register r_temp) {
1561  if (src.first()->is_stack()) {
1562    if (dst.first()->is_stack()) {
1563      // stack to stack
1564      __ ld(r_temp, reg2offset(src.first()), r_caller_sp);
1565      __ std(r_temp, reg2offset(dst.first()), R1_SP);
1566    } else {
1567      // stack to reg
1568      __ ld(dst.first()->as_Register(), reg2offset(src.first()), r_caller_sp);
1569    }
1570  } else if (dst.first()->is_stack()) {
1571    // reg to stack
1572    __ std(src.first()->as_Register(), reg2offset(dst.first()), R1_SP);
1573  } else {
1574    if (dst.first() != src.first()) {
1575      __ mr(dst.first()->as_Register(), src.first()->as_Register());
1576    }
1577  }
1578}
1579
1580// Unpack an array argument into a pointer to the body and the length
1581// if the array is non-null, otherwise pass 0 for both.
1582static void unpack_array_argument(MacroAssembler* masm, VMRegPair reg, BasicType in_elem_type,
1583                                  VMRegPair body_arg, VMRegPair length_arg, Register r_caller_sp,
1584                                  Register tmp_reg, Register tmp2_reg) {
1585  assert(!body_arg.first()->is_Register() || body_arg.first()->as_Register() != tmp_reg,
1586         "possible collision");
1587  assert(!length_arg.first()->is_Register() || length_arg.first()->as_Register() != tmp_reg,
1588         "possible collision");
1589
1590  // Pass the length, ptr pair.
1591  Label set_out_args;
1592  VMRegPair tmp, tmp2;
1593  tmp.set_ptr(tmp_reg->as_VMReg());
1594  tmp2.set_ptr(tmp2_reg->as_VMReg());
1595  if (reg.first()->is_stack()) {
1596    // Load the arg up from the stack.
1597    move_ptr(masm, reg, tmp, r_caller_sp, /*unused*/ R0);
1598    reg = tmp;
1599  }
1600  __ li(tmp2_reg, 0); // Pass zeros if Array=null.
1601  if (tmp_reg != reg.first()->as_Register()) __ li(tmp_reg, 0);
1602  __ cmpdi(CCR0, reg.first()->as_Register(), 0);
1603  __ beq(CCR0, set_out_args);
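  // Non-null array: load its length and compute the address of its first element.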
1604  __ lwa(tmp2_reg, arrayOopDesc::length_offset_in_bytes(), reg.first()->as_Register());
1605  __ addi(tmp_reg, reg.first()->as_Register(), arrayOopDesc::base_offset_in_bytes(in_elem_type));
1606  __ bind(set_out_args);
1607  move_ptr(masm, tmp, body_arg, r_caller_sp, /*unused*/ R0);
1608  move_ptr(masm, tmp2, length_arg, r_caller_sp, /*unused*/ R0); // Same as move32_64 on PPC64.
1609}
1610
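// With -XX:+VerifyOops, check that every incoming T_OBJECT/T_ARRAY argument still
// looks like a valid oop, whether it was passed in a register or on the stack.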
1611static void verify_oop_args(MacroAssembler* masm,
1612                            methodHandle method,
1613                            const BasicType* sig_bt,
1614                            const VMRegPair* regs) {
1615  Register temp_reg = R19_method;  // not part of any compiled calling seq
1616  if (VerifyOops) {
1617    for (int i = 0; i < method->size_of_parameters(); i++) {
1618      if (sig_bt[i] == T_OBJECT ||
1619          sig_bt[i] == T_ARRAY) {
1620        VMReg r = regs[i].first();
1621        assert(r->is_valid(), "bad oop arg");
1622        if (r->is_stack()) {
1623          __ ld(temp_reg, reg2offset(r), R1_SP);
1624          __ verify_oop(temp_reg);
1625        } else {
1626          __ verify_oop(r->as_Register());
1627        }
1628      }
1629    }
1630  }
1631}
1632
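// Generate the body of a method handle intrinsic: load the receiver and/or the
// trailing MemberName argument into registers and emit the dispatch to the actual
// target. No native call is made for these methods.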
1633static void gen_special_dispatch(MacroAssembler* masm,
1634                                 methodHandle method,
1635                                 const BasicType* sig_bt,
1636                                 const VMRegPair* regs) {
1637  verify_oop_args(masm, method, sig_bt, regs);
1638  vmIntrinsics::ID iid = method->intrinsic_id();
1639
1640  // Now write the args into the outgoing interpreter space
1641  bool     has_receiver   = false;
1642  Register receiver_reg   = noreg;
1643  int      member_arg_pos = -1;
1644  Register member_reg     = noreg;
1645  int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1646  if (ref_kind != 0) {
1647    member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1648    member_reg = R19_method;  // known to be free at this point
1649    has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1650  } else if (iid == vmIntrinsics::_invokeBasic) {
1651    has_receiver = true;
1652  } else {
1653    fatal("unexpected intrinsic id %d", iid);
1654  }
1655
1656  if (member_reg != noreg) {
1657    // Load the member_arg into register, if necessary.
1658    SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1659    VMReg r = regs[member_arg_pos].first();
1660    if (r->is_stack()) {
1661      __ ld(member_reg, reg2offset(r), R1_SP);
1662    } else {
1663      // no data motion is needed
1664      member_reg = r->as_Register();
1665    }
1666  }
1667
1668  if (has_receiver) {
1669    // Make sure the receiver is loaded into a register.
1670    assert(method->size_of_parameters() > 0, "oob");
1671    assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1672    VMReg r = regs[0].first();
1673    assert(r->is_valid(), "bad receiver arg");
1674    if (r->is_stack()) {
1675      // Porting note:  This assumes that compiled calling conventions always
1676      // pass the receiver oop in a register.  If this is not true on some
1677      // platform, pick a temp and load the receiver from stack.
1678      fatal("receiver always in a register");
1679      receiver_reg = R11_scratch1;  // TODO (hs24): is R11_scratch1 really free at this point?
1680      __ ld(receiver_reg, reg2offset(r), R1_SP);
1681    } else {
1682      // no data motion is needed
1683      receiver_reg = r->as_Register();
1684    }
1685  }
1686
1687  // Figure out which address we are really jumping to:
1688  MethodHandles::generate_method_handle_dispatch(masm, iid,
1689                                                 receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1690}
1691
1692#endif // COMPILER2
1693
1694// ---------------------------------------------------------------------------
1695// Generate a native wrapper for a given method. The method takes arguments
1696// in the Java compiled code convention, marshals them to the native
1697// convention (handlizes oops, etc), transitions to native, makes the call,
1698// returns to java state (possibly blocking), unhandlizes any result and
1699// returns.
1700//
1701// Critical native functions are a shorthand for the use of
1702// GetPrimitiveArrayCritical and disallow the use of any other JNI
1703// functions.  The wrapper is expected to unpack the arguments before
1704// passing them to the callee and perform checks before and after the
1705// native call to ensure that the GCLocker
1706// lock_critical/unlock_critical semantics are followed.  Some other
1707// parts of JNI setup are skipped, like the tear down of the JNI handle
1708// block and the check for pending exceptions, because it's impossible
1709// for them to be thrown.
1710//
1711// They are roughly structured like this:
1712//   if (GCLocker::needs_gc())
1713//     SharedRuntime::block_for_jni_critical();
1714//   transition to thread_in_native
1715//   unpack array arguments and call native entry point
1716//   check for safepoint in progress
1717//   check if any thread suspend flags are set
1718//     call into the VM and possibly unlock the JNI critical section
1719//     if a GC was suppressed while in the critical native.
1720//   transition back to thread_in_Java
1721//   return to caller
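//   For example (illustration only): a critical native taking a byte[] receives an
//   (int length, jbyte* body) pair in place of the array reference, and gets
//   neither a JNIEnv* nor a class/this argument.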
1722//
1723nmethod *SharedRuntime::generate_native_wrapper(MacroAssembler *masm,
1724                                                const methodHandle& method,
1725                                                int compile_id,
1726                                                BasicType *in_sig_bt,
1727                                                VMRegPair *in_regs,
1728                                                BasicType ret_type) {
1729#ifdef COMPILER2
1730  if (method->is_method_handle_intrinsic()) {
1731    vmIntrinsics::ID iid = method->intrinsic_id();
1732    intptr_t start = (intptr_t)__ pc();
1733    int vep_offset = ((intptr_t)__ pc()) - start;
1734    gen_special_dispatch(masm,
1735                         method,
1736                         in_sig_bt,
1737                         in_regs);
1738    int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
1739    __ flush();
1740    int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
1741    return nmethod::new_native_nmethod(method,
1742                                       compile_id,
1743                                       masm->code(),
1744                                       vep_offset,
1745                                       frame_complete,
1746                                       stack_slots / VMRegImpl::slots_per_word,
1747                                       in_ByteSize(-1),
1748                                       in_ByteSize(-1),
1749                                       (OopMapSet*)NULL);
1750  }
1751
1752  bool is_critical_native = true;
1753  address native_func = method->critical_native_function();
1754  if (native_func == NULL) {
1755    native_func = method->native_function();
1756    is_critical_native = false;
1757  }
1758  assert(native_func != NULL, "must have function");
1759
1760  // First, create signature for outgoing C call
1761  // --------------------------------------------------------------------------
1762
1763  int total_in_args = method->size_of_parameters();
1764  // We have received a description of where all the java args are located
1765  // on entry to the wrapper. We need to convert these args to where
1766  // the jni function will expect them. To figure out where they go
1767  // we convert the java signature to a C signature by inserting
1768  // the hidden arguments as arg[0] and possibly arg[1] (static method)
1769
1770  // Calculate the total number of C arguments and create arrays for the
1771  // signature and the outgoing registers.
1772  // On ppc64, we have two arrays for the outgoing registers, because
1773  // some floating-point arguments must be passed in registers _and_
1774  // in stack locations.
1775  bool method_is_static = method->is_static();
1776  int  total_c_args     = total_in_args;
1777
1778  if (!is_critical_native) {
1779    int n_hidden_args = method_is_static ? 2 : 1;
1780    total_c_args += n_hidden_args;
1781  } else {
1782    // No JNIEnv*, no this*, but unpacked arrays (base+length).
1783    for (int i = 0; i < total_in_args; i++) {
1784      if (in_sig_bt[i] == T_ARRAY) {
1785        total_c_args++;
1786      }
1787    }
1788  }
1789
1790  BasicType *out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1791  VMRegPair *out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1792  VMRegPair *out_regs2  = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1793  BasicType* in_elem_bt = NULL;
1794
1795  // Create the signature for the C call:
1796  //   1) add the JNIEnv*
1797  //   2) add the class if the method is static
1798  //   3) copy the rest of the incoming signature (shifted by the number of
1799  //      hidden arguments).
1800
1801  int argc = 0;
1802  if (!is_critical_native) {
1803    out_sig_bt[argc++] = T_ADDRESS;
1804    if (method->is_static()) {
1805      out_sig_bt[argc++] = T_OBJECT;
1806    }
1807
1808    for (int i = 0; i < total_in_args ; i++ ) {
1809      out_sig_bt[argc++] = in_sig_bt[i];
1810    }
1811  } else {
1812    Thread* THREAD = Thread::current();
1813    in_elem_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1814    SignatureStream ss(method->signature());
1815    int o = 0;
1816    for (int i = 0; i < total_in_args ; i++, o++) {
1817      if (in_sig_bt[i] == T_ARRAY) {
1818        // Arrays are passed as int, elem* pair
1819        Symbol* atype = ss.as_symbol(CHECK_NULL);
1820        const char* at = atype->as_C_string();
1821        if (strlen(at) == 2) {
1822          assert(at[0] == '[', "must be");
1823          switch (at[1]) {
1824            case 'B': in_elem_bt[o] = T_BYTE; break;
1825            case 'C': in_elem_bt[o] = T_CHAR; break;
1826            case 'D': in_elem_bt[o] = T_DOUBLE; break;
1827            case 'F': in_elem_bt[o] = T_FLOAT; break;
1828            case 'I': in_elem_bt[o] = T_INT; break;
1829            case 'J': in_elem_bt[o] = T_LONG; break;
1830            case 'S': in_elem_bt[o] = T_SHORT; break;
1831            case 'Z': in_elem_bt[o] = T_BOOLEAN; break;
1832            default: ShouldNotReachHere();
1833          }
1834        }
1835      } else {
1836        in_elem_bt[o] = T_VOID;
1837      }
1838      if (in_sig_bt[i] != T_VOID) {
1839        assert(in_sig_bt[i] == ss.type(), "must match");
1840        ss.next();
1841      }
1842    }
1843
1844    for (int i = 0; i < total_in_args ; i++ ) {
1845      if (in_sig_bt[i] == T_ARRAY) {
1846        // Arrays are passed as int, elem* pair.
1847        out_sig_bt[argc++] = T_INT;
1848        out_sig_bt[argc++] = T_ADDRESS;
1849      } else {
1850        out_sig_bt[argc++] = in_sig_bt[i];
1851      }
1852    }
1853  }
1854
1855
1856  // Compute the wrapper's frame size.
1857  // --------------------------------------------------------------------------
1858
1859  // Now figure out where the args must be stored and how much stack space
1860  // they require.
1861  //
1862  // Compute framesize for the wrapper. We need to handlize all oops in
1863  // incoming registers.
1864  //
1865  // Calculate the total number of stack slots we will need:
1866  //   1) abi requirements
1867  //   2) outgoing arguments
1868  //   3) space for inbound oop handle area
1869  //   4) space for handlizing a klass if static method
1870  //   5) space for a lock if synchronized method
1871  //   6) workspace for saving return values, int <-> float reg moves, etc.
1872  //   7) alignment
1873  //
1874  // Layout of the native wrapper frame:
1875  // (stack grows upwards, memory grows downwards)
1876  //
1877  // NW     [ABI_REG_ARGS]             <-- 1) R1_SP
1878  //        [outgoing arguments]       <-- 2) R1_SP + out_arg_slot_offset
1879  //        [oopHandle area]           <-- 3) R1_SP + oop_handle_offset (save area for critical natives)
1880  //        klass                      <-- 4) R1_SP + klass_offset
1881  //        lock                       <-- 5) R1_SP + lock_offset
1882  //        [workspace]                <-- 6) R1_SP + workspace_offset
1883  //        [alignment] (optional)     <-- 7)
1884  // caller [JIT_TOP_ABI_48]           <-- r_callers_sp
1885  //
1886  // - *_slot_offset Indicates offset from SP in number of stack slots.
1887  // - *_offset      Indicates offset from SP in bytes.
1888
1889  int stack_slots = c_calling_convention(out_sig_bt, out_regs, out_regs2, total_c_args) // 1+2)
1890                  + SharedRuntime::out_preserve_stack_slots(); // See c_calling_convention.
1891
1892  // Now the space for the inbound oop handle area.
1893  int total_save_slots = num_java_iarg_registers * VMRegImpl::slots_per_word;
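  // By default (overridden below for critical natives), reserve one word per Java
  // integer-argument register; this is enough to handlize every oop argument that
  // arrives in a register.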
1894  if (is_critical_native) {
1895    // Critical natives may have to call out so they need a save area
1896    // for register arguments.
1897    int double_slots = 0;
1898    int single_slots = 0;
1899    for (int i = 0; i < total_in_args; i++) {
1900      if (in_regs[i].first()->is_Register()) {
1901        const Register reg = in_regs[i].first()->as_Register();
1902        switch (in_sig_bt[i]) {
1903          case T_BOOLEAN:
1904          case T_BYTE:
1905          case T_SHORT:
1906          case T_CHAR:
1907          case T_INT:
1908          // Fall through.
1909          case T_ARRAY:
1910          case T_LONG: double_slots++; break;
1911          default:  ShouldNotReachHere();
1912        }
1913      } else if (in_regs[i].first()->is_FloatRegister()) {
1914        switch (in_sig_bt[i]) {
1915          case T_FLOAT:  single_slots++; break;
1916          case T_DOUBLE: double_slots++; break;
1917          default:  ShouldNotReachHere();
1918        }
1919      }
1920    }
1921    total_save_slots = double_slots * 2 + round_to(single_slots, 2); // round to even
1922  }
1923
1924  int oop_handle_slot_offset = stack_slots;
1925  stack_slots += total_save_slots;                                                // 3)
1926
1927  int klass_slot_offset = 0;
1928  int klass_offset      = -1;
1929  if (method_is_static && !is_critical_native) {                                  // 4)
1930    klass_slot_offset  = stack_slots;
1931    klass_offset       = klass_slot_offset * VMRegImpl::stack_slot_size;
1932    stack_slots       += VMRegImpl::slots_per_word;
1933  }
1934
1935  int lock_slot_offset = 0;
1936  int lock_offset      = -1;
1937  if (method->is_synchronized()) {                                                // 5)
1938    lock_slot_offset   = stack_slots;
1939    lock_offset        = lock_slot_offset * VMRegImpl::stack_slot_size;
1940    stack_slots       += VMRegImpl::slots_per_word;
1941  }
1942
1943  int workspace_slot_offset = stack_slots;                                        // 6)
1944  stack_slots         += 2;
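  // These two slots are used by save_native_result()/restore_native_result() to
  // preserve the 8-byte native result (R3_RET or F1_RET) across VM calls.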
1945
1946  // Now compute actual number of stack words we need.
1947  // Rounding to make stack properly aligned.
1948  stack_slots = round_to(stack_slots,                                             // 7)
1949                         frame::alignment_in_bytes / VMRegImpl::stack_slot_size);
1950  int frame_size_in_bytes = stack_slots * VMRegImpl::stack_slot_size;
1951
1952
1953  // Now we can start generating code.
1954  // --------------------------------------------------------------------------
1955
1956  intptr_t start_pc = (intptr_t)__ pc();
1957  intptr_t vep_start_pc;
1958  intptr_t frame_done_pc;
1959  intptr_t oopmap_pc;
1960
1961  Label    ic_miss;
1962  Label    handle_pending_exception;
1963
1964  Register r_callers_sp = R21;
1965  Register r_temp_1     = R22;
1966  Register r_temp_2     = R23;
1967  Register r_temp_3     = R24;
1968  Register r_temp_4     = R25;
1969  Register r_temp_5     = R26;
1970  Register r_temp_6     = R27;
1971  Register r_return_pc  = R28;
1972
1973  Register r_carg1_jnienv        = noreg;
1974  Register r_carg2_classorobject = noreg;
1975  if (!is_critical_native) {
1976    r_carg1_jnienv        = out_regs[0].first()->as_Register();
1977    r_carg2_classorobject = out_regs[1].first()->as_Register();
1978  }
1979
1980
1981  // Generate the Unverified Entry Point (UEP).
1982  // --------------------------------------------------------------------------
1983  assert(start_pc == (intptr_t)__ pc(), "uep must be at start");
1984
1985  // Check ic: object class == cached class?
1986  if (!method_is_static) {
1987    Register ic = as_Register(Matcher::inline_cache_reg_encode());
1988    Register receiver_klass = r_temp_1;
1989
1990    __ cmpdi(CCR0, R3_ARG1, 0);
1991    __ beq(CCR0, ic_miss);
1992    __ verify_oop(R3_ARG1);
1993    __ load_klass(receiver_klass, R3_ARG1);
1994
1995    __ cmpd(CCR0, receiver_klass, ic);
1996    __ bne(CCR0, ic_miss);
1997  }
1998
1999
2000  // Generate the Verified Entry Point (VEP).
2001  // --------------------------------------------------------------------------
2002  vep_start_pc = (intptr_t)__ pc();
2003
2004  __ save_LR_CR(r_temp_1);
2005  __ generate_stack_overflow_check(frame_size_in_bytes); // Check before creating frame.
2006  __ mr(r_callers_sp, R1_SP);                            // Remember frame pointer.
2007  __ push_frame(frame_size_in_bytes, r_temp_1);          // Push the c2n adapter's frame.
2008  frame_done_pc = (intptr_t)__ pc();
2009
2010  __ verify_thread();
2011
2012  // Native nmethod wrappers never take possession of the oop arguments.
2013  // So the caller will gc the arguments.
2014  // The only thing we need an oopMap for is if the call is static.
2015  //
2016  // An OopMap for lock (and class if static), and one for the VM call itself.
2017  OopMapSet *oop_maps = new OopMapSet();
2018  OopMap    *oop_map  = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2019
2020  if (is_critical_native) {
2021    check_needs_gc_for_critical_native(masm, stack_slots, total_in_args, oop_handle_slot_offset, oop_maps, in_regs, in_sig_bt, r_temp_1);
2022  }
2023
2024  // Move arguments from register/stack to register/stack.
2025  // --------------------------------------------------------------------------
2026  //
2027  // We immediately shuffle the arguments so that for any vm call we have
2028  // to make from here on out (sync slow path, jvmti, etc.) we will have
2029  // captured the oops from our caller and have a valid oopMap for them.
2030  //
2031  // Natives require 1 or 2 extra arguments over the normal ones: the JNIEnv*
2032  // (derived from JavaThread* which is in R16_thread) and, if static,
2033  // the class mirror instead of a receiver. This pretty much guarantees that
2034  // register layout will not match. We ignore these extra arguments during
2035  // the shuffle. The shuffle is described by the two calling convention
2036  // vectors we have in our possession. We simply walk the java vector to
2037  // get the source locations and the c vector to get the destinations.
2038
2039  // Record sp-based slot for receiver on stack for non-static methods.
2040  int receiver_offset = -1;
2041
2042  // We move the arguments backwards because the floating point destination
2043  // will always be a register with a greater or equal register number,
2044  // or a stack slot.
2045  //   in  is the index of the incoming Java arguments
2046  //   out is the index of the outgoing C arguments
2047
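  // In debug builds, track which registers the shuffle has already overwritten so
  // that reading an already-clobbered source register triggers an assert below.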
2048#ifdef ASSERT
2049  bool reg_destroyed[RegisterImpl::number_of_registers];
2050  bool freg_destroyed[FloatRegisterImpl::number_of_registers];
2051  for (int r = 0 ; r < RegisterImpl::number_of_registers ; r++) {
2052    reg_destroyed[r] = false;
2053  }
2054  for (int f = 0 ; f < FloatRegisterImpl::number_of_registers ; f++) {
2055    freg_destroyed[f] = false;
2056  }
2057#endif // ASSERT
2058
2059  for (int in = total_in_args - 1, out = total_c_args - 1; in >= 0 ; in--, out--) {
2060
2061#ifdef ASSERT
2062    if (in_regs[in].first()->is_Register()) {
2063      assert(!reg_destroyed[in_regs[in].first()->as_Register()->encoding()], "ack!");
2064    } else if (in_regs[in].first()->is_FloatRegister()) {
2065      assert(!freg_destroyed[in_regs[in].first()->as_FloatRegister()->encoding()], "ack!");
2066    }
2067    if (out_regs[out].first()->is_Register()) {
2068      reg_destroyed[out_regs[out].first()->as_Register()->encoding()] = true;
2069    } else if (out_regs[out].first()->is_FloatRegister()) {
2070      freg_destroyed[out_regs[out].first()->as_FloatRegister()->encoding()] = true;
2071    }
2072    if (out_regs2[out].first()->is_Register()) {
2073      reg_destroyed[out_regs2[out].first()->as_Register()->encoding()] = true;
2074    } else if (out_regs2[out].first()->is_FloatRegister()) {
2075      freg_destroyed[out_regs2[out].first()->as_FloatRegister()->encoding()] = true;
2076    }
2077#endif // ASSERT
2078
2079    switch (in_sig_bt[in]) {
2080      case T_BOOLEAN:
2081      case T_CHAR:
2082      case T_BYTE:
2083      case T_SHORT:
2084      case T_INT:
2085        // Move int and do sign extension.
2086        int_move(masm, in_regs[in], out_regs[out], r_callers_sp, r_temp_1);
2087        break;
2088      case T_LONG:
2089        long_move(masm, in_regs[in], out_regs[out], r_callers_sp, r_temp_1);
2090        break;
2091      case T_ARRAY:
2092        if (is_critical_native) {
2093          int body_arg = out;
2094          out -= 1; // Point to length arg.
2095          unpack_array_argument(masm, in_regs[in], in_elem_bt[in], out_regs[body_arg], out_regs[out],
2096                                r_callers_sp, r_temp_1, r_temp_2);
2097          break;
2098        }
2099      case T_OBJECT:
2100        assert(!is_critical_native, "no oop arguments");
2101        object_move(masm, stack_slots,
2102                    oop_map, oop_handle_slot_offset,
2103                    ((in == 0) && (!method_is_static)), &receiver_offset,
2104                    in_regs[in], out_regs[out],
2105                    r_callers_sp, r_temp_1, r_temp_2);
2106        break;
2107      case T_VOID:
2108        break;
2109      case T_FLOAT:
2110        float_move(masm, in_regs[in], out_regs[out], r_callers_sp, r_temp_1);
2111        if (out_regs2[out].first()->is_valid()) {
2112          float_move(masm, in_regs[in], out_regs2[out], r_callers_sp, r_temp_1);
2113        }
2114        break;
2115      case T_DOUBLE:
2116        double_move(masm, in_regs[in], out_regs[out], r_callers_sp, r_temp_1);
2117        if (out_regs2[out].first()->is_valid()) {
2118          double_move(masm, in_regs[in], out_regs2[out], r_callers_sp, r_temp_1);
2119        }
2120        break;
2121      case T_ADDRESS:
2122        fatal("found type (T_ADDRESS) in java args");
2123        break;
2124      default:
2125        ShouldNotReachHere();
2126        break;
2127    }
2128  }
2129
2130  // Pre-load a static method's oop into ARG2.
2131  // Used both by locking code and the normal JNI call code.
2132  if (method_is_static && !is_critical_native) {
2133    __ set_oop_constant(JNIHandles::make_local(method->method_holder()->java_mirror()),
2134                        r_carg2_classorobject);
2135
2136    // Now handlize the static class mirror in carg2. It's known not-null.
2137    __ std(r_carg2_classorobject, klass_offset, R1_SP);
2138    oop_map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2139    __ addi(r_carg2_classorobject, R1_SP, klass_offset);
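    // r_carg2_classorobject now holds a JNI handle, i.e. the address of the stack
    // slot containing the mirror, not the mirror oop itself.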
2140  }
2141
2142  // Get JNIEnv* which is first argument to native.
2143  if (!is_critical_native) {
2144    __ addi(r_carg1_jnienv, R16_thread, in_bytes(JavaThread::jni_environment_offset()));
2145  }
2146
2147  // NOTE:
2148  //
2149  // We have all of the arguments setup at this point.
2150  // We MUST NOT touch any outgoing regs from this point on.
2151  // So if we must call out we must push a new frame.
2152
2153  // Get current pc for oopmap, and load it patchable relative to global toc.
2154  oopmap_pc = (intptr_t) __ pc();
2155  __ calculate_address_from_global_toc(r_return_pc, (address)oopmap_pc, true, true, true, true);
2156
2157  // We use the same pc/oopMap repeatedly when we call out.
2158  oop_maps->add_gc_map(oopmap_pc - start_pc, oop_map);
2159
2160  // r_return_pc now has the pc loaded that we will use when we finally call
2161  // to native.
2162
2163  // Make sure that thread is non-volatile; it crosses a bunch of VM calls below.
2164  assert(R16_thread->is_nonvolatile(), "thread must be in non-volatile register");
2165
2166# if 0
2167  // DTrace method entry
2168# endif
2169
2170  // Lock a synchronized method.
2171  // --------------------------------------------------------------------------
2172
2173  if (method->is_synchronized()) {
2174    assert(!is_critical_native, "unhandled");
2175    ConditionRegister r_flag = CCR1;
2176    Register          r_oop  = r_temp_4;
2177    const Register    r_box  = r_temp_5;
2178    Label             done, locked;
2179
2180    // Load the oop for the object or class. r_carg2_classorobject contains
2181    // either the handlized oop from the incoming arguments or the handlized
2182    // class mirror (if the method is static).
2183    __ ld(r_oop, 0, r_carg2_classorobject);
2184
2185    // Get the lock box slot's address.
2186    __ addi(r_box, R1_SP, lock_offset);
2187
2188#   ifdef ASSERT
2189    if (UseBiasedLocking) {
2190      // Making the box point to itself will make it clear it went unused
2191      // but also be obviously invalid.
2192      __ std(r_box, 0, r_box);
2193    }
2194#   endif // ASSERT
2195
2196    // Try fastpath for locking.
2197    // fast_lock kills r_temp_1, r_temp_2, r_temp_3.
2198    __ compiler_fast_lock_object(r_flag, r_oop, r_box, r_temp_1, r_temp_2, r_temp_3);
2199    __ beq(r_flag, locked);
2200
2201    // None of the above fast optimizations worked so we have to get into the
2202    // slow case of monitor enter. Inline a special case of call_VM that
2203    // disallows any pending_exception.
2204
2205    // Save argument registers and leave room for C-compatible ABI_REG_ARGS.
2206    int frame_size = frame::abi_reg_args_size +
2207                     round_to(total_c_args * wordSize, frame::alignment_in_bytes);
2208    __ mr(R11_scratch1, R1_SP);
2209    RegisterSaver::push_frame_and_save_argument_registers(masm, R12_scratch2, frame_size, total_c_args, out_regs, out_regs2);
2210
2211    // Do the call.
2212    __ set_last_Java_frame(R11_scratch1, r_return_pc);
2213    assert(r_return_pc->is_nonvolatile(), "expecting return pc to be in non-volatile register");
2214    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), r_oop, r_box, R16_thread);
2215    __ reset_last_Java_frame();
2216
2217    RegisterSaver::restore_argument_registers_and_pop_frame(masm, frame_size, total_c_args, out_regs, out_regs2);
2218
2219    __ asm_assert_mem8_is_zero(thread_(pending_exception),
2220       "no pending exception allowed on exit from SharedRuntime::complete_monitor_locking_C", 0);
2221
2222    __ bind(locked);
2223  }
2224
2225
2226  // Publish thread state
2227  // --------------------------------------------------------------------------
2228
2229  // Use that pc we placed in r_return_pc a while back as the current frame anchor.
2230  __ set_last_Java_frame(R1_SP, r_return_pc);
2231
2232  // Transition from _thread_in_Java to _thread_in_native.
2233  __ li(R0, _thread_in_native);
2234  __ release();
2235  // TODO: PPC port assert(4 == JavaThread::sz_thread_state(), "unexpected field size");
2236  __ stw(R0, thread_(thread_state));
2237  if (UseMembar) {
2238    __ fence();
2239  }
2240
2241
2242  // The JNI call
2243  // --------------------------------------------------------------------------
2244#if defined(ABI_ELFv2)
2245  __ call_c(native_func, relocInfo::runtime_call_type);
2246#else
2247  FunctionDescriptor* fd_native_method = (FunctionDescriptor*) native_func;
2248  __ call_c(fd_native_method, relocInfo::runtime_call_type);
2249#endif
2250
2251
2252  // Now, we are back from the native code.
2253
2254
2255  // Unpack the native result.
2256  // --------------------------------------------------------------------------
2257
2258  // For int-types, we do any needed sign-extension required.
2259  // Care must be taken that the return values (R3_RET and F1_RET)
2260  // will survive any VM calls for blocking or unlocking.
2261  // An OOP result (handle) is done specially in the slow-path code.
2262
2263  switch (ret_type) {
2264    case T_VOID:    break;        // Nothing to do!
2265    case T_FLOAT:   break;        // Got it where we want it (unless slow-path).
2266    case T_DOUBLE:  break;        // Got it where we want it (unless slow-path).
2267    case T_LONG:    break;        // Got it where we want it (unless slow-path).
2268    case T_OBJECT:  break;        // Really a handle.
2269                                  // Cannot de-handlize until after reclaiming jvm_lock.
2270    case T_ARRAY:   break;
2271
2272    case T_BOOLEAN: {             // 0 -> false(0); !0 -> true(1)
2273      Label skip_modify;
2274      __ cmpwi(CCR0, R3_RET, 0);
2275      __ beq(CCR0, skip_modify);
2276      __ li(R3_RET, 1);
2277      __ bind(skip_modify);
2278      break;
2279      }
2280    case T_BYTE: {                // sign extension
2281      __ extsb(R3_RET, R3_RET);
2282      break;
2283      }
2284    case T_CHAR: {                // unsigned result
2285      __ andi(R3_RET, R3_RET, 0xffff);
2286      break;
2287      }
2288    case T_SHORT: {               // sign extension
2289      __ extsh(R3_RET, R3_RET);
2290      break;
2291      }
2292    case T_INT:                   // nothing to do
2293      break;
2294    default:
2295      ShouldNotReachHere();
2296      break;
2297  }
2298
2299
2300  // Publish thread state
2301  // --------------------------------------------------------------------------
2302
2303  // Switch thread to "native transition" state before reading the
2304  // synchronization state. This additional state is necessary because reading
2305  // and testing the synchronization state is not atomic w.r.t. GC, as this
2306  // scenario demonstrates:
2307  //   - Java thread A, in _thread_in_native state, loads _not_synchronized
2308  //     and is preempted.
2309  //   - VM thread changes sync state to synchronizing and suspends threads
2310  //     for GC.
2311  //   - Thread A is resumed to finish this native method, but doesn't block
2312  //     here since it didn't see any synchronization in progress, and escapes.
2313
2314  // Transition from _thread_in_native to _thread_in_native_trans.
2315  __ li(R0, _thread_in_native_trans);
2316  __ release();
2317  // TODO: PPC port assert(4 == JavaThread::sz_thread_state(), "unexpected field size");
2318  __ stw(R0, thread_(thread_state));
2319
2320
2321  // Must we block?
2322  // --------------------------------------------------------------------------
2323
2324  // Block, if necessary, before resuming in _thread_in_Java state.
2325  // In order for GC to work, don't clear the last_Java_sp until after blocking.
2326  Label after_transition;
2327  {
2328    Label no_block, sync;
2329
2330    if (os::is_MP()) {
2331      if (UseMembar) {
2332        // Force this write out before the read below.
2333        __ fence();
2334      } else {
2335        // Write serialization page so VM thread can do a pseudo remote membar.
2336        // We use the current thread pointer to calculate a thread specific
2337        // offset to write to within the page. This minimizes bus traffic
2338        // due to cache line collision.
2339        __ serialize_memory(R16_thread, r_temp_4, r_temp_5);
2340      }
2341    }
2342
2343    Register sync_state_addr = r_temp_4;
2344    Register sync_state      = r_temp_5;
2345    Register suspend_flags   = r_temp_6;
2346
2347    __ load_const(sync_state_addr, SafepointSynchronize::address_of_state(), /*temp*/ sync_state);
2348
2349    // TODO: PPC port assert(4 == SafepointSynchronize::sz_state(), "unexpected field size");
2350    __ lwz(sync_state, 0, sync_state_addr);
2351
2352    // TODO: PPC port assert(4 == Thread::sz_suspend_flags(), "unexpected field size");
2353    __ lwz(suspend_flags, thread_(suspend_flags));
2354
2355    __ acquire();
2356
2357    Label do_safepoint;
2358    // No synchronization in progress nor yet synchronized.
2359    __ cmpwi(CCR0, sync_state, SafepointSynchronize::_not_synchronized);
2360    // Not suspended.
2361    __ cmpwi(CCR1, suspend_flags, 0);
2362
2363    __ bne(CCR0, sync);
2364    __ beq(CCR1, no_block);
2365
2366    // Block. Save any potential method result value before the operation and
2367    // use a leaf call to leave the last_Java_frame setup undisturbed. Doing this
2368    // lets us share the oopMap we used when we went native rather than create
2369    // a distinct one for this pc.
2370    __ bind(sync);
2371
2372    address entry_point = is_critical_native
2373      ? CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans_and_transition)
2374      : CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans);
2375    save_native_result(masm, ret_type, workspace_slot_offset);
2376    __ call_VM_leaf(entry_point, R16_thread);
2377    restore_native_result(masm, ret_type, workspace_slot_offset);
2378
2379    if (is_critical_native) {
2380      __ b(after_transition); // No thread state transition here.
2381    }
2382    __ bind(no_block);
2383  }
2384
2385  // Publish thread state.
2386  // --------------------------------------------------------------------------
2387
2388  // Thread state is thread_in_native_trans. Any safepoint blocking has
2389  // already happened so we can now change state to _thread_in_Java.
2390
2391  // Transition from _thread_in_native_trans to _thread_in_Java.
2392  __ li(R0, _thread_in_Java);
2393  __ release();
2394  // TODO: PPC port assert(4 == JavaThread::sz_thread_state(), "unexpected field size");
2395  __ stw(R0, thread_(thread_state));
2396  if (UseMembar) {
2397    __ fence();
2398  }
2399  __ bind(after_transition);
2400
2401  // Reguard any pages if necessary.
2402  // --------------------------------------------------------------------------
2403
2404  Label no_reguard;
2405  __ lwz(r_temp_1, thread_(stack_guard_state));
2406  __ cmpwi(CCR0, r_temp_1, JavaThread::stack_guard_yellow_reserved_disabled);
2407  __ bne(CCR0, no_reguard);
2408
2409  save_native_result(masm, ret_type, workspace_slot_offset);
2410  __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages));
2411  restore_native_result(masm, ret_type, workspace_slot_offset);
2412
2413  __ bind(no_reguard);
2414
2415
2416  // Unlock
2417  // --------------------------------------------------------------------------
2418
2419  if (method->is_synchronized()) {
2420
2421    ConditionRegister r_flag   = CCR1;
2422    const Register r_oop       = r_temp_4;
2423    const Register r_box       = r_temp_5;
2424    const Register r_exception = r_temp_6;
2425    Label done;
2426
2427    // Get oop and address of lock object box.
2428    if (method_is_static) {
2429      assert(klass_offset != -1, "");
2430      __ ld(r_oop, klass_offset, R1_SP);
2431    } else {
2432      assert(receiver_offset != -1, "");
2433      __ ld(r_oop, receiver_offset, R1_SP);
2434    }
2435    __ addi(r_box, R1_SP, lock_offset);
2436
2437    // Try fastpath for unlocking.
2438    __ compiler_fast_unlock_object(r_flag, r_oop, r_box, r_temp_1, r_temp_2, r_temp_3);
2439    __ beq(r_flag, done);
2440
2441    // Save and restore any potential method result value around the unlocking operation.
2442    save_native_result(masm, ret_type, workspace_slot_offset);
2443
2444    // Must save pending exception around the slow-path VM call. Since it's a
2445    // leaf call, the pending exception (if any) can be kept in a register.
2446    __ ld(r_exception, thread_(pending_exception));
2447    assert(r_exception->is_nonvolatile(), "exception register must be non-volatile");
2448    __ li(R0, 0);
2449    __ std(R0, thread_(pending_exception));
2450
2451    // Slow case of monitor exit.
2452    // Inline a special case of call_VM that disallows any pending_exception.
2453    // Arguments are (oop obj, BasicLock* lock, JavaThread* thread).
2454    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C), r_oop, r_box, R16_thread);
2455
2456    __ asm_assert_mem8_is_zero(thread_(pending_exception),
2457       "no pending exception allowed on exit from SharedRuntime::complete_monitor_unlocking_C", 0);
2458
2459    restore_native_result(masm, ret_type, workspace_slot_offset);
2460
2461    // Check_forward_pending_exception jumps to forward_exception if any pending
2462    // exception is set. The forward_exception routine expects to see the
2463    // exception in pending_exception and not in a register. Kind of clumsy,
2464    // since all folks who branch to forward_exception must have tested
2465    // pending_exception first and hence have it in a register already.
2466    __ std(r_exception, thread_(pending_exception));
2467
2468    __ bind(done);
2469  }
2470
2471# if 0
2472  // DTrace method exit
2473# endif
2474
2475  // Clear "last Java frame" SP and PC.
2476  // --------------------------------------------------------------------------
2477
2478  __ reset_last_Java_frame();
2479
2480  // Unbox oop result, e.g. JNIHandles::resolve value.
2481  // --------------------------------------------------------------------------
2482
2483  if (ret_type == T_OBJECT || ret_type == T_ARRAY) {
2484    __ resolve_jobject(R3_RET, r_temp_1, r_temp_2, /* needs_frame */ false); // kills R31
2485  }
2486
2487  if (CheckJNICalls) {
2488    // clear_pending_jni_exception_check
2489    __ load_const_optimized(R0, 0L);
2490    __ st_ptr(R0, JavaThread::pending_jni_exception_check_fn_offset(), R16_thread);
2491  }
2492
2493  // Reset handle block.
2494  // --------------------------------------------------------------------------
2495  if (!is_critical_native) {
2496    __ ld(r_temp_1, thread_(active_handles));
2497    // TODO: PPC port assert(4 == JNIHandleBlock::top_size_in_bytes(), "unexpected field size");
2498    __ li(r_temp_2, 0);
2499    __ stw(r_temp_2, JNIHandleBlock::top_offset_in_bytes(), r_temp_1);
2500
2501
2502    // Check for pending exceptions.
2503    // --------------------------------------------------------------------------
2504    __ ld(r_temp_2, thread_(pending_exception));
2505    __ cmpdi(CCR0, r_temp_2, 0);
2506    __ bne(CCR0, handle_pending_exception);
2507  }
2508
2509  // Return
2510  // --------------------------------------------------------------------------
2511
2512  __ pop_frame();
2513  __ restore_LR_CR(R11);
2514  __ blr();
2515
2516
2517  // Handler for pending exceptions (out-of-line).
2518  // --------------------------------------------------------------------------
2519
2520  // Since this is a native call, we know the proper exception handler
2521  // is the empty function. We just pop this frame and then jump to
2522  // forward_exception_entry.
2523  if (!is_critical_native) {
2524    __ align(InteriorEntryAlignment);
2525    __ bind(handle_pending_exception);
2526
2527    __ pop_frame();
2528    __ restore_LR_CR(R11);
2529    __ b64_patchable((address)StubRoutines::forward_exception_entry(),
2530                     relocInfo::runtime_call_type);
2531  }
2532
2533  // Handler for a cache miss (out-of-line).
2534  // --------------------------------------------------------------------------
2535
2536  if (!method_is_static) {
2537    __ align(InteriorEntryAlignment);
2538    __ bind(ic_miss);
2539
2540    __ b64_patchable((address)SharedRuntime::get_ic_miss_stub(),
2541                     relocInfo::runtime_call_type);
2542  }
2543
2544  // Done.
2545  // --------------------------------------------------------------------------
2546
2547  __ flush();
2548
2549  nmethod *nm = nmethod::new_native_nmethod(method,
2550                                            compile_id,
2551                                            masm->code(),
2552                                            vep_start_pc-start_pc,
2553                                            frame_done_pc-start_pc,
2554                                            stack_slots / VMRegImpl::slots_per_word,
2555                                            (method_is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2556                                            in_ByteSize(lock_offset),
2557                                            oop_maps);
2558
2559  if (is_critical_native) {
2560    nm->set_lazy_critical_native(true);
2561  }
2562
2563  return nm;
2564#else
2565  ShouldNotReachHere();
2566  return NULL;
2567#endif // COMPILER2
2568}
2569
2570// This function returns the adjustment (in number of words) to apply to a c2i
2571// adapter activation for use during deoptimization.
2572int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals) {
2573  return round_to((callee_locals - callee_parameters) * Interpreter::stackElementWords, frame::alignment_in_bytes);
2574}
2575
2576uint SharedRuntime::out_preserve_stack_slots() {
2577#if defined(COMPILER1) || defined(COMPILER2)
2578  return frame::jit_out_preserve_size / VMRegImpl::stack_slot_size;
2579#else
2580  return 0;
2581#endif
2582}
2583
2584#if defined(COMPILER1) || defined(COMPILER2)
2585// Frame generation for deopt and uncommon trap blobs.
2586static void push_skeleton_frame(MacroAssembler* masm, bool deopt,
2587                                /* Read */
2588                                Register unroll_block_reg,
2589                                /* Update */
2590                                Register frame_sizes_reg,
2591                                Register number_of_frames_reg,
2592                                Register pcs_reg,
2593                                /* Invalidate */
2594                                Register frame_size_reg,
2595                                Register pc_reg) {
2596
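  // Consume one (pc, frame size) pair from the UnrollBlock arrays: store the pc as
  // the return address of the current top frame, push the new skeleton frame, then
  // decrement the frame count and advance the frame-sizes and pcs cursors.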
2597  __ ld(pc_reg, 0, pcs_reg);
2598  __ ld(frame_size_reg, 0, frame_sizes_reg);
2599  __ std(pc_reg, _abi(lr), R1_SP);
2600  __ push_frame(frame_size_reg, R0/*tmp*/);
2601#ifdef ASSERT
2602  __ load_const_optimized(pc_reg, 0x5afe);
2603  __ std(pc_reg, _ijava_state_neg(ijava_reserved), R1_SP);
2604#endif
2605  __ std(R1_SP, _ijava_state_neg(sender_sp), R1_SP);
2606  __ addi(number_of_frames_reg, number_of_frames_reg, -1);
2607  __ addi(frame_sizes_reg, frame_sizes_reg, wordSize);
2608  __ addi(pcs_reg, pcs_reg, wordSize);
2609}
2610
2611// Loop through the UnrollBlock info and create new frames.
2612static void push_skeleton_frames(MacroAssembler* masm, bool deopt,
2613                                 /* read */
2614                                 Register unroll_block_reg,
2615                                 /* invalidate */
2616                                 Register frame_sizes_reg,
2617                                 Register number_of_frames_reg,
2618                                 Register pcs_reg,
2619                                 Register frame_size_reg,
2620                                 Register pc_reg) {
2621  Label loop;
2622
2623  // _number_of_frames is of type int (deoptimization.hpp)
2624  __ lwa(number_of_frames_reg,
2625             Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes(),
2626             unroll_block_reg);
2627  __ ld(pcs_reg,
2628            Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes(),
2629            unroll_block_reg);
2630  __ ld(frame_sizes_reg,
2631            Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes(),
2632            unroll_block_reg);
2633
2634  // stack: (caller_of_deoptee, ...).
2635
2636  // At this point we either have an interpreter frame or a compiled
2637  // frame on top of stack. If it is a compiled frame we push a new c2i
2638  // adapter here.
2639
2640  // Memorize top-frame stack-pointer.
2641  __ mr(frame_size_reg/*old_sp*/, R1_SP);
2642
2643  // Resize interpreter top frame OR C2I adapter.
2644
2645  // At this moment, the top frame (which is the caller of the deoptee) is
2646  // an interpreter frame or a newly pushed C2I adapter or an entry frame.
2647  // The top frame has a TOP_IJAVA_FRAME_ABI and the frame contains the
2648  // outgoing arguments.
2649  //
2650  // In order to push the interpreter frame for the deoptee, we need to
2651  // resize the top frame such that we are able to place the deoptee's
2652  // locals in the frame.
2653  // Additionally, we have to turn the top frame's TOP_IJAVA_FRAME_ABI
2654  // into a valid PARENT_IJAVA_FRAME_ABI.
2655
2656  __ lwa(R11_scratch1,
2657             Deoptimization::UnrollBlock::caller_adjustment_offset_in_bytes(),
2658             unroll_block_reg);
2659  __ neg(R11_scratch1, R11_scratch1);
2660
2661  // R11_scratch1 contains size of locals for frame resizing.
2662  // R12_scratch2 contains top frame's lr.
2663
2664  // Resizing the frame by the complete frame size prevents the TOC from being
2665  // overwritten by locals. A more stack-space-saving way would be
2666  // to copy the TOC to its location in the new ABI.
2667  __ addi(R11_scratch1, R11_scratch1, - frame::parent_ijava_frame_abi_size);
2668
2669  // now, resize the frame
2670  __ resize_frame(R11_scratch1, pc_reg/*tmp*/);
2671
2672  // In the case where we have resized a c2i frame above, the optional
2673  // alignment below the locals has size 32 (why?).
2674  __ std(R12_scratch2, _abi(lr), R1_SP);
2675
2676  // Initialize initial_caller_sp.
2677#ifdef ASSERT
2678  __ load_const_optimized(pc_reg, 0x5afe);
2679  __ std(pc_reg, _ijava_state_neg(ijava_reserved), R1_SP);
2680#endif
2681  __ std(frame_size_reg, _ijava_state_neg(sender_sp), R1_SP);
2682
2683#ifdef ASSERT
2684  // Make sure that there is at least one entry in the array.
2685  __ cmpdi(CCR0, number_of_frames_reg, 0);
2686  __ asm_assert_ne("array_size must be > 0", 0x205);
2687#endif
2688
2689  // Now push the new interpreter frames.
2690  //
2691  __ bind(loop);
2692  // Allocate a new frame, fill in the pc.
2693  push_skeleton_frame(masm, deopt,
2694                      unroll_block_reg,
2695                      frame_sizes_reg,
2696                      number_of_frames_reg,
2697                      pcs_reg,
2698                      frame_size_reg,
2699                      pc_reg);
2700  __ cmpdi(CCR0, number_of_frames_reg, 0);
2701  __ bne(CCR0, loop);
2702
2703  // Get the return address pointing into the frame manager.
2704  __ ld(R0, 0, pcs_reg);
2705  // Store it in the top interpreter frame.
2706  __ std(R0, _abi(lr), R1_SP);
2707  // Initialize frame_manager_lr of interpreter top frame.
2708}
2709#endif
2710
2711void SharedRuntime::generate_deopt_blob() {
2712  // Allocate space for the code
2713  ResourceMark rm;
2714  // Setup code generation tools
2715  CodeBuffer buffer("deopt_blob", 2048, 1024);
2716  InterpreterMacroAssembler* masm = new InterpreterMacroAssembler(&buffer);
2717  Label exec_mode_initialized;
2718  int frame_size_in_words;
2719  OopMap* map = NULL;
2720  OopMapSet *oop_maps = new OopMapSet();
2721
2722  // size of ABI112 plus spill slots for R3_RET and F1_RET.
2723  const int frame_size_in_bytes = frame::abi_reg_args_spill_size;
2724  const int frame_size_in_slots = frame_size_in_bytes / sizeof(jint);
2725  int first_frame_size_in_bytes = 0; // frame size of "unpack frame" for call to fetch_unroll_info.
2726
2727  const Register exec_mode_reg = R21_tmp1;
2728
2729  const address start = __ pc();
2730
2731#if defined(COMPILER1) || defined(COMPILER2)
2732  // --------------------------------------------------------------------------
2733  // Prolog for non exception case!

  // We have been called from the deopt handler of the deoptee.
  //
  // deoptee:
  //                      ...
  //                      call X
  //                      ...
  //  deopt_handler:      call_deopt_stub
  //  cur. return pc  --> ...
  //
  // So currently SR_LR points behind the call in the deopt handler.
  // We adjust it such that it points to the start of the deopt handler.
  // The return_pc has been stored in the frame of the deoptee and
  // will replace the address of the deopt_handler in the call
  // to Deoptimization::fetch_unroll_info below.
  // We can't grab a free register here, because all registers may
  // contain live values, so let the RegisterSaver do the adjustment
  // of the return pc.
  const int return_pc_adjustment_no_exception = -HandlerImpl::size_deopt_handler();

  // Push the "unpack frame"
  // Save everything in sight.
  map = RegisterSaver::push_frame_reg_args_and_save_live_registers(masm,
                                                                   &first_frame_size_in_bytes,
                                                                   /*generate_oop_map=*/ true,
                                                                   return_pc_adjustment_no_exception,
                                                                   RegisterSaver::return_pc_is_lr);
  assert(map != NULL, "OopMap must have been created");

  __ li(exec_mode_reg, Deoptimization::Unpack_deopt);
  // Save exec mode for unpack_frames.
  __ b(exec_mode_initialized);

  // --------------------------------------------------------------------------
  // Prolog for exception case

  // An exception is pending.
  // We have been called with a return (interpreter) or a jump (exception blob).
  //
  // - R3_ARG1: exception oop
  // - R4_ARG2: exception pc

  int exception_offset = __ pc() - start;

  BLOCK_COMMENT("Prolog for exception case");

  // Store exception oop and pc in thread (location known to GC).
  // This is needed since the call to "fetch_unroll_info()" may safepoint.
  __ std(R3_ARG1, in_bytes(JavaThread::exception_oop_offset()), R16_thread);
  __ std(R4_ARG2, in_bytes(JavaThread::exception_pc_offset()),  R16_thread);
  __ std(R4_ARG2, _abi(lr), R1_SP);

  // Vanilla deoptimization with an exception pending in exception_oop.
  int exception_in_tls_offset = __ pc() - start;

  // Push the "unpack frame".
  // Save everything in sight.
  RegisterSaver::push_frame_reg_args_and_save_live_registers(masm,
                                                             &first_frame_size_in_bytes,
                                                             /*generate_oop_map=*/ false,
                                                             /*return_pc_adjustment_exception=*/ 0,
                                                             RegisterSaver::return_pc_is_pre_saved);

  // Deopt during an exception. Save exec mode for unpack_frames.
  __ li(exec_mode_reg, Deoptimization::Unpack_exception);

  // fall through

  int reexecute_offset = 0;
#ifdef COMPILER1
  __ b(exec_mode_initialized);

  // Reexecute entry, similar to c2 uncommon trap
  reexecute_offset = __ pc() - start;

  RegisterSaver::push_frame_reg_args_and_save_live_registers(masm,
                                                             &first_frame_size_in_bytes,
                                                             /*generate_oop_map=*/ false,
                                                             /*return_pc_adjustment_reexecute=*/ 0,
                                                             RegisterSaver::return_pc_is_pre_saved);
  __ li(exec_mode_reg, Deoptimization::Unpack_reexecute);
#endif

  // --------------------------------------------------------------------------
  __ BIND(exec_mode_initialized);

  {
  const Register unroll_block_reg = R22_tmp2;

  // We need to set `last_Java_frame' because `fetch_unroll_info' will
  // call `last_Java_frame()'. The value of the pc in the frame is not
  // particularly important. It just needs to identify this blob.
  __ set_last_Java_frame(R1_SP, noreg);

  // With EscapeAnalysis turned on, this call may safepoint!
  __ call_VM_leaf(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info), R16_thread, exec_mode_reg);
  address calls_return_pc = __ last_calls_return_pc();
  // Set an oopmap for the call site that describes all our saved registers.
  oop_maps->add_gc_map(calls_return_pc - start, map);

  __ reset_last_Java_frame();
  // Save the return value.
  __ mr(unroll_block_reg, R3_RET);

  // Restore only the result registers that have been saved
  // by save_volatile_registers(...).
  RegisterSaver::restore_result_registers(masm, first_frame_size_in_bytes);

  // reload the exec mode from the UnrollBlock (it might have changed)
  __ lwz(exec_mode_reg, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes(), unroll_block_reg);
  // In excp_deopt_mode, restore and clear exception oop which we
  // stored in the thread during exception entry above. The exception
  // oop will be the return value of this stub.
  Label skip_restore_excp;
  __ cmpdi(CCR0, exec_mode_reg, Deoptimization::Unpack_exception);
  __ bne(CCR0, skip_restore_excp);
  __ ld(R3_RET, in_bytes(JavaThread::exception_oop_offset()), R16_thread);
  __ ld(R4_ARG2, in_bytes(JavaThread::exception_pc_offset()), R16_thread);
  __ li(R0, 0);
  __ std(R0, in_bytes(JavaThread::exception_pc_offset()),  R16_thread);
  __ std(R0, in_bytes(JavaThread::exception_oop_offset()), R16_thread);
  __ BIND(skip_restore_excp);

  __ pop_frame();

  // stack: (deoptee, optional i2c, caller of deoptee, ...).

  // pop the deoptee's frame
  __ pop_frame();

  // stack: (caller_of_deoptee, ...).

  // Loop through the `UnrollBlock' info and create interpreter frames.
  push_skeleton_frames(masm, true/*deopt*/,
                       unroll_block_reg,
                       R23_tmp3,
                       R24_tmp4,
                       R25_tmp5,
                       R26_tmp6,
                       R27_tmp7);

  // stack: (skeletal interpreter frame, ..., optional skeletal
  // interpreter frame, optional c2i, caller of deoptee, ...).
  }

  // push an `unpack_frame' taking care of float / int return values.
  __ push_frame(frame_size_in_bytes, R0/*tmp*/);

  // stack: (unpack frame, skeletal interpreter frame, ..., optional
  // skeletal interpreter frame, optional c2i, caller of deoptee,
  // ...).

  // Spill live volatile registers since we'll do a call.
  __ std( R3_RET, _abi_reg_args_spill(spill_ret),  R1_SP);
  __ stfd(F1_RET, _abi_reg_args_spill(spill_fret), R1_SP);

  // Let the unpacker lay out information in the skeletal frames just
  // allocated.
  __ get_PC_trash_LR(R3_RET);
  __ set_last_Java_frame(/*sp*/R1_SP, /*pc*/R3_RET);
  // This is a call to a LEAF method, so no oop map is required.
  __ call_VM_leaf(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames),
                  R16_thread/*thread*/, exec_mode_reg/*exec_mode*/);
  __ reset_last_Java_frame();

  // Restore the volatiles saved above.
  __ ld( R3_RET, _abi_reg_args_spill(spill_ret),  R1_SP);
  __ lfd(F1_RET, _abi_reg_args_spill(spill_fret), R1_SP);

  // Pop the unpack frame.
  __ pop_frame();
  __ restore_LR_CR(R0);

  // stack: (top interpreter frame, ..., optional interpreter frame,
  // optional c2i, caller of deoptee, ...).

  // Initialize R14_state.
  __ restore_interpreter_state(R11_scratch1);
  __ load_const_optimized(R25_templateTableBase, (address)Interpreter::dispatch_table((TosState)0), R11_scratch1);

  // Return to the interpreter entry point.
  __ blr();
  __ flush();
#else // COMPILER1 || COMPILER2
  __ unimplemented("deopt blob needed only with compiler");
  int exception_offset = __ pc() - start;
#endif // COMPILER1 || COMPILER2

  _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset,
                                           reexecute_offset, first_frame_size_in_bytes / wordSize);
  _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
}

#ifdef COMPILER2
void SharedRuntime::generate_uncommon_trap_blob() {
  // Allocate space for the code.
  ResourceMark rm;
  // Setup code generation tools.
  CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
  InterpreterMacroAssembler* masm = new InterpreterMacroAssembler(&buffer);
  address start = __ pc();

  Register unroll_block_reg = R21_tmp1;
  Register klass_index_reg  = R22_tmp2;
  Register unc_trap_reg     = R23_tmp3;

  OopMapSet* oop_maps = new OopMapSet();
  int frame_size_in_bytes = frame::abi_reg_args_size;
  OopMap* map = new OopMap(frame_size_in_bytes / sizeof(jint), 0);

  // stack: (deoptee, optional i2c, caller_of_deoptee, ...).

  // Push a dummy `unpack_frame' and call
  // `Deoptimization::uncommon_trap' to pack the compiled frame into a
  // vframe array and return the `UnrollBlock' information.

  // Save LR to compiled frame.
  __ save_LR_CR(R11_scratch1);

  // Push an "uncommon_trap" frame.
  __ push_frame_reg_args(0, R11_scratch1);

  // stack: (unpack frame, deoptee, optional i2c, caller_of_deoptee, ...).

  // Set the `unpack_frame' as last_Java_frame.
  // `Deoptimization::uncommon_trap' expects it and considers its
  // sender frame as the deoptee frame.
  // Remember the offset of the instruction whose address will be
  // moved to R11_scratch1.
  address gc_map_pc = __ get_PC_trash_LR(R11_scratch1);

  __ set_last_Java_frame(/*sp*/R1_SP, /*pc*/R11_scratch1);

  __ mr(klass_index_reg, R3);
  __ li(R5_ARG3, Deoptimization::Unpack_uncommon_trap);
  __ call_VM_leaf(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap),
                  R16_thread, klass_index_reg, R5_ARG3);

  // Set an oopmap for the call site.
  oop_maps->add_gc_map(gc_map_pc - start, map);

  __ reset_last_Java_frame();

  // Pop the `unpack frame'.
  __ pop_frame();

  // stack: (deoptee, optional i2c, caller_of_deoptee, ...).

  // Save the return value.
  __ mr(unroll_block_reg, R3_RET);

  // Pop the uncommon_trap frame.
  __ pop_frame();

  // stack: (caller_of_deoptee, ...).

#ifdef ASSERT
  __ lwz(R22_tmp2, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes(), unroll_block_reg);
  __ cmpdi(CCR0, R22_tmp2, (unsigned)Deoptimization::Unpack_uncommon_trap);
  __ asm_assert_eq("SharedRuntime::generate_uncommon_trap_blob: expected Unpack_uncommon_trap", 0);
#endif

  // Allocate new interpreter frame(s) and possibly a c2i adapter
  // frame.
  push_skeleton_frames(masm, false/*deopt*/,
                       unroll_block_reg,
                       R22_tmp2,
                       R23_tmp3,
                       R24_tmp4,
                       R25_tmp5,
                       R26_tmp6);

  // stack: (skeletal interpreter frame, ..., optional skeletal
  // interpreter frame, optional c2i, caller of deoptee, ...).

  // Push a dummy `unpack_frame' taking care of float return values.
  // Call `Deoptimization::unpack_frames' to lay out information in the
  // interpreter frames just created.

  // Push a simple "unpack frame" here.
  __ push_frame_reg_args(0, R11_scratch1);

  // stack: (unpack frame, skeletal interpreter frame, ..., optional
  // skeletal interpreter frame, optional c2i, caller of deoptee,
  // ...).

  // Set the "unpack_frame" as last_Java_frame.
  __ get_PC_trash_LR(R11_scratch1);
  __ set_last_Java_frame(/*sp*/R1_SP, /*pc*/R11_scratch1);

  // Indicate it is the uncommon trap case.
  __ li(unc_trap_reg, Deoptimization::Unpack_uncommon_trap);
  // Let the unpacker lay out information in the skeletal frames just
  // allocated.
  __ call_VM_leaf(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames),
                  R16_thread, unc_trap_reg);

  __ reset_last_Java_frame();
  // Pop the `unpack frame'.
  __ pop_frame();
  // Restore LR from top interpreter frame.
  __ restore_LR_CR(R11_scratch1);

  // stack: (top interpreter frame, ..., optional interpreter frame,
  // optional c2i, caller of deoptee, ...).

  __ restore_interpreter_state(R11_scratch1);
  __ load_const_optimized(R25_templateTableBase, (address)Interpreter::dispatch_table((TosState)0), R11_scratch1);

  // Return to the interpreter entry point.
  __ blr();

  masm->flush();

  _uncommon_trap_blob = UncommonTrapBlob::create(&buffer, oop_maps, frame_size_in_bytes/wordSize);
}
#endif // COMPILER2

// Generate a special Compile2Runtime blob that saves all registers and sets up the oopmap.
SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
  assert(StubRoutines::forward_exception_entry() != NULL,
         "must be generated before");

  ResourceMark rm;
  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map;

  // Allocate space for the code. Setup code generation tools.
  CodeBuffer buffer("handler_blob", 2048, 1024);
  MacroAssembler* masm = new MacroAssembler(&buffer);

  address start = __ pc();
  int frame_size_in_bytes = 0;

  RegisterSaver::ReturnPCLocation return_pc_location;
  bool cause_return = (poll_type == POLL_AT_RETURN);
  if (cause_return) {
    // Nothing to do here. The frame has already been popped in MachEpilogNode.
    // Register LR already contains the return pc.
    return_pc_location = RegisterSaver::return_pc_is_lr;
  } else {
    // Use thread()->saved_exception_pc() as return pc.
    return_pc_location = RegisterSaver::return_pc_is_thread_saved_exception_pc;
  }

  // Save registers, fpu state, and flags.
  map = RegisterSaver::push_frame_reg_args_and_save_live_registers(masm,
                                                                   &frame_size_in_bytes,
                                                                   /*generate_oop_map=*/ true,
                                                                   /*return_pc_adjustment=*/0,
                                                                   return_pc_location);

  // The following is basically a call_VM. However, we need the precise
  // address of the call in order to generate an oopmap. Hence, we do all the
  // work ourselves.
  __ set_last_Java_frame(/*sp=*/R1_SP, /*pc=*/noreg);

  // The return address must always be correct so that the frame constructor
  // never sees an invalid pc.

  // Do the call
  __ call_VM_leaf(call_ptr, R16_thread);
  address calls_return_pc = __ last_calls_return_pc();

  // Set an oopmap for the call site. This oopmap will map all
  // oop-registers and debug-info registers as callee-saved. This
  // will allow deoptimization at this safepoint to find all possible
  // debug-info recordings, as well as let GC find all oops.
  oop_maps->add_gc_map(calls_return_pc - start, map);

  Label noException;

  // Clear the last Java frame.
  __ reset_last_Java_frame();

  BLOCK_COMMENT("  Check pending exception.");
  const Register pending_exception = R0;
  __ ld(pending_exception, thread_(pending_exception));
  __ cmpdi(CCR0, pending_exception, 0);
  __ beq(CCR0, noException);

  // Exception pending
  RegisterSaver::restore_live_registers_and_pop_frame(masm,
                                                      frame_size_in_bytes,
                                                      /*restore_ctr=*/true);

  BLOCK_COMMENT("  Jump to forward_exception_entry.");
  // Jump to forward_exception_entry, with the issuing PC in LR
  // so it looks like the original nmethod called forward_exception_entry.
  __ b64_patchable(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);

  // No exception case.
  __ BIND(noException);


  // Normal exit, restore registers and exit.
  RegisterSaver::restore_live_registers_and_pop_frame(masm,
                                                      frame_size_in_bytes,
                                                      /*restore_ctr=*/true);

  __ blr();

  // Make sure all code is generated
  masm->flush();

  // Fill-out other meta info
  // CodeBlob frame size is in words.
  return SafepointBlob::create(&buffer, oop_maps, frame_size_in_bytes / wordSize);
}

// generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
//
// Generate a stub that calls into the vm to find out the proper destination
// of a java call. All the argument registers are live at this point
// but since this is generic code we don't know what they are and the caller
// must do any gc of the args.
//
RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {

  // allocate space for the code
  ResourceMark rm;

  CodeBuffer buffer(name, 1000, 512);
  MacroAssembler* masm = new MacroAssembler(&buffer);

  int frame_size_in_bytes;

  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = NULL;

  address start = __ pc();

  map = RegisterSaver::push_frame_reg_args_and_save_live_registers(masm,
                                                                   &frame_size_in_bytes,
                                                                   /*generate_oop_map*/ true,
                                                                   /*return_pc_adjustment*/ 0,
                                                                   RegisterSaver::return_pc_is_lr);

  // Use noreg as last_Java_pc, the return pc will be reconstructed
  // from the physical frame.
  __ set_last_Java_frame(/*sp*/R1_SP, noreg);

  int frame_complete = __ offset();

  // Pass R19_method as 2nd (optional) argument, used by
  // counter_overflow_stub.
  __ call_VM_leaf(destination, R16_thread, R19_method);
  address calls_return_pc = __ last_calls_return_pc();
  // Set an oopmap for the call site.
  // We need this not only for callee-saved registers, but also for volatile
  // registers that the compiler might be keeping live across a safepoint.
  // Create the oopmap for the call's return pc.
  oop_maps->add_gc_map(calls_return_pc - start, map);

  // R3_RET contains the address we are going to jump to assuming no exception got installed.

  // clear last_Java_sp
  __ reset_last_Java_frame();

  // Check for pending exceptions.
  BLOCK_COMMENT("Check for pending exceptions.");
  Label pending;
  __ ld(R11_scratch1, thread_(pending_exception));
  __ cmpdi(CCR0, R11_scratch1, 0);
  __ bne(CCR0, pending);

  __ mtctr(R3_RET); // Ctr will not be touched by restore_live_registers_and_pop_frame.

  RegisterSaver::restore_live_registers_and_pop_frame(masm, frame_size_in_bytes, /*restore_ctr*/ false);

  // Get the returned method.
  __ get_vm_result_2(R19_method);

  __ bctr();


  // Pending exception after the safepoint.
  __ BIND(pending);

  RegisterSaver::restore_live_registers_and_pop_frame(masm, frame_size_in_bytes, /*restore_ctr*/ true);

  // exception pending => remove activation and forward to exception handler

  __ li(R11_scratch1, 0);
  __ ld(R3_ARG1, thread_(pending_exception));
  __ std(R11_scratch1, in_bytes(JavaThread::vm_result_offset()), R16_thread);
  __ b64_patchable(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);

  // -------------
  // Make sure all code is generated.
  masm->flush();

  // Return the blob.
  // CodeBlob frame size is in words.
  return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_bytes/wordSize,
                                       oop_maps, true);
}


//------------------------------Montgomery multiplication------------------------
//

// Subtract 0:b from carry:a. Return carry.
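// That is: a and b are little-endian, len-word integers; the routine computes
// (carry:a) - (0:b) in place, writing the low len words back into a and
// returning the resulting high word. The borrow runs through the CA bit
// (subfc pre-sets it, each subfe consumes and regenerates it) and the final
// addme folds the last borrow into the returned carry word.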
static unsigned long
sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) {
  long i = 0;
  unsigned long tmp, tmp2;
  __asm__ __volatile__ (
    "subfc  %[tmp], %[tmp], %[tmp]   \n" // pre-set CA
    "mtctr  %[len]                   \n"
    "0:                              \n"
    "ldx    %[tmp], %[i], %[a]       \n"
    "ldx    %[tmp2], %[i], %[b]      \n"
    "subfe  %[tmp], %[tmp2], %[tmp]  \n" // subtract extended
    "stdx   %[tmp], %[i], %[a]       \n"
    "addi   %[i], %[i], 8            \n"
    "bdnz   0b                       \n"
    "addme  %[tmp], %[carry]         \n" // carry + CA - 1
    : [i]"+b"(i), [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2)
    : [a]"r"(a), [b]"r"(b), [carry]"r"(carry), [len]"r"(len)
    : "ctr", "xer", "memory"
  );
  return tmp;
}

// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
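// Conceptually this is a 192-bit accumulation (illustrative sketch only):
//   (T2:T1:T0) += (unsigned __int128)A * B;
// mulld/mulhdu produce the 128-bit product, addc/adde/addze propagate the
// carries through T0, T1 and T2.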
inline void MACC(unsigned long A, unsigned long B, unsigned long &T0, unsigned long &T1, unsigned long &T2) {
  unsigned long hi, lo;
  __asm__ __volatile__ (
    "mulld  %[lo], %[A], %[B]    \n"
    "mulhdu %[hi], %[A], %[B]    \n"
    "addc   %[T0], %[T0], %[lo]  \n"
    "adde   %[T1], %[T1], %[hi]  \n"
    "addze  %[T2], %[T2]         \n"
    : [hi]"=&r"(hi), [lo]"=&r"(lo), [T0]"+r"(T0), [T1]"+r"(T1), [T2]"+r"(T2)
    : [A]"r"(A), [B]"r"(B)
    : "xer"
  );
}

// As above, but add twice the double-length result into the
// accumulator.
inline void MACC2(unsigned long A, unsigned long B, unsigned long &T0, unsigned long &T1, unsigned long &T2) {
  unsigned long hi, lo;
  __asm__ __volatile__ (
    "mulld  %[lo], %[A], %[B]    \n"
    "mulhdu %[hi], %[A], %[B]    \n"
    "addc   %[T0], %[T0], %[lo]  \n"
    "adde   %[T1], %[T1], %[hi]  \n"
    "addze  %[T2], %[T2]         \n"
    "addc   %[T0], %[T0], %[lo]  \n"
    "adde   %[T1], %[T1], %[hi]  \n"
    "addze  %[T2], %[T2]         \n"
    : [hi]"=&r"(hi), [lo]"=&r"(lo), [T0]"+r"(T0), [T1]"+r"(T1), [T2]"+r"(T2)
    : [A]"r"(A), [B]"r"(B)
    : "xer"
  );
}

// Fast Montgomery multiplication. The derivation of the algorithm is
// in "A Cryptographic Library for the Motorola DSP56000,
// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237".
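// Sketch of the idea (see the reference above for the derivation): with
// R = 2^(64*len) and inv == -n[0]^-1 mod 2^64 (checked by the assert below),
// the routine computes a Montgomery product congruent to a*b*R^-1 (mod n).
// In outer iteration i the accumulator t2:t1:t0 holds column i of a*b plus
// m*n; choosing m[i] = t0 * inv makes the low 64 bits of that column zero,
// so the accumulator can be shifted right by one word per iteration, which
// performs the division by R for free. The trailing loop subtracts n while
// a carry word remains.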
static void
montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[],
                    unsigned long m[], unsigned long inv, int len) {
  unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  int i;

  assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");

  for (i = 0; i < len; i++) {
    int j;
    for (j = 0; j < i; j++) {
      MACC(a[j], b[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    MACC(a[i], b[0], t0, t1, t2);
    m[i] = t0 * inv;
    MACC(m[i], n[0], t0, t1, t2);

    assert(t0 == 0, "broken Montgomery multiply");

    t0 = t1; t1 = t2; t2 = 0;
  }

  for (i = len; i < 2*len; i++) {
    int j;
    for (j = i-len+1; j < len; j++) {
      MACC(a[j], b[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i-len] = t0;
    t0 = t1; t1 = t2; t2 = 0;
  }

  while (t0) {
    t0 = sub(m, n, t0, len);
  }
}

// Fast Montgomery squaring. This uses asymptotically 25% fewer
// multiplies so it should be up to 25% faster than Montgomery
// multiplication. However, its loop control is more complex and it
// may actually run slower on some machines.
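// The saving comes from symmetry: in a square, the cross terms a[j]*a[i-j]
// with j != i-j occur twice per column, so they are computed once and added
// twice via MACC2; only the diagonal terms a[j]*a[j] (even columns) need a
// single MACC. The reduction by n is the same as in montgomery_multiply.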
static void
montgomery_square(unsigned long a[], unsigned long n[],
                  unsigned long m[], unsigned long inv, int len) {
  unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  int i;

  assert(inv * n[0] == -1UL, "broken inverse in Montgomery square");

  for (i = 0; i < len; i++) {
    int j;
    int end = (i+1)/2;
    for (j = 0; j < end; j++) {
      MACC2(a[j], a[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    if ((i & 1) == 0) {
      MACC(a[j], a[j], t0, t1, t2);
    }
    for (; j < i; j++) {
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i] = t0 * inv;
    MACC(m[i], n[0], t0, t1, t2);

    assert(t0 == 0, "broken Montgomery square");

    t0 = t1; t1 = t2; t2 = 0;
  }

  for (i = len; i < 2*len; i++) {
    int start = i-len+1;
    int end = start + (len - start)/2;
    int j;
    for (j = start; j < end; j++) {
      MACC2(a[j], a[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    if ((i & 1) == 0) {
      MACC(a[j], a[j], t0, t1, t2);
    }
    for (; j < len; j++) {
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i-len] = t0;
    t0 = t1; t1 = t2; t2 = 0;
  }

  while (t0) {
    t0 = sub(m, n, t0, len);
  }
}

// The threshold at which squaring is advantageous was determined
// experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
// Doesn't seem to be relevant for Power8 so we use the same value.
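// The threshold is measured in ints (the len argument of the entry points
// below), i.e. operands of 64 ints (2048 bits) and above use the squaring code.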
#define MONTGOMERY_SQUARING_THRESHOLD 64

// Copy len longwords from s to d, word-swapping as we go. The
// destination array is reversed.
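// For example, with len == 2 and s == {s0, s1} the result is
// d == {swap(s1), swap(s0)}, where swap exchanges the two 32-bit halves of a
// longword (little-endian builds only).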
static void reverse_words(unsigned long *s, unsigned long *d, int len) {
  d += len;
  while(len-- > 0) {
    d--;
    unsigned long s_val = *s;
    // Swap words in a longword on little endian machines.
#ifdef VM_LITTLE_ENDIAN
    s_val = (s_val << 32) | (s_val >> 32);
#endif
    *d = s_val;
    s++;
  }
}

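// The SharedRuntime entry points below receive the operands as jint arrays
// with the most significant word first (BigInteger magnitude order), while
// the helpers above expect little-endian arrays of 64-bit longwords. Each
// pair of jints is therefore viewed as one longword, and reverse_words
// converts between the two layouts on entry and on exit.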
void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
                                        jint len, jlong inv,
                                        jint *m_ints) {
  len = len & 0x7fffFFFF; // C2 does not respect int to long conversion for stub calls.
  assert(len % 2 == 0, "array length in montgomery_multiply must be even");
  int longwords = len/2;

  // Make very sure we don't use so much space that the stack might
  // overflow. 512 jints correspond to a 16384-bit integer and will use
  // a total of 8k bytes of stack space here.
  int total_allocation = longwords * sizeof (unsigned long) * 4;
  guarantee(total_allocation <= 8192, "must be");
  unsigned long *scratch = (unsigned long *)alloca(total_allocation);

  // Local scratch arrays
  unsigned long
    *a = scratch + 0 * longwords,
    *b = scratch + 1 * longwords,
    *n = scratch + 2 * longwords,
    *m = scratch + 3 * longwords;

  reverse_words((unsigned long *)a_ints, a, longwords);
  reverse_words((unsigned long *)b_ints, b, longwords);
  reverse_words((unsigned long *)n_ints, n, longwords);

  ::montgomery_multiply(a, b, n, m, (unsigned long)inv, longwords);

  reverse_words(m, (unsigned long *)m_ints, longwords);
}

void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
                                      jint len, jlong inv,
                                      jint *m_ints) {
  len = len & 0x7fffFFFF; // C2 does not respect int to long conversion for stub calls.
  assert(len % 2 == 0, "array length in montgomery_square must be even");
  int longwords = len/2;

  // Make very sure we don't use so much space that the stack might
  // overflow. 512 jints correspond to a 16384-bit integer and will use
  // a total of 6k bytes of stack space here.
  int total_allocation = longwords * sizeof (unsigned long) * 3;
  guarantee(total_allocation <= 8192, "must be");
  unsigned long *scratch = (unsigned long *)alloca(total_allocation);

  // Local scratch arrays
  unsigned long
    *a = scratch + 0 * longwords,
    *n = scratch + 1 * longwords,
    *m = scratch + 2 * longwords;

  reverse_words((unsigned long *)a_ints, a, longwords);
  reverse_words((unsigned long *)n_ints, n, longwords);

  if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
    ::montgomery_square(a, n, m, (unsigned long)inv, longwords);
  } else {
    ::montgomery_multiply(a, a, n, m, (unsigned long)inv, longwords);
  }

  reverse_words(m, (unsigned long *)m_ints, longwords);
}
