macroAssembler_ppc.cpp revision 9898:2794bc7859f5
1/*
2 * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
3 * Copyright 2012, 2015 SAP AG. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26#include "precompiled.hpp"
27#include "asm/macroAssembler.inline.hpp"
28#include "compiler/disassembler.hpp"
29#include "gc/shared/cardTableModRefBS.hpp"
30#include "gc/shared/collectedHeap.inline.hpp"
31#include "interpreter/interpreter.hpp"
32#include "memory/resourceArea.hpp"
33#include "nativeInst_ppc.hpp"
34#include "prims/methodHandles.hpp"
35#include "runtime/biasedLocking.hpp"
36#include "runtime/icache.hpp"
37#include "runtime/interfaceSupport.hpp"
38#include "runtime/objectMonitor.hpp"
39#include "runtime/os.hpp"
40#include "runtime/sharedRuntime.hpp"
41#include "runtime/stubRoutines.hpp"
42#include "utilities/macros.hpp"
43#if INCLUDE_ALL_GCS
44#include "gc/g1/g1CollectedHeap.inline.hpp"
45#include "gc/g1/g1SATBCardTableModRefBS.hpp"
46#include "gc/g1/heapRegion.hpp"
47#endif // INCLUDE_ALL_GCS
48
49#ifdef PRODUCT
50#define BLOCK_COMMENT(str) // nothing
51#else
52#define BLOCK_COMMENT(str) block_comment(str)
53#endif
54#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
55
56#ifdef ASSERT
57// On RISC, there's no benefit to verifying instruction boundaries.
58bool AbstractAssembler::pd_check_instruction_mark() { return false; }
59#endif
60
61void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
62  assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
63  if (Assembler::is_simm(si31, 16)) {
64    ld(d, si31, a);
65    if (emit_filler_nop) nop();
66  } else {
67    const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
68    const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
69    addis(d, a, hi);
70    ld(d, lo, d);
71  }
72}
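
// Illustrative sketch (not necessarily emitted verbatim): for an offset that
// does not fit a signed 16-bit displacement, the code above splits it so that
// hi*0x10000 + lo == si31, e.g. for si31 = 0x12345:
//   addis d, a, 0x1        // high part
//   ld    d, 0x2345(d)     // low part as signed displacement
// The split is computed by largeoffset_si16_si16_hi/_lo.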
73
74void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
75  assert_different_registers(d, a);
76  ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
77}
78
79void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
80                                      size_t size_in_bytes, bool is_signed) {
81  switch (size_in_bytes) {
82  case  8:              ld(dst, offs, base);                         break;
83  case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
84  case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
85  case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
86  default:  ShouldNotReachHere();
87  }
88}
89
90void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
91                                       size_t size_in_bytes) {
92  switch (size_in_bytes) {
93  case  8:  std(dst, offs, base); break;
94  case  4:  stw(dst, offs, base); break;
95  case  2:  sth(dst, offs, base); break;
96  case  1:  stb(dst, offs, base); break;
97  default:  ShouldNotReachHere();
98  }
99}
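
// Hypothetical usage sketch (R3, R5 and the offset are placeholders, not
// values taken from this file), assuming generator code defines `__` as the
// current MacroAssembler:
//   __ load_sized_value (R5, some_offset, R3, 2, /*is_signed=*/true);
//   __ store_sized_value(R5, some_offset, R3, 2);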
100
101void MacroAssembler::align(int modulus, int max, int rem) {
102  int padding = (rem + modulus - (offset() % modulus)) % modulus;
103  if (padding > max) return;
104  for (int c = (padding >> 2); c > 0; --c) { nop(); }
105}
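
// Worked example (illustrative): with modulus = 16, rem = 0 and
// offset() % 16 == 4, padding = (0 + 16 - 4) % 16 = 12, so three nops are
// emitted; if the required padding exceeded 'max', nothing would be emitted.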
106
107// Issue instructions that calculate given TOC from global TOC.
108void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
109                                                       bool add_relocation, bool emit_dummy_addr) {
110  int offset = -1;
111  if (emit_dummy_addr) {
112    offset = -128; // dummy address
113  } else if (addr != (address)(intptr_t)-1) {
114    offset = MacroAssembler::offset_to_global_toc(addr);
115  }
116
117  if (hi16) {
118    addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
119  }
120  if (lo16) {
121    if (add_relocation) {
122      // Relocate at the addi to avoid confusion with a load from the method's TOC.
123      relocate(internal_word_Relocation::spec(addr));
124    }
125    addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
126  }
127}
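
// Sketch of the full sequence when both hi16 and lo16 are requested, where
// 'offset' is the distance of 'addr' from the global TOC (roughly the @ha/@l
// split of the ELF ABI):
//   addis dst, R29_TOC, largeoffset_si16_si16_hi(offset)
//   addi  dst, dst,     largeoffset_si16_si16_lo(offset)
// With emit_dummy_addr a dummy offset (-128) is encoded and the pair is
// expected to be patched later via patch_calculate_address_from_global_toc_at.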
128
129int MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
130  const int offset = MacroAssembler::offset_to_global_toc(addr);
131
132  const address inst2_addr = a;
133  const int inst2 = *(int *)inst2_addr;
134
135  // The relocation points to the second instruction, the addi,
136  // and the addi reads and writes the same register dst.
137  const int dst = inv_rt_field(inst2);
138  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
139
140  // Now, find the preceding addis which writes to dst.
141  int inst1 = 0;
142  address inst1_addr = inst2_addr - BytesPerInstWord;
143  while (inst1_addr >= bound) {
144    inst1 = *(int *) inst1_addr;
145    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
146      // Stop, found the addis which writes dst.
147      break;
148    }
149    inst1_addr -= BytesPerInstWord;
150  }
151
152  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
153  set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
154  set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
155  return (int)((intptr_t)addr - (intptr_t)inst1_addr);
156}
157
158address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
159  const address inst2_addr = a;
160  const int inst2 = *(int *)inst2_addr;
161
162  // The relocation points to the second instruction, the addi,
163  // and the addi reads and writes the same register dst.
164  const int dst = inv_rt_field(inst2);
165  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
166
167  // Now, find the preceding addis which writes to dst.
168  int inst1 = 0;
169  address inst1_addr = inst2_addr - BytesPerInstWord;
170  while (inst1_addr >= bound) {
171    inst1 = *(int *) inst1_addr;
172    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
173      // stop, found the addis which writes dst
174      break;
175    }
176    inst1_addr -= BytesPerInstWord;
177  }
178
179  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
180
181  int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
182  // -1 is a special case
183  if (offset == -1) {
184    return (address)(intptr_t)-1;
185  } else {
186    return global_toc() + offset;
187  }
188}
189
190#ifdef _LP64
191// Patch compressed oops or klass constants.
192// Assembler sequence is
193// 1) compressed oops:
194//    lis  rx = const.hi
195//    ori rx = rx | const.lo
196// 2) compressed klass:
197//    lis  rx = const.hi
198//    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
199//    ori rx = rx | const.lo
200// The optional clrldi is skipped over by the patching code.
201int MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
202  assert(UseCompressedOops, "Should only patch compressed oops");
203
204  const address inst2_addr = a;
205  const int inst2 = *(int *)inst2_addr;
206
207  // The relocation points to the second instruction, the ori,
208  // and the ori reads and writes the same register dst.
209  const int dst = inv_rta_field(inst2);
210  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
211  // Now, find the preceding addis which writes to dst.
212  int inst1 = 0;
213  address inst1_addr = inst2_addr - BytesPerInstWord;
214  bool inst1_found = false;
215  while (inst1_addr >= bound) {
216    inst1 = *(int *)inst1_addr;
217    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
218    inst1_addr -= BytesPerInstWord;
219  }
220  assert(inst1_found, "inst is not lis");
221
222  int xc = (data >> 16) & 0xffff;
223  int xd = (data >>  0) & 0xffff;
224
225  set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
226  set_imm((int *)inst2_addr,        (xd)); // unsigned int
227  return (int)((intptr_t)inst2_addr - (intptr_t)inst1_addr);
228}
229
230// Get compressed oop or klass constant.
231narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
232  assert(UseCompressedOops, "Should only patch compressed oops");
233
234  const address inst2_addr = a;
235  const int inst2 = *(int *)inst2_addr;
236
237  // The relocation points to the second instruction, the ori,
238  // and the ori reads and writes the same register dst.
239  const int dst = inv_rta_field(inst2);
240  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
241  // Now, find the preceding lis which writes to dst.
242  int inst1 = 0;
243  address inst1_addr = inst2_addr - BytesPerInstWord;
244  bool inst1_found = false;
245
246  while (inst1_addr >= bound) {
247    inst1 = *(int *) inst1_addr;
248    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
249    inst1_addr -= BytesPerInstWord;
250  }
251  assert(inst1_found, "inst is not lis");
252
253  uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
254  uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);
255
256  return (int) (xl | xh);
257}
258#endif // _LP64
259
260// Returns true if successful.
261bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
262                                                Register toc, bool fixed_size) {
263  int toc_offset = 0;
264  // Use RelocationHolder::none for the constant pool entry, otherwise
265  // we will end up with a failing NativeCall::verify(x) where x is
266  // the address of the constant pool entry.
267  // FIXME: We should insert relocation information for oops at the constant
268  // pool entries instead of inserting it at the loads; patching of a constant
269  // pool entry should be less expensive.
270  address const_address = address_constant((address)a.value(), RelocationHolder::none);
271  if (const_address == NULL) { return false; } // allocation failure
272  // Relocate at the pc of the load.
273  relocate(a.rspec());
274  toc_offset = (int)(const_address - code()->consts()->start());
275  ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
276  return true;
277}
278
279bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
280  const address inst1_addr = a;
281  const int inst1 = *(int *)inst1_addr;
282
283  // The relocation points to the ld or the addis.
284  return (is_ld(inst1)) ||
285         (is_addis(inst1) && inv_ra_field(inst1) != 0);
286}
287
288int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
289  assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");
290
291  const address inst1_addr = a;
292  const int inst1 = *(int *)inst1_addr;
293
294  if (is_ld(inst1)) {
295    return inv_d1_field(inst1);
296  } else if (is_addis(inst1)) {
297    const int dst = inv_rt_field(inst1);
298
299    // Now, find the succeeding ld which reads and writes to dst.
300    address inst2_addr = inst1_addr + BytesPerInstWord;
301    int inst2 = 0;
302    while (true) {
303      inst2 = *(int *) inst2_addr;
304      if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
305        // Stop, found the ld which reads and writes dst.
306        break;
307      }
308      inst2_addr += BytesPerInstWord;
309    }
310    return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
311  }
312  ShouldNotReachHere();
313  return 0;
314}
315
316// Get the constant from a `load_const' sequence.
317long MacroAssembler::get_const(address a) {
318  assert(is_load_const_at(a), "not a load of a constant");
319  const int *p = (const int*) a;
320  unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
321  if (is_ori(*(p+1))) {
322    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
323    x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
324    x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
325  } else if (is_lis(*(p+1))) {
326    x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
327    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
328    x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
329  } else {
330    ShouldNotReachHere();
331    return (long) 0;
332  }
333  return (long) x;
334}
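
// Illustrative layout of the `load_const' sequence decoded above (ori
// variant); the immediates read by get_const sit in instructions 0, 1, 3
// and 4, instruction 2 only shifts:
//   lis    d, x[63:48]
//   ori    d, d, x[47:32]
//   rldicr d, d, 32, 31     // shift left by 32
//   oris   d, d, x[31:16]
//   ori    d, d, x[15:0]
// The lis variant recognized above packs the halves in a different order, as
// the immediate offsets used in get_const/patch_const show.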
335
336// Patch the 64-bit constant of a `load_const' sequence. This is a low-level
337// procedure: it neither flushes the instruction cache nor is it
338// MT-safe.
339void MacroAssembler::patch_const(address a, long x) {
340  assert(is_load_const_at(a), "not a load of a constant");
341  int *p = (int*) a;
342  if (is_ori(*(p+1))) {
343    set_imm(0 + p, (x >> 48) & 0xffff);
344    set_imm(1 + p, (x >> 32) & 0xffff);
345    set_imm(3 + p, (x >> 16) & 0xffff);
346    set_imm(4 + p, x & 0xffff);
347  } else if (is_lis(*(p+1))) {
348    set_imm(0 + p, (x >> 48) & 0xffff);
349    set_imm(2 + p, (x >> 32) & 0xffff);
350    set_imm(1 + p, (x >> 16) & 0xffff);
351    set_imm(3 + p, x & 0xffff);
352  } else {
353    ShouldNotReachHere();
354  }
355}
356
357AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
358  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
359  int index = oop_recorder()->allocate_metadata_index(obj);
360  RelocationHolder rspec = metadata_Relocation::spec(index);
361  return AddressLiteral((address)obj, rspec);
362}
363
364AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
365  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
366  int index = oop_recorder()->find_index(obj);
367  RelocationHolder rspec = metadata_Relocation::spec(index);
368  return AddressLiteral((address)obj, rspec);
369}
370
371AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
372  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
373  int oop_index = oop_recorder()->allocate_oop_index(obj);
374  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
375}
376
377AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
378  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
379  int oop_index = oop_recorder()->find_index(obj);
380  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
381}
382
383RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
384                                                      Register tmp, int offset) {
385  intptr_t value = *delayed_value_addr;
386  if (value != 0) {
387    return RegisterOrConstant(value + offset);
388  }
389
390  // Load indirectly to solve generation ordering problem.
391  // static address, no relocation
392  int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);
393  ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)
394
395  if (offset != 0) {
396    addi(tmp, tmp, offset);
397  }
398
399  return RegisterOrConstant(tmp);
400}
401
402#ifndef PRODUCT
403void MacroAssembler::pd_print_patched_instruction(address branch) {
404  Unimplemented(); // TODO: PPC port
405}
406#endif // ndef PRODUCT
407
408// Conditional far branch for destinations encodable in 24+2 bits.
409void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {
410
411  // If requested by flag optimize, relocate the bc_far as a
412  // runtime_call and prepare for optimizing it when the code gets
413  // relocated.
414  if (optimize == bc_far_optimize_on_relocate) {
415    relocate(relocInfo::runtime_call_type);
416  }
417
418  // variant 2:
419  //
420  //    b!cxx SKIP
421  //    bxx   DEST
422  //  SKIP:
423  //
424
425  const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
426                                                opposite_bcond(inv_boint_bcond(boint)));
427
428  // We emit two branches.
429  // First, a conditional branch which jumps around the far branch.
430  const address not_taken_pc = pc() + 2 * BytesPerInstWord;
431  const address bc_pc        = pc();
432  bc(opposite_boint, biint, not_taken_pc);
433
434  const int bc_instr = *(int*)bc_pc;
435  assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
436  assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
437  assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
438                                     opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
439         "postcondition");
440  assert(biint == inv_bi_field(bc_instr), "postcondition");
441
442  // Second, an unconditional far branch which jumps to dest.
443  // Note: target(dest) remembers the current pc (see CodeSection::target)
444  //       and returns the current pc if the label is not bound yet; when
445  //       the label gets bound, the unconditional far branch will be patched.
446  const address target_pc = target(dest);
447  const address b_pc  = pc();
448  b(target_pc);
449
450  assert(not_taken_pc == pc(),                     "postcondition");
451  assert(dest.is_bound() || target_pc == b_pc, "postcondition");
452}
453
454// 1 or 2 instructions
455void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
456  if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
457    bc(boint, biint, dest);
458  } else {
459    bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
460  }
461}
462
463bool MacroAssembler::is_bc_far_at(address instruction_addr) {
464  return is_bc_far_variant1_at(instruction_addr) ||
465         is_bc_far_variant2_at(instruction_addr) ||
466         is_bc_far_variant3_at(instruction_addr);
467}
468
469address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
470  if (is_bc_far_variant1_at(instruction_addr)) {
471    const address instruction_1_addr = instruction_addr;
472    const int instruction_1 = *(int*)instruction_1_addr;
473    return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
474  } else if (is_bc_far_variant2_at(instruction_addr)) {
475    const address instruction_2_addr = instruction_addr + 4;
476    return bxx_destination(instruction_2_addr);
477  } else if (is_bc_far_variant3_at(instruction_addr)) {
478    return instruction_addr + 8;
479  }
480  // variant 4 ???
481  ShouldNotReachHere();
482  return NULL;
483}
484void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {
485
486  if (is_bc_far_variant3_at(instruction_addr)) {
487    // variant 3, far cond branch to the next instruction, already patched to nops:
488    //
489    //    nop
490    //    endgroup
491    //  SKIP/DEST:
492    //
493    return;
494  }
495
496  // first, extract boint and biint from the current branch
497  int boint = 0;
498  int biint = 0;
499
500  ResourceMark rm;
501  const int code_size = 2 * BytesPerInstWord;
502  CodeBuffer buf(instruction_addr, code_size);
503  MacroAssembler masm(&buf);
504  if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
505    // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
506    masm.nop();
507    masm.endgroup();
508  } else {
509    if (is_bc_far_variant1_at(instruction_addr)) {
510      // variant 1, the 1st instruction contains the destination address:
511      //
512      //    bcxx  DEST
513      //    nop
514      //
515      const int instruction_1 = *(int*)(instruction_addr);
516      boint = inv_bo_field(instruction_1);
517      biint = inv_bi_field(instruction_1);
518    } else if (is_bc_far_variant2_at(instruction_addr)) {
519      // variant 2, the 2nd instruction contains the destination address:
520      //
521      //    b!cxx SKIP
522      //    bxx   DEST
523      //  SKIP:
524      //
525      const int instruction_1 = *(int*)(instruction_addr);
526      boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
527          opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
528      biint = inv_bi_field(instruction_1);
529    } else {
530      // variant 4???
531      ShouldNotReachHere();
532    }
533
534    // second, set the new branch destination and optimize the code
535    if (dest != instruction_addr + 4 && // the bc_far is still unbound!
536        masm.is_within_range_of_bcxx(dest, instruction_addr)) {
537      // variant 1:
538      //
539      //    bcxx  DEST
540      //    nop
541      //
542      masm.bc(boint, biint, dest);
543      masm.nop();
544    } else {
545      // variant 2:
546      //
547      //    b!cxx SKIP
548      //    bxx   DEST
549      //  SKIP:
550      //
551      const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
552                                                    opposite_bcond(inv_boint_bcond(boint)));
553      const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
554      masm.bc(opposite_boint, biint, not_taken_pc);
555      masm.b(dest);
556    }
557  }
558  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
559}
560
561// Emit a NOT mt-safe patchable 64-bit absolute call/jump.
562void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
563  // get current pc
564  uint64_t start_pc = (uint64_t) pc();
565
566  const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
567  const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first
568
569  // relocate here
570  if (rt != relocInfo::none) {
571    relocate(rt);
572  }
573
574  if ( ReoptimizeCallSequences &&
575       (( link && is_within_range_of_b(dest, pc_of_bl)) ||
576        (!link && is_within_range_of_b(dest, pc_of_b)))) {
577    // variant 2:
578    // Emit an optimized, pc-relative call/jump.
579
580    if (link) {
581      // some padding
582      nop();
583      nop();
584      nop();
585      nop();
586      nop();
587      nop();
588
589      // do the call
590      assert(pc() == pc_of_bl, "just checking");
591      bl(dest, relocInfo::none);
592    } else {
593      // do the jump
594      assert(pc() == pc_of_b, "just checking");
595      b(dest, relocInfo::none);
596
597      // some padding
598      nop();
599      nop();
600      nop();
601      nop();
602      nop();
603      nop();
604    }
605
606    // Assert that we can identify the emitted call/jump.
607    assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
608           "can't identify emitted call");
609  } else {
610    // variant 1:
611    mr(R0, R11);  // spill R11 -> R0.
612
613    // Load the destination address into CTR,
614    // calculate destination relative to global toc.
615    calculate_address_from_global_toc(R11, dest, true, true, false);
616
617    mtctr(R11);
618    mr(R11, R0);  // spill R11 <- R0.
619    nop();
620
621    // do the call/jump
622    if (link) {
623      bctrl();
624    } else {
625      bctr();
626    }
627    // Assert that we can identify the emitted call/jump.
628    assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
629           "can't identify emitted call");
630  }
631
632  // Assert that we can identify the emitted call/jump.
633  assert(is_bxx64_patchable_at((address)start_pc, link),
634         "can't identify emitted call");
635  assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
636         "wrong encoding of dest address");
637}
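
// Rough shape of the two encodings, each bxx64_patchable_size words long so
// they can be patched into one another in place (link case shown; without
// link, variant 2 emits the b first and variant 1b ends in bctr):
//   variant 1b: mr R0,R11; addis R11,R29_TOC,hi; addi R11,R11,lo;
//               mtctr R11; mr R11,R0; nop; bctrl
//   variant 2:  nop; nop; nop; nop; nop; nop; bl dest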
638
639// Identify a bxx64_patchable instruction.
640bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
641  return is_bxx64_patchable_variant1b_at(instruction_addr, link)
642    //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
643      || is_bxx64_patchable_variant2_at(instruction_addr, link);
644}
645
646// Does the call64_patchable instruction use a pc-relative encoding of
647// the call destination?
648bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
649  // variant 2 is pc-relative
650  return is_bxx64_patchable_variant2_at(instruction_addr, link);
651}
652
653// Identify variant 1.
654bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
655  unsigned int* instr = (unsigned int*) instruction_addr;
656  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
657      && is_mtctr(instr[5]) // mtctr
658    && is_load_const_at(instruction_addr);
659}
660
661// Identify variant 1b: load destination relative to global toc.
662bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
663  unsigned int* instr = (unsigned int*) instruction_addr;
664  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
665    && is_mtctr(instr[3]) // mtctr
666    && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
667}
668
669// Identify variant 2.
670bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
671  unsigned int* instr = (unsigned int*) instruction_addr;
672  if (link) {
673    return is_bl (instr[6])  // bl dest is last
674      && is_nop(instr[0])  // nop
675      && is_nop(instr[1])  // nop
676      && is_nop(instr[2])  // nop
677      && is_nop(instr[3])  // nop
678      && is_nop(instr[4])  // nop
679      && is_nop(instr[5]); // nop
680  } else {
681    return is_b  (instr[0])  // b  dest is first
682      && is_nop(instr[1])  // nop
683      && is_nop(instr[2])  // nop
684      && is_nop(instr[3])  // nop
685      && is_nop(instr[4])  // nop
686      && is_nop(instr[5])  // nop
687      && is_nop(instr[6]); // nop
688  }
689}
690
691// Set dest address of a bxx64_patchable instruction.
692void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
693  ResourceMark rm;
694  int code_size = MacroAssembler::bxx64_patchable_size;
695  CodeBuffer buf(instruction_addr, code_size);
696  MacroAssembler masm(&buf);
697  masm.bxx64_patchable(dest, relocInfo::none, link);
698  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
699}
700
701// Get dest address of a bxx64_patchable instruction.
702address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
703  if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
704    return (address) (unsigned long) get_const(instruction_addr);
705  } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
706    unsigned int* instr = (unsigned int*) instruction_addr;
707    if (link) {
708      const int instr_idx = 6; // bl is last
709      int branchoffset = branch_destination(instr[instr_idx], 0);
710      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
711    } else {
712      const int instr_idx = 0; // b is first
713      int branchoffset = branch_destination(instr[instr_idx], 0);
714      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
715    }
716  // Load dest relative to global toc.
717  } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
718    return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
719                                                               instruction_addr);
720  } else {
721    ShouldNotReachHere();
722    return NULL;
723  }
724}
725
726// Uses ordering which corresponds to ABI:
727//    _savegpr0_14:  std  r14,-144(r1)
728//    _savegpr0_15:  std  r15,-136(r1)
729//    _savegpr0_16:  std  r16,-128(r1)
730void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
731  std(R14, offset, dst);   offset += 8;
732  std(R15, offset, dst);   offset += 8;
733  std(R16, offset, dst);   offset += 8;
734  std(R17, offset, dst);   offset += 8;
735  std(R18, offset, dst);   offset += 8;
736  std(R19, offset, dst);   offset += 8;
737  std(R20, offset, dst);   offset += 8;
738  std(R21, offset, dst);   offset += 8;
739  std(R22, offset, dst);   offset += 8;
740  std(R23, offset, dst);   offset += 8;
741  std(R24, offset, dst);   offset += 8;
742  std(R25, offset, dst);   offset += 8;
743  std(R26, offset, dst);   offset += 8;
744  std(R27, offset, dst);   offset += 8;
745  std(R28, offset, dst);   offset += 8;
746  std(R29, offset, dst);   offset += 8;
747  std(R30, offset, dst);   offset += 8;
748  std(R31, offset, dst);   offset += 8;
749
750  stfd(F14, offset, dst);   offset += 8;
751  stfd(F15, offset, dst);   offset += 8;
752  stfd(F16, offset, dst);   offset += 8;
753  stfd(F17, offset, dst);   offset += 8;
754  stfd(F18, offset, dst);   offset += 8;
755  stfd(F19, offset, dst);   offset += 8;
756  stfd(F20, offset, dst);   offset += 8;
757  stfd(F21, offset, dst);   offset += 8;
758  stfd(F22, offset, dst);   offset += 8;
759  stfd(F23, offset, dst);   offset += 8;
760  stfd(F24, offset, dst);   offset += 8;
761  stfd(F25, offset, dst);   offset += 8;
762  stfd(F26, offset, dst);   offset += 8;
763  stfd(F27, offset, dst);   offset += 8;
764  stfd(F28, offset, dst);   offset += 8;
765  stfd(F29, offset, dst);   offset += 8;
766  stfd(F30, offset, dst);   offset += 8;
767  stfd(F31, offset, dst);
768}
769
770// Uses ordering which corresponds to ABI:
771//    _restgpr0_14:  ld   r14,-144(r1)
772//    _restgpr0_15:  ld   r15,-136(r1)
773//    _restgpr0_16:  ld   r16,-128(r1)
774void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
775  ld(R14, offset, src);   offset += 8;
776  ld(R15, offset, src);   offset += 8;
777  ld(R16, offset, src);   offset += 8;
778  ld(R17, offset, src);   offset += 8;
779  ld(R18, offset, src);   offset += 8;
780  ld(R19, offset, src);   offset += 8;
781  ld(R20, offset, src);   offset += 8;
782  ld(R21, offset, src);   offset += 8;
783  ld(R22, offset, src);   offset += 8;
784  ld(R23, offset, src);   offset += 8;
785  ld(R24, offset, src);   offset += 8;
786  ld(R25, offset, src);   offset += 8;
787  ld(R26, offset, src);   offset += 8;
788  ld(R27, offset, src);   offset += 8;
789  ld(R28, offset, src);   offset += 8;
790  ld(R29, offset, src);   offset += 8;
791  ld(R30, offset, src);   offset += 8;
792  ld(R31, offset, src);   offset += 8;
793
794  // FP registers
795  lfd(F14, offset, src);   offset += 8;
796  lfd(F15, offset, src);   offset += 8;
797  lfd(F16, offset, src);   offset += 8;
798  lfd(F17, offset, src);   offset += 8;
799  lfd(F18, offset, src);   offset += 8;
800  lfd(F19, offset, src);   offset += 8;
801  lfd(F20, offset, src);   offset += 8;
802  lfd(F21, offset, src);   offset += 8;
803  lfd(F22, offset, src);   offset += 8;
804  lfd(F23, offset, src);   offset += 8;
805  lfd(F24, offset, src);   offset += 8;
806  lfd(F25, offset, src);   offset += 8;
807  lfd(F26, offset, src);   offset += 8;
808  lfd(F27, offset, src);   offset += 8;
809  lfd(F28, offset, src);   offset += 8;
810  lfd(F29, offset, src);   offset += 8;
811  lfd(F30, offset, src);   offset += 8;
812  lfd(F31, offset, src);
813}
814
815// For verify_oops.
816void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
817  std(R2,  offset, dst);   offset += 8;
818  std(R3,  offset, dst);   offset += 8;
819  std(R4,  offset, dst);   offset += 8;
820  std(R5,  offset, dst);   offset += 8;
821  std(R6,  offset, dst);   offset += 8;
822  std(R7,  offset, dst);   offset += 8;
823  std(R8,  offset, dst);   offset += 8;
824  std(R9,  offset, dst);   offset += 8;
825  std(R10, offset, dst);   offset += 8;
826  std(R11, offset, dst);   offset += 8;
827  std(R12, offset, dst);   offset += 8;
828
829  stfd(F0, offset, dst);   offset += 8;
830  stfd(F1, offset, dst);   offset += 8;
831  stfd(F2, offset, dst);   offset += 8;
832  stfd(F3, offset, dst);   offset += 8;
833  stfd(F4, offset, dst);   offset += 8;
834  stfd(F5, offset, dst);   offset += 8;
835  stfd(F6, offset, dst);   offset += 8;
836  stfd(F7, offset, dst);   offset += 8;
837  stfd(F8, offset, dst);   offset += 8;
838  stfd(F9, offset, dst);   offset += 8;
839  stfd(F10, offset, dst);  offset += 8;
840  stfd(F11, offset, dst);  offset += 8;
841  stfd(F12, offset, dst);  offset += 8;
842  stfd(F13, offset, dst);
843}
844
845// For verify_oops.
846void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
847  ld(R2,  offset, src);   offset += 8;
848  ld(R3,  offset, src);   offset += 8;
849  ld(R4,  offset, src);   offset += 8;
850  ld(R5,  offset, src);   offset += 8;
851  ld(R6,  offset, src);   offset += 8;
852  ld(R7,  offset, src);   offset += 8;
853  ld(R8,  offset, src);   offset += 8;
854  ld(R9,  offset, src);   offset += 8;
855  ld(R10, offset, src);   offset += 8;
856  ld(R11, offset, src);   offset += 8;
857  ld(R12, offset, src);   offset += 8;
858
859  lfd(F0, offset, src);   offset += 8;
860  lfd(F1, offset, src);   offset += 8;
861  lfd(F2, offset, src);   offset += 8;
862  lfd(F3, offset, src);   offset += 8;
863  lfd(F4, offset, src);   offset += 8;
864  lfd(F5, offset, src);   offset += 8;
865  lfd(F6, offset, src);   offset += 8;
866  lfd(F7, offset, src);   offset += 8;
867  lfd(F8, offset, src);   offset += 8;
868  lfd(F9, offset, src);   offset += 8;
869  lfd(F10, offset, src);  offset += 8;
870  lfd(F11, offset, src);  offset += 8;
871  lfd(F12, offset, src);  offset += 8;
872  lfd(F13, offset, src);
873}
874
875void MacroAssembler::save_LR_CR(Register tmp) {
876  mfcr(tmp);
877  std(tmp, _abi(cr), R1_SP);
878  mflr(tmp);
879  std(tmp, _abi(lr), R1_SP);
880  // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
881}
882
883void MacroAssembler::restore_LR_CR(Register tmp) {
884  assert(tmp != R1_SP, "must be distinct");
885  ld(tmp, _abi(lr), R1_SP);
886  mtlr(tmp);
887  ld(tmp, _abi(cr), R1_SP);
888  mtcr(tmp);
889}
890
891address MacroAssembler::get_PC_trash_LR(Register result) {
892  Label L;
893  bl(L);
894  bind(L);
895  address lr_pc = pc();
896  mflr(result);
897  return lr_pc;
898}
899
900void MacroAssembler::resize_frame(Register offset, Register tmp) {
901#ifdef ASSERT
902  assert_different_registers(offset, tmp, R1_SP);
903  andi_(tmp, offset, frame::alignment_in_bytes-1);
904  asm_assert_eq("resize_frame: unaligned", 0x204);
905#endif
906
907  // tmp <- *(SP)
908  ld(tmp, _abi(callers_sp), R1_SP);
909  // addr <- SP + offset;
910  // *(addr) <- tmp;
911  // SP <- addr
912  stdux(tmp, R1_SP, offset);
913}
914
915void MacroAssembler::resize_frame(int offset, Register tmp) {
916  assert(is_simm(offset, 16), "too big an offset");
917  assert_different_registers(tmp, R1_SP);
918  assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
919  // tmp <- *(SP)
920  ld(tmp, _abi(callers_sp), R1_SP);
921  // addr <- SP + offset;
922  // *(addr) <- tmp;
923  // SP <- addr
924  stdu(tmp, offset, R1_SP);
925}
926
927void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
928  // (addr == tmp1) || (addr == tmp2) is allowed here!
929  assert(tmp1 != tmp2, "must be distinct");
930
931  // compute offset w.r.t. current stack pointer
932  // tmp_1 <- addr - SP (!)
933  subf(tmp1, R1_SP, addr);
934
935  // atomically update SP keeping back link.
936  resize_frame(tmp1/* offset */, tmp2/* tmp */);
937}
938
939void MacroAssembler::push_frame(Register bytes, Register tmp) {
940#ifdef ASSERT
941  assert(bytes != R0, "r0 not allowed here");
942  andi_(R0, bytes, frame::alignment_in_bytes-1);
943  asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203);
944#endif
945  neg(tmp, bytes);
946  stdux(R1_SP, R1_SP, tmp);
947}
948
949// Push a frame of size `bytes'.
950void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
951  long offset = align_addr(bytes, frame::alignment_in_bytes);
952  if (is_simm(-offset, 16)) {
953    stdu(R1_SP, -offset, R1_SP);
954  } else {
955    load_const_optimized(tmp, -offset);
956    stdux(R1_SP, R1_SP, tmp);
957  }
958}
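
// Worked example (illustrative): push_frame(112, tmp) with 16-byte frame
// alignment emits a single `stdu R1_SP, -112(R1_SP)', which writes the back
// link and bumps SP in one instruction; only frames too large for a 16-bit
// displacement need the load_const_optimized/stdux pair.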
959
960// Push a frame of size `bytes' plus abi_reg_args on top.
961void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
962  push_frame(bytes + frame::abi_reg_args_size, tmp);
963}
964
965// Set up a new C frame with a spill area for non-volatile GPRs and
966// additional space for local variables.
967void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
968                                                      Register tmp) {
969  push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
970}
971
972// Pop current C frame.
973void MacroAssembler::pop_frame() {
974  ld(R1_SP, _abi(callers_sp), R1_SP);
975}
976
977#if defined(ABI_ELFv2)
978address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
979  // TODO(asmundak): make sure the caller uses R12 as the function descriptor
980  // most of the time.
981  if (R12 != r_function_entry) {
982    mr(R12, r_function_entry);
983  }
984  mtctr(R12);
985  // Do a call or a branch.
986  if (and_link) {
987    bctrl();
988  } else {
989    bctr();
990  }
991  _last_calls_return_pc = pc();
992
993  return _last_calls_return_pc;
994}
995
996// Call a C function via a function descriptor and use full C
997// calling conventions. Updates and returns _last_calls_return_pc.
998address MacroAssembler::call_c(Register r_function_entry) {
999  return branch_to(r_function_entry, /*and_link=*/true);
1000}
1001
1002// For tail calls: only branch, don't link, so callee returns to caller of this function.
1003address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
1004  return branch_to(r_function_entry, /*and_link=*/false);
1005}
1006
1007address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
1008  load_const(R12, function_entry, R0);
1009  return branch_to(R12,  /*and_link=*/true);
1010}
1011
1012#else
1013// Generic version of a call to C function via a function descriptor
1014// with variable support for C calling conventions (TOC, ENV, etc.).
1015// Updates and returns _last_calls_return_pc.
1016address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
1017                                  bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
1018  // we emit standard ptrgl glue code here
1019  assert((function_descriptor != R0), "function_descriptor cannot be R0");
1020
1021  // retrieve necessary entries from the function descriptor
1022  ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
1023  mtctr(R0);
1024
1025  if (load_toc_of_callee) {
1026    ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
1027  }
1028  if (load_env_of_callee) {
1029    ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
1030  } else if (load_toc_of_callee) {
1031    li(R11, 0);
1032  }
1033
1034  // do a call or a branch
1035  if (and_link) {
1036    bctrl();
1037  } else {
1038    bctr();
1039  }
1040  _last_calls_return_pc = pc();
1041
1042  return _last_calls_return_pc;
1043}
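
// Sketch of the descriptor-based call emitted above with all load_* flags set
// (ELFv1-style function descriptors); without load_env_of_callee, R11 is
// zeroed instead:
//   ld    R0,     entry_offset(fd)   // code entry point
//   mtctr R0
//   ld    R2_TOC, toc_offset(fd)     // callee's TOC
//   ld    R11,    env_offset(fd)     // callee's environment
//   bctrl                            // or bctr when not linking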
1044
1045// Call a C function via a function descriptor and use full C calling
1046// conventions.
1047// We don't use the TOC in generated code, so there is no need to save
1048// and restore its value.
1049address MacroAssembler::call_c(Register fd) {
1050  return branch_to(fd, /*and_link=*/true,
1051                       /*save toc=*/false,
1052                       /*restore toc=*/false,
1053                       /*load toc=*/true,
1054                       /*load env=*/true);
1055}
1056
1057address MacroAssembler::call_c_and_return_to_caller(Register fd) {
1058  return branch_to(fd, /*and_link=*/false,
1059                       /*save toc=*/false,
1060                       /*restore toc=*/false,
1061                       /*load toc=*/true,
1062                       /*load env=*/true);
1063}
1064
1065address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
1066  if (rt != relocInfo::none) {
1067    // this call needs to be relocatable
1068    if (!ReoptimizeCallSequences
1069        || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1070        || fd == NULL   // support code-size estimation
1071        || !fd->is_friend_function()
1072        || fd->entry() == NULL) {
1073      // it's not a friend function as defined by class FunctionDescriptor,
1074      // so do a full call-c here.
1075      load_const(R11, (address)fd, R0);
1076
1077      bool has_env = (fd != NULL && fd->env() != NULL);
1078      return branch_to(R11, /*and_link=*/true,
1079                            /*save toc=*/false,
1080                            /*restore toc=*/false,
1081                            /*load toc=*/true,
1082                            /*load env=*/has_env);
1083    } else {
1084      // It's a friend function. Load the entry point and don't care about
1085      // toc and env. Use an optimizable call instruction, but ensure the
1086      // same code-size as in the case of a non-friend function.
1087      nop();
1088      nop();
1089      nop();
1090      bl64_patchable(fd->entry(), rt);
1091      _last_calls_return_pc = pc();
1092      return _last_calls_return_pc;
1093    }
1094  } else {
1095    // This call does not need to be relocatable, do more aggressive
1096    // optimizations.
1097    if (!ReoptimizeCallSequences
1098      || !fd->is_friend_function()) {
1099      // It's not a friend function as defined by class FunctionDescriptor,
1100      // so do a full call-c here.
1101      load_const(R11, (address)fd, R0);
1102      return branch_to(R11, /*and_link=*/true,
1103                            /*save toc=*/false,
1104                            /*restore toc=*/false,
1105                            /*load toc=*/true,
1106                            /*load env=*/true);
1107    } else {
1108      // it's a friend function, load the entry point and don't care about
1109      // toc and env.
1110      address dest = fd->entry();
1111      if (is_within_range_of_b(dest, pc())) {
1112        bl(dest);
1113      } else {
1114        bl64_patchable(dest, rt);
1115      }
1116      _last_calls_return_pc = pc();
1117      return _last_calls_return_pc;
1118    }
1119  }
1120}
1121
1122// Call a C function.  All constants needed reside in TOC.
1123//
1124// Read the address to call from the TOC.
1125// Read env from TOC, if fd specifies an env.
1126// Read new TOC from TOC.
1127address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
1128                                         relocInfo::relocType rt, Register toc) {
1129  if (!ReoptimizeCallSequences
1130    || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1131    || !fd->is_friend_function()) {
1132    // It's not a friend function as defined by class FunctionDescriptor,
1133    // so do a full call-c here.
1134    assert(fd->entry() != NULL, "function must be linked");
1135
1136    AddressLiteral fd_entry(fd->entry());
1137    bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
1138    mtctr(R11);
1139    if (fd->env() == NULL) {
1140      li(R11, 0);
1141      nop();
1142    } else {
1143      AddressLiteral fd_env(fd->env());
1144      success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
1145    }
1146    AddressLiteral fd_toc(fd->toc());
1147    // Set R2_TOC (load from toc)
1148    success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
1149    bctrl();
1150    _last_calls_return_pc = pc();
1151    if (!success) { return NULL; }
1152  } else {
1153    // It's a friend function, load the entry point and don't care about
1154    // toc and env. Use an optimizable call instruction, but ensure the
1155    // same code-size as in the case of a non-friend function.
1156    nop();
1157    bl64_patchable(fd->entry(), rt);
1158    _last_calls_return_pc = pc();
1159  }
1160  return _last_calls_return_pc;
1161}
1162#endif // ABI_ELFv2
1163
1164void MacroAssembler::call_VM_base(Register oop_result,
1165                                  Register last_java_sp,
1166                                  address  entry_point,
1167                                  bool     check_exceptions) {
1168  BLOCK_COMMENT("call_VM {");
1169  // Determine last_java_sp register.
1170  if (!last_java_sp->is_valid()) {
1171    last_java_sp = R1_SP;
1172  }
1173  set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);
1174
1175  // ARG1 must hold thread address.
1176  mr(R3_ARG1, R16_thread);
1177#if defined(ABI_ELFv2)
1178  address return_pc = call_c(entry_point, relocInfo::none);
1179#else
1180  address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
1181#endif
1182
1183  reset_last_Java_frame();
1184
1185  // Check for pending exceptions.
1186  if (check_exceptions) {
1187    // We don't check for exceptions here.
1188    ShouldNotReachHere();
1189  }
1190
1191  // Get oop result if there is one and reset the value in the thread.
1192  if (oop_result->is_valid()) {
1193    get_vm_result(oop_result);
1194  }
1195
1196  _last_calls_return_pc = return_pc;
1197  BLOCK_COMMENT("} call_VM");
1198}
1199
1200void MacroAssembler::call_VM_leaf_base(address entry_point) {
1201  BLOCK_COMMENT("call_VM_leaf {");
1202#if defined(ABI_ELFv2)
1203  call_c(entry_point, relocInfo::none);
1204#else
1205  call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
1206#endif
1207  BLOCK_COMMENT("} call_VM_leaf");
1208}
1209
1210void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
1211  call_VM_base(oop_result, noreg, entry_point, check_exceptions);
1212}
1213
1214void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
1215                             bool check_exceptions) {
1216  // R3_ARG1 is reserved for the thread.
1217  mr_if_needed(R4_ARG2, arg_1);
1218  call_VM(oop_result, entry_point, check_exceptions);
1219}
1220
1221void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
1222                             bool check_exceptions) {
1223  // R3_ARG1 is reserved for the thread
1224  mr_if_needed(R4_ARG2, arg_1);
1225  assert(arg_2 != R4_ARG2, "smashed argument");
1226  mr_if_needed(R5_ARG3, arg_2);
1227  call_VM(oop_result, entry_point, check_exceptions);
1228}
1229
1230void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
1231                             bool check_exceptions) {
1232  // R3_ARG1 is reserved for the thread
1233  mr_if_needed(R4_ARG2, arg_1);
1234  assert(arg_2 != R4_ARG2, "smashed argument");
1235  mr_if_needed(R5_ARG3, arg_2);
1236  mr_if_needed(R6_ARG4, arg_3);
1237  call_VM(oop_result, entry_point, check_exceptions);
1238}
1239
1240void MacroAssembler::call_VM_leaf(address entry_point) {
1241  call_VM_leaf_base(entry_point);
1242}
1243
1244void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
1245  mr_if_needed(R3_ARG1, arg_1);
1246  call_VM_leaf(entry_point);
1247}
1248
1249void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
1250  mr_if_needed(R3_ARG1, arg_1);
1251  assert(arg_2 != R3_ARG1, "smashed argument");
1252  mr_if_needed(R4_ARG2, arg_2);
1253  call_VM_leaf(entry_point);
1254}
1255
1256void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
1257  mr_if_needed(R3_ARG1, arg_1);
1258  assert(arg_2 != R3_ARG1, "smashed argument");
1259  mr_if_needed(R4_ARG2, arg_2);
1260  assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
1261  mr_if_needed(R5_ARG3, arg_3);
1262  call_VM_leaf(entry_point);
1263}
1264
1265// Check whether instruction is a read access to the polling page
1266// which was emitted by load_from_polling_page(..).
1267bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
1268                                               address* polling_address_ptr) {
1269  if (!is_ld(instruction))
1270    return false; // It's not a ld. Fail.
1271
1272  int rt = inv_rt_field(instruction);
1273  int ra = inv_ra_field(instruction);
1274  int ds = inv_ds_field(instruction);
1275  if (!(ds == 0 && ra != 0 && rt == 0)) {
1276    return false; // It's not a ld(r0, X, ra). Fail.
1277  }
1278
1279  if (!ucontext) {
1280    // Set polling address.
1281    if (polling_address_ptr != NULL) {
1282      *polling_address_ptr = NULL;
1283    }
1284    return true; // No ucontext given. Can't check value of ra. Assume true.
1285  }
1286
1287#ifdef LINUX
1288  // Ucontext given. Check that register ra contains the address of
1289  // the safepoint polling page.
1290  ucontext_t* uc = (ucontext_t*) ucontext;
1291  // Set polling address.
1292  address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1293  if (polling_address_ptr != NULL) {
1294    *polling_address_ptr = addr;
1295  }
1296  return os::is_poll_address(addr);
1297#else
1298  // Not on Linux, ucontext must be NULL.
1299  ShouldNotReachHere();
1300  return false;
1301#endif
1302}
1303
1304bool MacroAssembler::is_memory_serialization(int instruction, JavaThread* thread, void* ucontext) {
1305#ifdef LINUX
1306  ucontext_t* uc = (ucontext_t*) ucontext;
1307
1308  if (is_stwx(instruction) || is_stwux(instruction)) {
1309    int ra = inv_ra_field(instruction);
1310    int rb = inv_rb_field(instruction);
1311
1312    // look up content of ra and rb in ucontext
1313    address ra_val = (address)uc->uc_mcontext.regs->gpr[ra];
1314    long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1315    return os::is_memory_serialize_page(thread, ra_val+rb_val);
1316  } else if (is_stw(instruction) || is_stwu(instruction)) {
1317    int ra = inv_ra_field(instruction);
1318    int d1 = inv_d1_field(instruction);
1319
1320    // look up content of ra in ucontext
1321    address ra_val = (address)uc->uc_mcontext.regs->gpr[ra];
1322    return os::is_memory_serialize_page(thread, ra_val+d1);
1323  } else {
1324    return false;
1325  }
1326#else
1327  // workaround not needed on !LINUX :-)
1328  ShouldNotCallThis();
1329  return false;
1330#endif
1331}
1332
1333void MacroAssembler::bang_stack_with_offset(int offset) {
1334  // When increasing the stack, the old stack pointer will be written
1335  // to the new top of stack according to the PPC64 ABI.
1336  // Therefore, stack banging is not necessary when increasing
1337  // the stack by <= os::vm_page_size() bytes.
1338  // When increasing the stack by a larger amount, this method is
1339  // called repeatedly to bang the intermediate pages.
1340
1341  // Stack grows down, caller passes positive offset.
1342  assert(offset > 0, "must bang with positive offset");
1343
1344  long stdoffset = -offset;
1345
1346  if (is_simm(stdoffset, 16)) {
1347    // Signed 16 bit offset, a simple std is ok.
1348    if (UseLoadInstructionsForStackBangingPPC64) {
1349      ld(R0, (int)(signed short)stdoffset, R1_SP);
1350    } else {
1351      std(R0, (int)(signed short)stdoffset, R1_SP);
1352    }
1353  } else if (is_simm(stdoffset, 31)) {
1354    const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
1355    const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
1356
1357    Register tmp = R11;
1358    addis(tmp, R1_SP, hi);
1359    if (UseLoadInstructionsForStackBangingPPC64) {
1360      ld(R0,  lo, tmp);
1361    } else {
1362      std(R0, lo, tmp);
1363    }
1364  } else {
1365    ShouldNotReachHere();
1366  }
1367}
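
// Illustrative example: banging at offset 0x12345 (too large for a 16-bit
// displacement) with UseLoadInstructionsForStackBangingPPC64 off emits roughly
//   addis R11, R1_SP, -1        // R11 = SP - 0x10000
//   std   R0, -0x2345(R11)      // touches SP - 0x12345
// while offsets within the signed 16-bit range collapse to a single std (or ld).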
1368
1369// If instruction is a stack bang of the form
1370//    std    R0,    x(Ry),       (see bang_stack_with_offset())
1371//    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
1372// or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
1373// return the banged address. Otherwise, return 0.
1374address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
1375#ifdef LINUX
1376  ucontext_t* uc = (ucontext_t*) ucontext;
1377  int rs = inv_rs_field(instruction);
1378  int ra = inv_ra_field(instruction);
1379  if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
1380      || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
1381      || (is_stdu(instruction) && rs == 1)) {
1382    int ds = inv_ds_field(instruction);
1383    // return banged address
1384    return ds+(address)uc->uc_mcontext.regs->gpr[ra];
1385  } else if (is_stdux(instruction) && rs == 1) {
1386    int rb = inv_rb_field(instruction);
1387    address sp = (address)uc->uc_mcontext.regs->gpr[1];
1388    long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1389    return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
1390                                  : sp + rb_val; // banged address
1391  }
1392  return NULL; // not a stack bang
1393#else
1394  // workaround not needed on !LINUX :-)
1395  ShouldNotCallThis();
1396  return NULL;
1397#endif
1398}
1399
1400// CmpxchgX sets condition register to cmpX(current, compare).
1401void MacroAssembler::cmpxchgw(ConditionRegister flag, Register dest_current_value,
1402                              Register compare_value, Register exchange_value,
1403                              Register addr_base, int semantics, bool cmpxchgx_hint,
1404                              Register int_flag_success, bool contention_hint) {
1405  Label retry;
1406  Label failed;
1407  Label done;
1408
1409  // Save one branch if result is returned via register and
1410  // result register is different from the other ones.
1411  bool use_result_reg    = (int_flag_success != noreg);
1412  bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
1413                            int_flag_success != exchange_value && int_flag_success != addr_base);
1414
1415  if (use_result_reg && preset_result_reg) {
1416    li(int_flag_success, 0); // preset (assume cas failed)
1417  }
1418
1419  // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1420  if (contention_hint) { // Don't try to reserve if cmp fails.
1421    lwz(dest_current_value, 0, addr_base);
1422    cmpw(flag, dest_current_value, compare_value);
1423    bne(flag, failed);
1424  }
1425
1426  // release/fence semantics
1427  if (semantics & MemBarRel) {
1428    release();
1429  }
1430
1431  // atomic emulation loop
1432  bind(retry);
1433
1434  lwarx(dest_current_value, addr_base, cmpxchgx_hint);
1435  cmpw(flag, dest_current_value, compare_value);
1436  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1437    bne_predict_not_taken(flag, failed);
1438  } else {
1439    bne(                  flag, failed);
1440  }
1441  // branch to failed => (flag == ne), (dest_current_value != compare_value)
1442  // fall through    => (flag == eq), (dest_current_value == compare_value)
1443
1444  stwcx_(exchange_value, addr_base);
1445  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1446    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1447  } else {
1448    bne(                  CCR0, retry); // StXcx_ sets CCR0.
1449  }
1450  // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)
1451
1452  // Result in register (must do this at the end because int_flag_success can be the
1453  // same register as one above).
1454  if (use_result_reg) {
1455    li(int_flag_success, 1);
1456  }
1457
1458  if (semantics & MemBarFenceAfter) {
1459    fence();
1460  } else if (semantics & MemBarAcq) {
1461    isync();
1462  }
1463
1464  if (use_result_reg && !preset_result_reg) {
1465    b(done);
1466  }
1467
1468  bind(failed);
1469  if (use_result_reg && !preset_result_reg) {
1470    li(int_flag_success, 0);
1471  }
1472
1473  bind(done);
1474  // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1475  // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1476}
1477
1478// Performs an atomic compare-exchange:
1479//   if (compare_value == *addr_base)
1480//     *addr_base = exchange_value
1481//     int_flag_success = 1;
1482//   else
1483//     int_flag_success = 0;
1484//
1485// ConditionRegister flag       = cmp(compare_value, *addr_base)
1486// Register dest_current_value  = *addr_base
1487// Register compare_value       Used to compare with value in memory
1488// Register exchange_value      Written to memory if compare_value == *addr_base
1489// Register addr_base           The memory location to compareXChange
1490// Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
1491//
1492// To avoid the costly compare-exchange, the value can be tested beforehand (contention_hint).
1493// Several special cases exist to avoid generating unnecessary code.
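//
// A typical call (cf. biased_locking_enter below) CASes the mark word and
// branches to a slow case on failure; the register names here are illustrative:
//   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
//            /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
//            /*where=*/obj_reg,
//            MacroAssembler::MemBarAcq, MacroAssembler::cmpxchgx_hint_acquire_lock(),
//            noreg, &slow_case); // no int result register, branch to slow_case on failure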
1494//
1495void MacroAssembler::cmpxchgd(ConditionRegister flag,
1496                              Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
1497                              Register addr_base, int semantics, bool cmpxchgx_hint,
1498                              Register int_flag_success, Label* failed_ext, bool contention_hint) {
1499  Label retry;
1500  Label failed_int;
1501  Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
1502  Label done;
1503
1504  // Save one branch if result is returned via register and result register is different from the other ones.
1505  bool use_result_reg    = (int_flag_success!=noreg);
1506  bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() &&
1507                            int_flag_success!=exchange_value && int_flag_success!=addr_base);
1508  assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");
1509
1510  if (use_result_reg && preset_result_reg) {
1511    li(int_flag_success, 0); // preset (assume cas failed)
1512  }
1513
1514  // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1515  if (contention_hint) { // Don't try to reserve if cmp fails.
1516    ld(dest_current_value, 0, addr_base);
1517    cmpd(flag, compare_value, dest_current_value);
1518    bne(flag, failed);
1519  }
1520
1521  // release/fence semantics
1522  if (semantics & MemBarRel) {
1523    release();
1524  }
1525
1526  // atomic emulation loop
1527  bind(retry);
1528
1529  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1530  cmpd(flag, compare_value, dest_current_value);
1531  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1532    bne_predict_not_taken(flag, failed);
1533  } else {
1534    bne(                  flag, failed);
1535  }
1536
1537  stdcx_(exchange_value, addr_base);
1538  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1539    bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
1540  } else {
1541    bne(                  CCR0, retry); // stXcx_ sets CCR0
1542  }
1543
1544  // result in register (must do this at the end because int_flag_success can be the same register as one above)
1545  if (use_result_reg) {
1546    li(int_flag_success, 1);
1547  }
1548
1549  if (semantics & MemBarFenceAfter) {
1550    fence();
1551  } else if (semantics & MemBarAcq) {
1552    isync();
1553  }
1554
1555  if (use_result_reg && !preset_result_reg) {
1556    b(done);
1557  }
1558
1559  bind(failed_int);
1560  if (use_result_reg && !preset_result_reg) {
1561    li(int_flag_success, 0);
1562  }
1563
1564  bind(done);
1565  // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1566  // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1567}
1568
1569// Look up the method for a megamorphic invokeinterface call.
1570// The target method is determined by <intf_klass, itable_index>.
1571// The receiver klass is in recv_klass.
1572// On success, the result will be in method_result, and execution falls through.
1573// On failure, execution transfers to the given label.
1574void MacroAssembler::lookup_interface_method(Register recv_klass,
1575                                             Register intf_klass,
1576                                             RegisterOrConstant itable_index,
1577                                             Register method_result,
1578                                             Register scan_temp,
1579                                             Register sethi_temp,
1580                                             Label& L_no_such_interface) {
1581  assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1582  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
1583         "caller must use same register for non-constant itable index as for method");
1584
1585  // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1586  int vtable_base = InstanceKlass::vtable_start_offset() * wordSize;
1587  int itentry_off = itableMethodEntry::method_offset_in_bytes();
1588  int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
1589  int scan_step   = itableOffsetEntry::size() * wordSize;
1590  int log_vte_size= exact_log2(vtableEntry::size() * wordSize);
1591
1592  lwz(scan_temp, InstanceKlass::vtable_length_offset() * wordSize, recv_klass);
1593  // %%% We should store the aligned, prescaled offset in the klassoop.
1594  // Then the next several instructions would fold away.
1595
1596  sldi(scan_temp, scan_temp, log_vte_size);
1597  addi(scan_temp, scan_temp, vtable_base);
1598  add(scan_temp, recv_klass, scan_temp);
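  // (Sketch) scan_temp now points at the first itableOffsetEntry:
  //   scan_temp = recv_klass + vtable_base + (vtable_length << log_vte_size)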
1599
1600  // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1601  if (itable_index.is_register()) {
1602    Register itable_offset = itable_index.as_register();
1603    sldi(itable_offset, itable_offset, logMEsize);
1604    if (itentry_off) addi(itable_offset, itable_offset, itentry_off);
1605    add(recv_klass, itable_offset, recv_klass);
1606  } else {
1607    long itable_offset = (long)itable_index.as_constant();
1608    load_const_optimized(sethi_temp, (itable_offset<<logMEsize)+itentry_off); // static address, no relocation
1609    add(recv_klass, sethi_temp, recv_klass);
1610  }
1611
1612  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
1613  //   if (scan->interface() == intf) {
1614  //     result = (klass + scan->offset() + itable_index);
1615  //   }
1616  // }
1617  Label search, found_method;
1618
1619  for (int peel = 1; peel >= 0; peel--) {
1620    // %%%% Could load both offset and interface in one ldx, if they were
1621    // in the opposite order. This would save a load.
1622    ld(method_result, itableOffsetEntry::interface_offset_in_bytes(), scan_temp);
1623
1624    // Check that this entry is non-null. A null entry means that
1625    // the receiver class doesn't implement the interface, and wasn't the
1626    // same as when the caller was compiled.
1627    cmpd(CCR0, method_result, intf_klass);
1628
1629    if (peel) {
1630      beq(CCR0, found_method);
1631    } else {
1632      bne(CCR0, search);
1633      // (invert the test to fall through to found_method...)
1634    }
1635
1636    if (!peel) break;
1637
1638    bind(search);
1639
1640    cmpdi(CCR0, method_result, 0);
1641    beq(CCR0, L_no_such_interface);
1642    addi(scan_temp, scan_temp, scan_step);
1643  }
1644
1645  bind(found_method);
1646
1647  // Got a hit.
1648  int ito_offset = itableOffsetEntry::offset_offset_in_bytes();
1649  lwz(scan_temp, ito_offset, scan_temp);
1650  ldx(method_result, scan_temp, recv_klass);
1651}
1652
1653// virtual method calling
1654void MacroAssembler::lookup_virtual_method(Register recv_klass,
1655                                           RegisterOrConstant vtable_index,
1656                                           Register method_result) {
1657
1658  assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1659
1660  const int base = InstanceKlass::vtable_start_offset() * wordSize;
1661  assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1662
1663  if (vtable_index.is_register()) {
1664    sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1665    add(recv_klass, vtable_index.as_register(), recv_klass);
1666  } else {
1667    addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1668  }
1669  ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass);
1670}
1671
1672/////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1673void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1674                                                   Register super_klass,
1675                                                   Register temp1_reg,
1676                                                   Register temp2_reg,
1677                                                   Label* L_success,
1678                                                   Label* L_failure,
1679                                                   Label* L_slow_path,
1680                                                   RegisterOrConstant super_check_offset) {
1681
1682  const Register check_cache_offset = temp1_reg;
1683  const Register cached_super       = temp2_reg;
1684
1685  assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1686
1687  int sco_offset = in_bytes(Klass::super_check_offset_offset());
1688  int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());
1689
1690  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1691  bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);
1692
1693  Label L_fallthrough;
1694  int label_nulls = 0;
1695  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1696  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1697  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
1698  assert(label_nulls <= 1 ||
1699         (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
1700         "at most one NULL in the batch, usually");
1701
1702  // If the pointers are equal, we are done (e.g., String[] elements).
1703  // This self-check enables sharing of secondary supertype arrays among
1704  // non-primary types such as array-of-interface. Otherwise, each such
1705  // type would need its own customized SSA.
1706  // We move this check to the front of the fast path because many
1707  // type checks are in fact trivially successful in this manner,
1708  // so we get a nicely predicted branch right at the start of the check.
1709  cmpd(CCR0, sub_klass, super_klass);
1710  beq(CCR0, *L_success);
1711
1712  // Check the supertype display:
1713  if (must_load_sco) {
1714    // The super check offset is always positive...
1715    lwz(check_cache_offset, sco_offset, super_klass);
1716    super_check_offset = RegisterOrConstant(check_cache_offset);
1717    // super_check_offset is register.
1718    assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
1719  }
1720  // The loaded value is the offset from KlassOopDesc.
1721
1722  ld(cached_super, super_check_offset, sub_klass);
1723  cmpd(CCR0, cached_super, super_klass);
1724
1725  // This check has worked decisively for primary supers.
1726  // Secondary supers are sought in the super_cache ('super_cache_addr').
1727  // (Secondary supers are interfaces and very deeply nested subtypes.)
1728  // This works in the same check above because of a tricky aliasing
1729  // between the super_cache and the primary super display elements.
1730  // (The 'super_check_addr' can address either, as the case requires.)
1731  // Note that the cache is updated below if it does not help us find
1732  // what we need immediately.
1733  // So if it was a primary super, we can just fail immediately.
1734  // Otherwise, it's the slow path for us (no success at this point).
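  //
  // Decision sketch of the branches emitted below:
  //   if (*(sub_klass + super_check_offset) == super_klass) goto L_success;
  //   else if (super_check_offset != sc_offset)             goto L_failure;   // primary-super miss is decisive
  //   else                                                  goto L_slow_path; // must scan secondary supers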
1735
1736#define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }
1737
1738  if (super_check_offset.is_register()) {
1739    beq(CCR0, *L_success);
1740    cmpwi(CCR0, super_check_offset.as_register(), sc_offset);
1741    if (L_failure == &L_fallthrough) {
1742      beq(CCR0, *L_slow_path);
1743    } else {
1744      bne(CCR0, *L_failure);
1745      FINAL_JUMP(*L_slow_path);
1746    }
1747  } else {
1748    if (super_check_offset.as_constant() == sc_offset) {
1749      // Need a slow path; fast failure is impossible.
1750      if (L_slow_path == &L_fallthrough) {
1751        beq(CCR0, *L_success);
1752      } else {
1753        bne(CCR0, *L_slow_path);
1754        FINAL_JUMP(*L_success);
1755      }
1756    } else {
1757      // No slow path; it's a fast decision.
1758      if (L_failure == &L_fallthrough) {
1759        beq(CCR0, *L_success);
1760      } else {
1761        bne(CCR0, *L_failure);
1762        FINAL_JUMP(*L_success);
1763      }
1764    }
1765  }
1766
1767  bind(L_fallthrough);
1768#undef FINAL_JUMP
1769}
1770
1771void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1772                                                   Register super_klass,
1773                                                   Register temp1_reg,
1774                                                   Register temp2_reg,
1775                                                   Label* L_success,
1776                                                   Register result_reg) {
1777  const Register array_ptr = temp1_reg; // current value from cache array
1778  const Register temp      = temp2_reg;
1779
1780  assert_different_registers(sub_klass, super_klass, array_ptr, temp);
1781
1782  int source_offset = in_bytes(Klass::secondary_supers_offset());
1783  int target_offset = in_bytes(Klass::secondary_super_cache_offset());
1784
1785  int length_offset = Array<Klass*>::length_offset_in_bytes();
1786  int base_offset   = Array<Klass*>::base_offset_in_bytes();
1787
1788  Label hit, loop, failure, fallthru;
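  // Scan sketch (what the loop below does):
  //   Array<Klass*>* ss = sub_klass->secondary_supers();
  //   for (i = 0; i < ss->length(); i++)
  //     if (ss->at(i) == super_klass) { store super_klass into sub_klass' secondary_super_cache; goto hit; }
  //   goto failure;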
1789
1790  ld(array_ptr, source_offset, sub_klass);
1791
1792  // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
1793  lwz(temp, length_offset, array_ptr);
1794  cmpwi(CCR0, temp, 0);
1795  beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0
1796
1797  mtctr(temp); // load ctr
1798
1799  bind(loop);
1800  // Oops in the table are no longer compressed.
1801  ld(temp, base_offset, array_ptr);
1802  cmpd(CCR0, temp, super_klass);
1803  beq(CCR0, hit);
1804  addi(array_ptr, array_ptr, BytesPerWord);
1805  bdnz(loop);
1806
1807  bind(failure);
1808  if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss)
1809  b(fallthru);
1810
1811  bind(hit);
1812  std(super_klass, target_offset, sub_klass); // save result to cache
1813  if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)
1814  if (L_success != NULL) { b(*L_success); }
1815  else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided
1816
1817  bind(fallthru);
1818}
1819
1820// Try fast path, then go to slow one if not successful
1821void MacroAssembler::check_klass_subtype(Register sub_klass,
1822                         Register super_klass,
1823                         Register temp1_reg,
1824                         Register temp2_reg,
1825                         Label& L_success) {
1826  Label L_failure;
1827  check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
1828  check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
1829  bind(L_failure); // Fallthru if not successful.
1830}
1831
1832void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg,
1833                                              Register temp_reg,
1834                                              Label& wrong_method_type) {
1835  assert_different_registers(mtype_reg, mh_reg, temp_reg);
1836  // Compare method type against that of the receiver.
1837  load_heap_oop_not_null(temp_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg), mh_reg);
1838  cmpd(CCR0, temp_reg, mtype_reg);
1839  bne(CCR0, wrong_method_type);
1840}
1841
1842RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
1843                                                   Register temp_reg,
1844                                                   int extra_slot_offset) {
1845  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1846  int stackElementSize = Interpreter::stackElementSize;
1847  int offset = extra_slot_offset * stackElementSize;
1848  if (arg_slot.is_constant()) {
1849    offset += arg_slot.as_constant() * stackElementSize;
1850    return offset;
1851  } else {
1852    assert(temp_reg != noreg, "must specify");
1853    sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
1854    if (offset != 0)
1855      addi(temp_reg, temp_reg, offset);
1856    return temp_reg;
1857  }
1858}
1859
1860// Supports temp2_reg = R0.
1861void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg,
1862                                          Register mark_reg, Register temp_reg,
1863                                          Register temp2_reg, Label& done, Label* slow_case) {
1864  assert(UseBiasedLocking, "why call this otherwise?");
1865
1866#ifdef ASSERT
1867  assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg);
1868#endif
1869
1870  Label cas_label;
1871
1872  // Branch to done if fast path fails and no slow_case provided.
1873  Label *slow_case_int = (slow_case != NULL) ? slow_case : &done;
1874
1875  // Biased locking
1876  // See whether the lock is currently biased toward our thread and
1877  // whether the epoch is still valid
1878  // Note that the runtime guarantees sufficient alignment of JavaThread
1879  // pointers to allow age to be placed into low bits
1880  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits,
1881         "biased locking makes assumptions about bit layout");
1882
1883  if (PrintBiasedLockingStatistics) {
1884    load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg);
1885    lwzx(temp_reg, temp2_reg);
1886    addi(temp_reg, temp_reg, 1);
1887    stwx(temp_reg, temp2_reg);
1888  }
1889
1890  andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place);
1891  cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
1892  bne(cr_reg, cas_label);
1893
1894  load_klass(temp_reg, obj_reg);
1895
1896  load_const_optimized(temp2_reg, ~((int) markOopDesc::age_mask_in_place));
1897  ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
1898  orr(temp_reg, R16_thread, temp_reg);
1899  xorr(temp_reg, mark_reg, temp_reg);
1900  andr(temp_reg, temp_reg, temp2_reg);
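  // temp_reg is now zero iff the mark word equals (prototype_header | thread)
  // in all bits outside the age field, i.e. the object is biased toward us
  // and the bias epoch is current.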
1901  cmpdi(cr_reg, temp_reg, 0);
1902  if (PrintBiasedLockingStatistics) {
1903    Label l;
1904    bne(cr_reg, l);
1905    load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr());
1906    lwzx(mark_reg, temp2_reg);
1907    addi(mark_reg, mark_reg, 1);
1908    stwx(mark_reg, temp2_reg);
1909    // restore mark_reg
1910    ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
1911    bind(l);
1912  }
1913  beq(cr_reg, done);
1914
1915  Label try_revoke_bias;
1916  Label try_rebias;
1917
1918  // At this point we know that the header has the bias pattern and
1919  // that we are not the bias owner in the current epoch. We need to
1920  // figure out more details about the state of the header in order to
1921  // know what operations can be legally performed on the object's
1922  // header.
1923
1924  // If the low three bits in the xor result aren't clear, that means
1925  // the prototype header is no longer biased and we have to revoke
1926  // the bias on this object.
1927  andi(temp2_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
1928  cmpwi(cr_reg, temp2_reg, 0);
1929  bne(cr_reg, try_revoke_bias);
1930
1931  // Biasing is still enabled for this data type. See whether the
1932  // epoch of the current bias is still valid, meaning that the epoch
1933  // bits of the mark word are equal to the epoch bits of the
1934  // prototype header. (Note that the prototype header's epoch bits
1935  // only change at a safepoint.) If not, attempt to rebias the object
1936  // toward the current thread. Note that we must be absolutely sure
1937  // that the current epoch is invalid in order to do this because
1938  // otherwise the manipulations it performs on the mark word are
1939  // illegal.
1940
1941  int shift_amount = 64 - markOopDesc::epoch_shift;
1942  // rotate epoch bits to right (little) end and set other bits to 0
1943  // [ big part | epoch | little part ] -> [ 0..0 | epoch ]
1944  rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markOopDesc::epoch_bits);
1945  // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented
1946  bne(CCR0, try_rebias);
1947
1948  // The epoch of the current bias is still valid but we know nothing
1949  // about the owner; it might be set or it might be clear. Try to
1950  // acquire the bias of the object using an atomic operation. If this
1951  // fails we will go in to the runtime to revoke the object's bias.
1952  // Note that we first construct the presumed unbiased header so we
1953  // don't accidentally blow away another thread's valid bias.
1954  andi(mark_reg, mark_reg, (markOopDesc::biased_lock_mask_in_place |
1955                                markOopDesc::age_mask_in_place |
1956                                markOopDesc::epoch_mask_in_place));
1957  orr(temp_reg, R16_thread, mark_reg);
1958
1959  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
1960
1961  // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
1962  cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
1963           /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
1964           /*where=*/obj_reg,
1965           MacroAssembler::MemBarAcq,
1966           MacroAssembler::cmpxchgx_hint_acquire_lock(),
1967           noreg, slow_case_int); // bail out if failed
1968
1969  // If the biasing toward our thread failed, this means that
1970  // another thread succeeded in biasing it toward itself and we
1971  // need to revoke that bias. The revocation will occur in the
1972  // interpreter runtime in the slow case.
1973  if (PrintBiasedLockingStatistics) {
1974    load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg);
1975    lwzx(temp_reg, temp2_reg);
1976    addi(temp_reg, temp_reg, 1);
1977    stwx(temp_reg, temp2_reg);
1978  }
1979  b(done);
1980
1981  bind(try_rebias);
1982  // At this point we know the epoch has expired, meaning that the
1983  // current "bias owner", if any, is actually invalid. Under these
1984  // circumstances _only_, we are allowed to use the current header's
1985  // value as the comparison value when doing the cas to acquire the
1986  // bias in the current epoch. In other words, we allow transfer of
1987  // the bias from one thread to another directly in this situation.
1988  load_klass(temp_reg, obj_reg);
1989  andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
1990  orr(temp2_reg, R16_thread, temp2_reg);
1991  ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
1992  orr(temp_reg, temp2_reg, temp_reg);
1993
1994  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
1995
1996  cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
1997                 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
1998                 /*where=*/obj_reg,
1999                 MacroAssembler::MemBarAcq,
2000                 MacroAssembler::cmpxchgx_hint_acquire_lock(),
2001                 noreg, slow_case_int); // bail out if failed
2002
2003  // If the biasing toward our thread failed, this means that
2004  // another thread succeeded in biasing it toward itself and we
2005  // need to revoke that bias. The revocation will occur in the
2006  // interpreter runtime in the slow case.
2007  if (PrintBiasedLockingStatistics) {
2008    load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg);
2009    lwzx(temp_reg, temp2_reg);
2010    addi(temp_reg, temp_reg, 1);
2011    stwx(temp_reg, temp2_reg);
2012  }
2013  b(done);
2014
2015  bind(try_revoke_bias);
2016  // The prototype mark in the klass doesn't have the bias bit set any
2017  // more, indicating that objects of this data type are not supposed
2018  // to be biased any more. We are going to try to reset the mark of
2019  // this object to the prototype value and fall through to the
2020  // CAS-based locking scheme. Note that if our CAS fails, it means
2021  // that another thread raced us for the privilege of revoking the
2022  // bias of this particular object, so it's okay to continue in the
2023  // normal locking code.
2024  load_klass(temp_reg, obj_reg);
2025  ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2026  andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
2027  orr(temp_reg, temp_reg, temp2_reg);
2028
2029  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2030
2031  // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
2032  cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2033                 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2034                 /*where=*/obj_reg,
2035                 MacroAssembler::MemBarAcq,
2036                 MacroAssembler::cmpxchgx_hint_acquire_lock());
2037
2038  // reload markOop in mark_reg before continuing with lightweight locking
2039  ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
2040
2041  // Fall through to the normal CAS-based lock, because no matter what
2042  // the result of the above CAS, some thread must have succeeded in
2043  // removing the bias bit from the object's header.
2044  if (PrintBiasedLockingStatistics) {
2045    Label l;
2046    bne(cr_reg, l);
2047    load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg);
2048    lwzx(temp_reg, temp2_reg);
2049    addi(temp_reg, temp_reg, 1);
2050    stwx(temp_reg, temp2_reg);
2051    bind(l);
2052  }
2053
2054  bind(cas_label);
2055}
2056
2057void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) {
2058  // Check for biased locking unlock case, which is a no-op
2059  // Note: we do not have to check the thread ID for two reasons.
2060  // First, the interpreter checks for IllegalMonitorStateException at
2061  // a higher level. Second, if the bias was revoked while we held the
2062  // lock, the object could not be rebiased toward another thread, so
2063  // the bias bit would be clear.
2064
2065  ld(temp_reg, 0, mark_addr);
2066  andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
2067
2068  cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
2069  beq(cr_reg, done);
2070}
2071
2072// allocation (for C1)
2073void MacroAssembler::eden_allocate(
2074  Register obj,                      // result: pointer to object after successful allocation
2075  Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2076  int      con_size_in_bytes,        // object size in bytes if   known at compile time
2077  Register t1,                       // temp register
2078  Register t2,                       // temp register
2079  Label&   slow_case                 // continuation point if fast allocation fails
2080) {
2081  b(slow_case);
2082}
2083
2084void MacroAssembler::tlab_allocate(
2085  Register obj,                      // result: pointer to object after successful allocation
2086  Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2087  int      con_size_in_bytes,        // object size in bytes if   known at compile time
2088  Register t1,                       // temp register
2089  Label&   slow_case                 // continuation point if fast allocation fails
2090) {
2091  // make sure arguments make sense
2092  assert_different_registers(obj, var_size_in_bytes, t1);
2093  assert(0 <= con_size_in_bytes && is_simm13(con_size_in_bytes), "illegal object size");
2094  assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
2095
2096  const Register new_top = t1;
2097  //verify_tlab(); not implemented
2098
2099  ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2100  ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
2101  if (var_size_in_bytes == noreg) {
2102    addi(new_top, obj, con_size_in_bytes);
2103  } else {
2104    add(new_top, obj, var_size_in_bytes);
2105  }
2106  cmpld(CCR0, new_top, R0);
2107  bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);
2108
2109#ifdef ASSERT
2110  // make sure new free pointer is properly aligned
2111  {
2112    Label L;
2113    andi_(R0, new_top, MinObjAlignmentInBytesMask);
2114    beq(CCR0, L);
2115    stop("updated TLAB free is not properly aligned", 0x934);
2116    bind(L);
2117  }
2118#endif // ASSERT
2119
2120  // update the tlab top pointer
2121  std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2122  //verify_tlab(); not implemented
2123}
2124void MacroAssembler::tlab_refill(Label& retry_tlab, Label& try_eden, Label& slow_case) {
2125  unimplemented("tlab_refill");
2126}
2127void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) {
2128  unimplemented("incr_allocated_bytes");
2129}
2130
2131address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,
2132                                             int insts_call_instruction_offset, Register Rtoc) {
2133  // Start the stub.
2134  address stub = start_a_stub(64);
2135  if (stub == NULL) { return NULL; } // CodeCache full: bail out
2136
2137  // Create a trampoline stub relocation which relates this trampoline stub
2138  // with the call instruction at insts_call_instruction_offset in the
2139  // instructions code-section.
2140  relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));
2141  const int stub_start_offset = offset();
2142
2143  // For java_to_interp stubs we use R11_scratch1 as scratch register
2144  // and in call trampoline stubs we use R12_scratch2. This way we
2145  // can distinguish them (see is_NativeCallTrampolineStub_at()).
2146  Register reg_scratch = R12_scratch2;
2147
2148  // Now, create the trampoline stub's code:
2149  // - load the TOC
2150  // - load the call target from the constant pool
2151  // - call
2152  if (Rtoc == noreg) {
2153    calculate_address_from_global_toc(reg_scratch, method_toc());
2154    Rtoc = reg_scratch;
2155  }
2156
2157  ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
2158  mtctr(reg_scratch);
2159  bctr();
2160
2161  const address stub_start_addr = addr_at(stub_start_offset);
2162
2163  // Assert that the encoded destination_toc_offset can be identified and that it is correct.
2164  assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
2165         "encoded offset into the constant pool must match");
2166  // Trampoline_stub_size should be good.
2167  assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
2168  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
2169
2170  // End the stub.
2171  end_a_stub();
2172  return stub;
2173}
2174
2175// TM on PPC64.
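// These two helpers emit load-reserve/store-conditional (l[dw]arx/st[dw]cx_)
// retry loops: atomic_inc_ptr adds simm16 to the 64-bit word at addr,
// atomic_ori_int ORs uimm16 into the 32-bit word at addr. The RTM statistics
// code below uses them (see rtm_abort_ratio_calculation).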
2176void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) {
2177  Label retry;
2178  bind(retry);
2179  ldarx(result, addr, /*hint*/ false);
2180  addi(result, result, simm16);
2181  stdcx_(result, addr);
2182  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2183    bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2184  } else {
2185    bne(                  CCR0, retry); // stXcx_ sets CCR0
2186  }
2187}
2188
2189void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) {
2190  Label retry;
2191  bind(retry);
2192  lwarx(result, addr, /*hint*/ false);
2193  ori(result, result, uimm16);
2194  stwcx_(result, addr);
2195  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2196    bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2197  } else {
2198    bne(                  CCR0, retry); // stXcx_ sets CCR0
2199  }
2200}
2201
2202#if INCLUDE_RTM_OPT
2203
2204// Update rtm_counters based on abort status
2205// input: abort_status
2206//        rtm_counters (RTMLockingCounters*)
2207void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) {
2208  // Mapping to keep PreciseRTMLockingStatistics similar to x86.
2209  // x86 ppc (! means inverted, ? means not the same)
2210  //  0   31  Set if abort caused by XABORT instruction.
2211  //  1  ! 7  If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set.
2212  //  2   13  Set if another logical processor conflicted with a memory address that was part of the transaction that aborted.
2213  //  3   10  Set if an internal buffer overflowed.
2214  //  4  ?12  Set if a debug breakpoint was hit.
2215  //  5  ?32  Set if an abort occurred during execution of a nested transaction.
2216  const  int tm_failure_bit[] = {Assembler::tm_tabort, // Note: Seems like signal handler sets this, too.
2217                                 Assembler::tm_failure_persistent, // inverted: transient
2218                                 Assembler::tm_trans_cf,
2219                                 Assembler::tm_footprint_of,
2220                                 Assembler::tm_non_trans_cf,
2221                                 Assembler::tm_suspended};
2222  const bool tm_failure_inv[] = {false, true, false, false, false, false};
2223  assert(sizeof(tm_failure_bit)/sizeof(int) == RTMLockingCounters::ABORT_STATUS_LIMIT, "adapt mapping!");
2224
2225  const Register addr_Reg = R0;
2226  // Keep track of offset to where rtm_counters_Reg had pointed to.
2227  int counters_offs = RTMLockingCounters::abort_count_offset();
2228  addi(addr_Reg, rtm_counters_Reg, counters_offs);
2229  const Register temp_Reg = rtm_counters_Reg;
2230
2231  //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
2232  ldx(temp_Reg, addr_Reg);
2233  addi(temp_Reg, temp_Reg, 1);
2234  stdx(temp_Reg, addr_Reg);
2235
2236  if (PrintPreciseRTMLockingStatistics) {
2237    int counters_offs_delta = RTMLockingCounters::abortX_count_offset() - counters_offs;
2238
2239    //mftexasr(abort_status); done by caller
2240    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
2241      counters_offs += counters_offs_delta;
2242      li(temp_Reg, counters_offs_delta); // can't use addi with R0
2243      add(addr_Reg, addr_Reg, temp_Reg); // point to next counter
2244      counters_offs_delta = sizeof(uintx);
2245
2246      Label check_abort;
2247      rldicr_(temp_Reg, abort_status, tm_failure_bit[i], 0);
2248      if (tm_failure_inv[i]) {
2249        bne(CCR0, check_abort);
2250      } else {
2251        beq(CCR0, check_abort);
2252      }
2253      //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
2254      ldx(temp_Reg, addr_Reg);
2255      addi(temp_Reg, temp_Reg, 1);
2256      stdx(temp_Reg, addr_Reg);
2257      bind(check_abort);
2258    }
2259  }
2260  li(temp_Reg, -counters_offs); // can't use addi with R0
2261  add(rtm_counters_Reg, addr_Reg, temp_Reg); // restore
2262}
2263
2264// Branch if (random & (count-1) != 0), count is 2^n
2265// tmp and CR0 are killed
2266void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) {
2267  mftb(tmp);
2268  andi_(tmp, tmp, count-1);
2269  bne(CCR0, brLabel);
2270}
2271
2272// Perform abort ratio calculation, set no_rtm bit if high ratio.
2273// input:  rtm_counters_Reg (RTMLockingCounters* address) - KILLED
2274void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg,
2275                                                 RTMLockingCounters* rtm_counters,
2276                                                 Metadata* method_data) {
2277  Label L_done, L_check_always_rtm1, L_check_always_rtm2;
2278
2279  if (RTMLockingCalculationDelay > 0) {
2280    // Delay calculation.
2281    ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr());
2282    cmpdi(CCR0, rtm_counters_Reg, 0);
2283    beq(CCR0, L_done);
2284    load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2285  }
2286  // Abort ratio calculation only if abort_count > RTMAbortThreshold.
2287  //   Aborted transactions = abort_count * 100
2288  //   All transactions = total_count *  RTMTotalCountIncrRate
2289  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
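  // For example (illustrative numbers only): with RTMAbortRatio == 50 and
  // RTMTotalCountIncrRate == 1, the no_rtm bit is set once
  // abort_count * 100 >= total_count * 50, i.e. once at least half of all
  // transactions have aborted.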
2290  ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg);
2291  cmpdi(CCR0, R0, RTMAbortThreshold);
2292  blt(CCR0, L_check_always_rtm2);
2293  mulli(R0, R0, 100);
2294
2295  const Register tmpReg = rtm_counters_Reg;
2296  ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2297  mulli(tmpReg, tmpReg, RTMTotalCountIncrRate);
2298  mulli(tmpReg, tmpReg, RTMAbortRatio);
2299  cmpd(CCR0, R0, tmpReg);
2300  blt(CCR0, L_check_always_rtm1); // jump to reload
2301  if (method_data != NULL) {
2302    // Set rtm_state to "no rtm" in MDO.
2303    // Not using a metadata relocation. Method and Class Loader are kept alive anyway.
2304    // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.)
2305    load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2306    atomic_ori_int(R0, tmpReg, NoRTM);
2307  }
2308  b(L_done);
2309
2310  bind(L_check_always_rtm1);
2311  load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2312  bind(L_check_always_rtm2);
2313  ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2314  cmpdi(CCR0, tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
2315  blt(CCR0, L_done);
2316  if (method_data != NULL) {
2317    // Set rtm_state to "always rtm" in MDO.
2318    // Not using a metadata relocation. See above.
2319    load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2320    atomic_ori_int(R0, tmpReg, UseRTM);
2321  }
2322  bind(L_done);
2323}
2324
2325// Update counters and perform abort ratio calculation.
2326// input: abort_status_Reg
2327void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg,
2328                                   RTMLockingCounters* rtm_counters,
2329                                   Metadata* method_data,
2330                                   bool profile_rtm) {
2331
2332  assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2333  // Update rtm counters based on state at abort.
2334  // Reads abort_status_Reg, updates flags.
2335  assert_different_registers(abort_status_Reg, temp_Reg);
2336  load_const_optimized(temp_Reg, (address)rtm_counters, R0);
2337  rtm_counters_update(abort_status_Reg, temp_Reg);
2338  if (profile_rtm) {
2339    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2340    rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data);
2341  }
2342}
2343
2344// Retry on abort if abort's status indicates non-persistent failure.
2345// inputs: retry_count_Reg
2346//       : abort_status_Reg
2347// output: retry_count_Reg decremented by 1
2348void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg,
2349                                             Label& retryLabel, Label* checkRetry) {
2350  Label doneRetry;
2351  rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0);
2352  bne(CCR0, doneRetry);
2353  if (checkRetry) { bind(*checkRetry); }
2354  addic_(retry_count_Reg, retry_count_Reg, -1);
2355  blt(CCR0, doneRetry);
2356  smt_yield(); // Can't use wait(). No permission (SIGILL).
2357  b(retryLabel);
2358  bind(doneRetry);
2359}
2360
2361// Spin and retry if lock is busy.
2362// inputs: box_Reg (monitor address)
2363//       : retry_count_Reg
2364// output: retry_count_Reg decremented by 1
2365// CTR is killed
2366void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) {
2367  Label SpinLoop, doneRetry;
2368  addic_(retry_count_Reg, retry_count_Reg, -1);
2369  blt(CCR0, doneRetry);
2370  li(R0, RTMSpinLoopCount);
2371  mtctr(R0);
2372
2373  bind(SpinLoop);
2374  smt_yield(); // Can't use waitrsv(). No permission (SIGILL).
2375  bdz(retryLabel);
2376  ld(R0, 0, owner_addr_Reg);
2377  cmpdi(CCR0, R0, 0);
2378  bne(CCR0, SpinLoop);
2379  b(retryLabel);
2380
2381  bind(doneRetry);
2382}
2383
2384// Use RTM for normal stack locks.
2385// Input: objReg (object to lock)
2386void MacroAssembler::rtm_stack_locking(ConditionRegister flag,
2387                                       Register obj, Register mark_word, Register tmp,
2388                                       Register retry_on_abort_count_Reg,
2389                                       RTMLockingCounters* stack_rtm_counters,
2390                                       Metadata* method_data, bool profile_rtm,
2391                                       Label& DONE_LABEL, Label& IsInflated) {
2392  assert(UseRTMForStackLocks, "why call this otherwise?");
2393  assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2394  Label L_rtm_retry, L_decrement_retry, L_on_abort;
2395
2396  if (RTMRetryCount > 0) {
2397    load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
2398    bind(L_rtm_retry);
2399  }
2400  andi_(R0, mark_word, markOopDesc::monitor_value);  // inflated vs stack-locked|neutral|biased
2401  bne(CCR0, IsInflated);
2402
2403  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2404    Label L_noincrement;
2405    if (RTMTotalCountIncrRate > 1) {
2406      branch_on_random_using_tb(tmp, (int)RTMTotalCountIncrRate, L_noincrement);
2407    }
2408    assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
2409    load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0);
2410    //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically
2411    ldx(mark_word, tmp);
2412    addi(mark_word, mark_word, 1);
2413    stdx(mark_word, tmp);
2414    bind(L_noincrement);
2415  }
2416  tbegin_();
2417  beq(CCR0, L_on_abort);
2418  ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);         // Reload in transaction, conflicts need to be tracked.
2419  andi(R0, mark_word, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2420  cmpwi(flag, R0, markOopDesc::unlocked_value);                // bits = 001 unlocked
2421  beq(flag, DONE_LABEL);                                       // all done if unlocked
2422
2423  if (UseRTMXendForLockBusy) {
2424    tend_();
2425    b(L_decrement_retry);
2426  } else {
2427    tabort_();
2428  }
2429  bind(L_on_abort);
2430  const Register abort_status_Reg = tmp;
2431  mftexasr(abort_status_Reg);
2432  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2433    rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm);
2434  }
2435  ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload
2436  if (RTMRetryCount > 0) {
2437    // Retry on lock abort if abort status is not permanent.
2438    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry);
2439  } else {
2440    bind(L_decrement_retry);
2441  }
2442}
2443
2444// Use RTM for inflating locks
2445// inputs: obj       (object to lock)
2446//         mark_word (current header - KILLED)
2447//         boxReg    (on-stack box address (displaced header location) - KILLED)
2448void MacroAssembler::rtm_inflated_locking(ConditionRegister flag,
2449                                          Register obj, Register mark_word, Register boxReg,
2450                                          Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg,
2451                                          RTMLockingCounters* rtm_counters,
2452                                          Metadata* method_data, bool profile_rtm,
2453                                          Label& DONE_LABEL) {
2454  assert(UseRTMLocking, "why call this otherwise?");
2455  Label L_rtm_retry, L_decrement_retry, L_on_abort;
2456  // Clean monitor_value bit to get valid pointer.
2457  int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value;
2458
2459  // Store non-null, using boxReg instead of (intptr_t)markOopDesc::unused_mark().
2460  std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg);
2461  const Register tmpReg = boxReg;
2462  const Register owner_addr_Reg = mark_word;
2463  addi(owner_addr_Reg, mark_word, owner_offset);
2464
2465  if (RTMRetryCount > 0) {
2466    load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy.
2467    load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort.
2468    bind(L_rtm_retry);
2469  }
2470  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2471    Label L_noincrement;
2472    if (RTMTotalCountIncrRate > 1) {
2473      branch_on_random_using_tb(R0, (int)RTMTotalCountIncrRate, L_noincrement);
2474    }
2475    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2476    load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg);
2477    //atomic_inc_ptr(R0, tmpReg); We don't increment atomically
2478    ldx(tmpReg, R0);
2479    addi(tmpReg, tmpReg, 1);
2480    stdx(tmpReg, R0);
2481    bind(L_noincrement);
2482  }
2483  tbegin_();
2484  beq(CCR0, L_on_abort);
2485  // We don't reload mark word. Will only be reset at safepoint.
2486  ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked.
2487  cmpdi(flag, R0, 0);
2488  beq(flag, DONE_LABEL);
2489
2490  if (UseRTMXendForLockBusy) {
2491    tend_();
2492    b(L_decrement_retry);
2493  } else {
2494    tabort_();
2495  }
2496  bind(L_on_abort);
2497  const Register abort_status_Reg = tmpReg;
2498  mftexasr(abort_status_Reg);
2499  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2500    rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm);
2501    // Restore owner_addr_Reg
2502    ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);
2503#ifdef ASSERT
2504    andi_(R0, mark_word, markOopDesc::monitor_value);
2505    asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint.
2506#endif
2507    addi(owner_addr_Reg, mark_word, owner_offset);
2508  }
2509  if (RTMRetryCount > 0) {
2510    // Retry on lock abort if abort status is not permanent.
2511    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
2512  }
2513
2514  // Appears unlocked - try to swing _owner from null to non-null.
2515  cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg,
2516           MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2517           MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true);
2518
2519  if (RTMRetryCount > 0) {
2520    // On success we are done, otherwise retry.
2521    b(DONE_LABEL);
2522    bind(L_decrement_retry);
2523    // Spin and retry if lock is busy.
2524    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry);
2525  } else {
2526    bind(L_decrement_retry);
2527  }
2528}
2529
2530#endif //  INCLUDE_RTM_OPT
2531
2532// "The box" is the space on the stack where we copy the object mark.
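//
// Fast-lock sketch (what the code below emits, ignoring biased locking, RTM
// and the EmitSync shortcuts); the names follow the parameters and markOop terms:
//   if (obj->mark() & monitor_value)                     goto inflated (object_has_monitor) path;
//   displaced_header = obj->mark() | unlocked_value;
//   box->displaced_header = displaced_header;
//   if (CAS(&obj->mark(), displaced_header, box))        succeed;   // stack lock acquired
//   else if (((obj->mark() - SP) & (~(page_size-1) | lock_mask)) == 0)
//        { box->displaced_header = 0;                    succeed; } // recursive stack lock
//   else                                                 fail (flag == NE).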
2533void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
2534                                               Register temp, Register displaced_header, Register current_header,
2535                                               bool try_bias,
2536                                               RTMLockingCounters* rtm_counters,
2537                                               RTMLockingCounters* stack_rtm_counters,
2538                                               Metadata* method_data,
2539                                               bool use_rtm, bool profile_rtm) {
2540  assert_different_registers(oop, box, temp, displaced_header, current_header);
2541  assert(flag != CCR0, "bad condition register");
2542  Label cont;
2543  Label object_has_monitor;
2544  Label cas_failed;
2545
2546  // Load markOop from object into displaced_header.
2547  ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
2548
2549
2550  // Always do locking in runtime.
2551  if (EmitSync & 0x01) {
2552    cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2553    return;
2554  }
2555
2556  if (try_bias) {
2557    biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont);
2558  }
2559
2560#if INCLUDE_RTM_OPT
2561  if (UseRTMForStackLocks && use_rtm) {
2562    rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,
2563                      stack_rtm_counters, method_data, profile_rtm,
2564                      cont, object_has_monitor);
2565  }
2566#endif // INCLUDE_RTM_OPT
2567
2568  // Handle existing monitor.
2569  if ((EmitSync & 0x02) == 0) {
2570    // The object has an existing monitor iff (mark & monitor_value) != 0.
2571    andi_(temp, displaced_header, markOopDesc::monitor_value);
2572    bne(CCR0, object_has_monitor);
2573  }
2574
2575  // Set displaced_header to be (markOop of object | UNLOCK_VALUE).
2576  ori(displaced_header, displaced_header, markOopDesc::unlocked_value);
2577
2578  // Load Compare Value application register.
2579
2580  // Initialize the box. (Must happen before we update the object mark!)
2581  std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2582
2583  // Must fence; otherwise preceding store(s) may float below the cmpxchg.
2584  // Compare object markOop with mark and if equal exchange scratch1 with object markOop.
2585  cmpxchgd(/*flag=*/flag,
2586           /*current_value=*/current_header,
2587           /*compare_value=*/displaced_header,
2588           /*exchange_value=*/box,
2589           /*where=*/oop,
2590           MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2591           MacroAssembler::cmpxchgx_hint_acquire_lock(),
2592           noreg,
2593           &cas_failed,
2594           /*check without membar and ldarx first*/true);
2595  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2596
2597  // If the compare-and-exchange succeeded, then we found an unlocked
2598  // object and we have now locked it.
2599  b(cont);
2600
2601  bind(cas_failed);
2602  // We did not see an unlocked object so try the fast recursive case.
2603
2604  // Check if the owner is self by comparing the value in the markOop of object
2605  // (current_header) with the stack pointer.
2606  sub(current_header, current_header, R1_SP);
2607  load_const_optimized(temp, ~(os::vm_page_size()-1) | markOopDesc::lock_mask_in_place);
2608
2609  and_(R0/*==0?*/, current_header, temp);
2610  // If the condition is true we take the cont path and hence can store 0 as the
2611  // displaced header in the box, which indicates that it is a recursive lock.
2612  mcrf(flag,CCR0);
2613  std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2614
2615  // Handle existing monitor.
2616  if ((EmitSync & 0x02) == 0) {
2617    b(cont);
2618
2619    bind(object_has_monitor);
2620    // The object's monitor m is unlocked iff m->owner == NULL,
2621    // otherwise m->owner may contain a thread or a stack address.
2622
2623#if INCLUDE_RTM_OPT
2624    // Use the same RTM locking code in 32- and 64-bit VM.
2625    if (use_rtm) {
2626      rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
2627                           rtm_counters, method_data, profile_rtm, cont);
2628    } else {
2629#endif // INCLUDE_RTM_OPT
2630
2631    // Try to CAS m->owner from NULL to current thread.
2632    addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value);
2633    cmpxchgd(/*flag=*/flag,
2634             /*current_value=*/current_header,
2635             /*compare_value=*/(intptr_t)0,
2636             /*exchange_value=*/R16_thread,
2637             /*where=*/temp,
2638             MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2639             MacroAssembler::cmpxchgx_hint_acquire_lock());
2640
2641    // Store a non-null value into the box.
2642    std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2643
2644#   ifdef ASSERT
2645    bne(flag, cont);
2646    // We have acquired the monitor, check some invariants.
2647    addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes());
2648    // Invariant 1: _recursions should be 0.
2649    //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size");
2650    asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp,
2651                            "monitor->_recursions should be 0", -1);
2652    // Invariant 2: OwnerIsThread shouldn't be 0.
2653    //assert(ObjectMonitor::OwnerIsThread_size_in_bytes() == 4, "unexpected size");
2654    //asm_assert_mem4_isnot_zero(ObjectMonitor::OwnerIsThread_offset_in_bytes(), temp,
2655    //                           "monitor->OwnerIsThread shouldn't be 0", -1);
2656#   endif
2657
2658#if INCLUDE_RTM_OPT
2659    } // use_rtm()
2660#endif
2661  }
2662
2663  bind(cont);
2664  // flag == EQ indicates success
2665  // flag == NE indicates failure
2666}
2667
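// Fast-unlock sketch (what the code below emits, ignoring biased locking, RTM
// and the EmitSync shortcuts):
//   displaced_header = box->displaced_header;
//   if (displaced_header == 0)                           succeed;   // recursive unlock
//   else if (obj->mark() & monitor_value)                goto inflated (object_has_monitor) path;
//   else if (CAS(&obj->mark(), box, displaced_header))   succeed;   // header restored
//   else                                                 fail (flag == NE).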
2668void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2669                                                 Register temp, Register displaced_header, Register current_header,
2670                                                 bool try_bias, bool use_rtm) {
2671  assert_different_registers(oop, box, temp, displaced_header, current_header);
2672  assert(flag != CCR0, "bad condition register");
2673  Label cont;
2674  Label object_has_monitor;
2675
2676  // Always do locking in runtime.
2677  if (EmitSync & 0x01) {
2678    cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2679    return;
2680  }
2681
2682  if (try_bias) {
2683    biased_locking_exit(flag, oop, current_header, cont);
2684  }
2685
2686#if INCLUDE_RTM_OPT
2687  if (UseRTMForStackLocks && use_rtm) {
2688    assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2689    Label L_regular_unlock;
2690    ld(current_header, oopDesc::mark_offset_in_bytes(), oop);         // fetch markword
2691    andi(R0, current_header, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2692    cmpwi(flag, R0, markOopDesc::unlocked_value);                     // bits = 001 unlocked
2693    bne(flag, L_regular_unlock);                                      // else RegularLock
2694    tend_();                                                          // otherwise end...
2695    b(cont);                                                          // ... and we're done
2696    bind(L_regular_unlock);
2697  }
2698#endif
2699
2700  // Find the lock address and load the displaced header from the stack.
2701  ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2702
2703  // If the displaced header is 0, we have a recursive unlock.
2704  cmpdi(flag, displaced_header, 0);
2705  beq(flag, cont);
2706
2707  // Handle existing monitor.
2708  if ((EmitSync & 0x02) == 0) {
2709    // The object has an existing monitor iff (mark & monitor_value) != 0.
2710    RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
2711    ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2712    andi_(R0, current_header, markOopDesc::monitor_value);
2713    bne(CCR0, object_has_monitor);
2714  }
2715
2716  // Check if it is still a lightweight lock. This is true if we see
2717  // the stack address of the basicLock in the markOop of the object.
2718  // Cmpxchg sets flag to cmpd(current_header, box).
2719  cmpxchgd(/*flag=*/flag,
2720           /*current_value=*/current_header,
2721           /*compare_value=*/box,
2722           /*exchange_value=*/displaced_header,
2723           /*where=*/oop,
2724           MacroAssembler::MemBarRel,
2725           MacroAssembler::cmpxchgx_hint_release_lock(),
2726           noreg,
2727           &cont);
2728
2729  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2730
2731  // Handle existing monitor.
2732  if ((EmitSync & 0x02) == 0) {
2733    b(cont);
2734
2735    bind(object_has_monitor);
2736    addi(current_header, current_header, -markOopDesc::monitor_value); // monitor
2737    ld(temp,             ObjectMonitor::owner_offset_in_bytes(), current_header);
2738
2739    // It's inflated.
2740#if INCLUDE_RTM_OPT
2741    if (use_rtm) {
2742      Label L_regular_inflated_unlock;
2743      // If the owner field is 0 we acquired the inflated lock via RTM; just end the transaction.
2744      cmpdi(flag, temp, 0);
2745      bne(flag, L_regular_inflated_unlock);
2746      tend_();
2747      b(cont);
2748      bind(L_regular_inflated_unlock);
2749    }
2750#endif
2751
2752    ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
2753    xorr(temp, R16_thread, temp);      // Will be 0 if we are the owner.
2754    orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions.
2755    cmpdi(flag, temp, 0);
2756    bne(flag, cont);
2757
2758    ld(temp,             ObjectMonitor::EntryList_offset_in_bytes(), current_header);
2759    ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);
2760    orr(temp, temp, displaced_header); // Will be 0 if both are 0.
2761    cmpdi(flag, temp, 0);
2762    bne(flag, cont);
2763    release();
2764    std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);
2765  }
2766
2767  bind(cont);
2768  // flag == EQ indicates success
2769  // flag == NE indicates failure
2770}
2771
2772// Write serialization page so VM thread can do a pseudo remote membar.
2773// We use the current thread pointer to calculate a thread specific
2774// offset to write to within the page. This minimizes bus traffic
2775// due to cache line collision.
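// Illustrative C sketch of the address computation below (not emitted code; it only
// restates the srdi/andi/stwx sequence using the os functions already referenced there):
//   offset = (thread >> os::get_serialize_page_shift_count()) & (os::vm_page_size() - sizeof(int));
//   *(volatile int*)(os::get_memory_serialize_page() + offset) = 0;   // store of R0 via stwx
// Each thread thus writes a distinct int slot within the single serialization page.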
2776void MacroAssembler::serialize_memory(Register thread, Register tmp1, Register tmp2) {
2777  srdi(tmp2, thread, os::get_serialize_page_shift_count());
2778
2779  int mask = os::vm_page_size() - sizeof(int);
2780  if (Assembler::is_simm(mask, 16)) {
2781    andi(tmp2, tmp2, mask);
2782  } else {
2783    lis(tmp1, (int)((signed short) (mask >> 16)));
2784    ori(tmp1, tmp1, mask & 0x0000ffff);
2785    andr(tmp2, tmp2, tmp1);
2786  }
2787
2788  load_const(tmp1, (long) os::get_memory_serialize_page());
2789  release();
2790  stwx(R0, tmp1, tmp2);
2791}
2792
2793
2794// GC barrier helper macros
2795
2796// Write the card table byte if needed.
2797void MacroAssembler::card_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp) {
2798  CardTableModRefBS* bs =
2799    barrier_set_cast<CardTableModRefBS>(Universe::heap()->barrier_set());
2800  assert(bs->kind() == BarrierSet::CardTableForRS ||
2801         bs->kind() == BarrierSet::CardTableExtension, "wrong barrier");
2802#ifdef ASSERT
2803  cmpdi(CCR0, Rnew_val, 0);
2804  asm_assert_ne("null oop not allowed", 0x321);
2805#endif
2806  card_table_write(bs->byte_map_base, Rtmp, Rstore_addr);
2807}
2808
2809// Write the card table byte.
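// In C terms this is roughly: byte_map_base[Robj >> card_shift] = 0 /* dirty_card_val */;
// Note that Robj is clobbered (it ends up holding the card index) and Rtmp holds byte_map_base.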
2810void MacroAssembler::card_table_write(jbyte* byte_map_base, Register Rtmp, Register Robj) {
2811  assert_different_registers(Robj, Rtmp, R0);
2812  load_const_optimized(Rtmp, (address)byte_map_base, R0);
2813  srdi(Robj, Robj, CardTableModRefBS::card_shift);
2814  li(R0, 0); // dirty
2815  if (UseConcMarkSweepGC) membar(Assembler::StoreStore);
2816  stbx(R0, Rtmp, Robj);
2817}
2818
2819#if INCLUDE_ALL_GCS
2820// General G1 pre-barrier generator.
2821// Goal: record the previous value if it is not null.
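// Rough sketch of the emitted barrier (illustrative only; queue field names are informal):
//   if (thread->satb_mark_queue() is active) {
//     pre_val = (Robj != noreg) ? *(Robj + offset) : Rpre_val;          // previous value
//     if (pre_val != NULL) {
//       if (queue.index != 0) queue.buf[--queue.index] = pre_val;       // enqueue locally
//       else SharedRuntime::g1_wb_pre(pre_val, thread);                 // runtime slow path
//     }
//   }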
2822void MacroAssembler::g1_write_barrier_pre(Register Robj, RegisterOrConstant offset, Register Rpre_val,
2823                                          Register Rtmp1, Register Rtmp2, bool needs_frame) {
2824  Label runtime, filtered;
2825
2826  // Is marking active?
2827  if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) {
2828    lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread);
2829  } else {
2830    guarantee(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption");
2831    lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread);
2832  }
2833  cmpdi(CCR0, Rtmp1, 0);
2834  beq(CCR0, filtered);
2835
2836  // Do we need to load the previous value?
2837  if (Robj != noreg) {
2838    // Load the previous value...
2839    if (UseCompressedOops) {
2840      lwz(Rpre_val, offset, Robj);
2841    } else {
2842      ld(Rpre_val, offset, Robj);
2843    }
2844    // Previous value has been loaded into Rpre_val.
2845  }
2846  assert(Rpre_val != noreg, "must have a real register");
2847
2848  // Is the previous value null?
2849  cmpdi(CCR0, Rpre_val, 0);
2850  beq(CCR0, filtered);
2851
2852  if (Robj != noreg && UseCompressedOops) {
2853    decode_heap_oop_not_null(Rpre_val);
2854  }
2855
2856  // OK, it's not filtered, so we'll need to call enqueue. In the normal
2857  // case, pre_val will be a scratch G-reg, but there are some cases in
2858  // which it's an O-reg. In the first case, do a normal call. In the
2859  // latter, do a save here and call the frameless version.
2860
2861  // Can we store original value in the thread's buffer?
2862  // Is index == 0?
2863  // (The index field is typed as size_t.)
2864  const Register Rbuffer = Rtmp1, Rindex = Rtmp2;
2865
2866  ld(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread);
2867  cmpdi(CCR0, Rindex, 0);
2868  beq(CCR0, runtime); // If index == 0, goto runtime.
2869  ld(Rbuffer, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_buf()), R16_thread);
2870
2871  addi(Rindex, Rindex, -wordSize); // Decrement index.
2872  std(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread);
2873
2874  // Record the previous value.
2875  stdx(Rpre_val, Rbuffer, Rindex);
2876  b(filtered);
2877
2878  bind(runtime);
2879
2880  // The VM call needs a frame to access (write) O registers.
2881  if (needs_frame) {
2882    save_LR_CR(Rtmp1);
2883    push_frame_reg_args(0, Rtmp2);
2884  }
2885
2886  if (Rpre_val->is_volatile() && Robj == noreg) mr(R31, Rpre_val); // Save pre_val across C call if it was preloaded.
2887  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), Rpre_val, R16_thread);
2888  if (Rpre_val->is_volatile() && Robj == noreg) mr(Rpre_val, R31); // restore
2889
2890  if (needs_frame) {
2891    pop_frame();
2892    restore_LR_CR(Rtmp1);
2893  }
2894
2895  bind(filtered);
2896}
2897
2898// General G1 post-barrier generator
2899// Store cross-region card.
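// Rough sketch of the emitted barrier (illustrative only; region filter applies when
// G1RSBarrierRegionFilter is set, queue field names are informal):
//   if (((Rstore_addr ^ Rnew_val) >> LogOfHRGrainBytes) == 0) return;   // same region
//   card = &byte_map_base[Rstore_addr >> card_shift];
//   if (*card == g1_young_card_val) return;
//   StoreLoad; if (*card == dirty_card_val) return;                     // already dirty
//   *card = dirty_card_val;
//   if (dcq.index != 0) dcq.buf[--dcq.index] = card;                    // enqueue locally
//   else SharedRuntime::g1_wb_post(card, thread);                       // runtime slow path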
2900void MacroAssembler::g1_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp1, Register Rtmp2, Register Rtmp3, Label *filtered_ext) {
2901  Label runtime, filtered_int;
2902  Label& filtered = (filtered_ext != NULL) ? *filtered_ext : filtered_int;
2903  assert_different_registers(Rstore_addr, Rnew_val, Rtmp1, Rtmp2);
2904
2905  G1SATBCardTableLoggingModRefBS* bs =
2906    barrier_set_cast<G1SATBCardTableLoggingModRefBS>(Universe::heap()->barrier_set());
2907
2908  // Does store cross heap regions?
2909  if (G1RSBarrierRegionFilter) {
2910    xorr(Rtmp1, Rstore_addr, Rnew_val);
2911    srdi_(Rtmp1, Rtmp1, HeapRegion::LogOfHRGrainBytes);
2912    beq(CCR0, filtered);
2913  }
2914
2915  // Crosses regions, storing NULL?
2916#ifdef ASSERT
2917  cmpdi(CCR0, Rnew_val, 0);
2918  asm_assert_ne("null oop not allowed (G1)", 0x322); // Checked by caller on PPC64, so following branch is obsolete:
2919  //beq(CCR0, filtered);
2920#endif
2921
2922  // Storing region crossing non-NULL, is card already dirty?
2923  assert(sizeof(*bs->byte_map_base) == sizeof(jbyte), "adjust this code");
2924  const Register Rcard_addr = Rtmp1;
2925  Register Rbase = Rtmp2;
2926  load_const_optimized(Rbase, (address)bs->byte_map_base, /*temp*/ Rtmp3);
2927
2928  srdi(Rcard_addr, Rstore_addr, CardTableModRefBS::card_shift);
2929
2930  // Get the address of the card.
2931  lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);
2932  cmpwi(CCR0, Rtmp3, (int)G1SATBCardTableModRefBS::g1_young_card_val());
2933  beq(CCR0, filtered);
2934
2935  membar(Assembler::StoreLoad);
2936  lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);  // Reload after membar.
2937  cmpwi(CCR0, Rtmp3 /* card value */, CardTableModRefBS::dirty_card_val());
2938  beq(CCR0, filtered);
2939
2940  // Storing a region crossing, non-NULL oop, card is clean.
2941  // Dirty card and log.
2942  li(Rtmp3, CardTableModRefBS::dirty_card_val());
2943  //release(); // G1: oops are allowed to get visible after dirty marking.
2944  stbx(Rtmp3, Rbase, Rcard_addr);
2945
2946  add(Rcard_addr, Rbase, Rcard_addr); // This is the address which needs to get enqueued.
2947  Rbase = noreg; // end of lifetime
2948
2949  const Register Rqueue_index = Rtmp2,
2950                 Rqueue_buf   = Rtmp3;
2951  ld(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread);
2952  cmpdi(CCR0, Rqueue_index, 0);
2953  beq(CCR0, runtime); // index == 0 then jump to runtime
2954  ld(Rqueue_buf, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_buf()), R16_thread);
2955
2956  addi(Rqueue_index, Rqueue_index, -wordSize); // decrement index
2957  std(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread);
2958
2959  stdx(Rcard_addr, Rqueue_buf, Rqueue_index); // store card
2960  b(filtered);
2961
2962  bind(runtime);
2963
2964  // Save the live input values.
2965  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), Rcard_addr, R16_thread);
2966
2967  bind(filtered_int);
2968}
2969#endif // INCLUDE_ALL_GCS
2970
2971// Values for last_Java_pc and last_Java_sp must comply with the rules
2972// in frame_ppc.hpp.
2973void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
2974  // Always set last_Java_pc and flags first because once last_Java_sp
2975  // is visible, has_last_Java_frame is true and users will look at the
2976  // rest of the fields. (Note: flags should always be zero before we
2977  // get here, so they don't need to be set.)
2978
2979  // Verify that last_Java_pc was zeroed on return to Java
2980  asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
2981                          "last_Java_pc not zeroed before leaving Java", 0x200);
2982
2983  // When returning from calling out from Java mode, the frame anchor's
2984  // last_Java_pc will always be set to NULL. It is set here so that,
2985  // if we are doing a call to native (not VM) code, we capture the
2986  // known pc and don't have to rely on the native call having a
2987  // standard frame linkage where we can find the pc.
2988  if (last_Java_pc != noreg)
2989    std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
2990
2991  // Set last_Java_sp last.
2992  std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
2993}
2994
2995void MacroAssembler::reset_last_Java_frame(void) {
2996  asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
2997                             R16_thread, "SP was not set, still zero", 0x202);
2998
2999  BLOCK_COMMENT("reset_last_Java_frame {");
3000  li(R0, 0);
3001
3002  // _last_Java_sp = 0
3003  std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3004
3005  // _last_Java_pc = 0
3006  std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3007  BLOCK_COMMENT("} reset_last_Java_frame");
3008}
3009
3010void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
3011  assert_different_registers(sp, tmp1);
3012
3013  // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
3014  // TOP_IJAVA_FRAME_ABI.
3015  // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
3016  address entry = pc();
3017  load_const_optimized(tmp1, entry);
3018
3019  set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
3020}
3021
3022void MacroAssembler::get_vm_result(Register oop_result) {
3023  // Read:
3024  //   R16_thread
3025  //   R16_thread->in_bytes(JavaThread::vm_result_offset())
3026  //
3027  // Updated:
3028  //   oop_result
3029  //   R16_thread->in_bytes(JavaThread::vm_result_offset())
3030
3031  verify_thread();
3032
3033  ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3034  li(R0, 0);
3035  std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3036
3037  verify_oop(oop_result);
3038}
3039
3040void MacroAssembler::get_vm_result_2(Register metadata_result) {
3041  // Read:
3042  //   R16_thread
3043  //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3044  //
3045  // Updated:
3046  //   metadata_result
3047  //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3048
3049  ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3050  li(R0, 0);
3051  std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3052}
3053
3054Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3055  Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
3056  if (Universe::narrow_klass_base() != 0) {
3057    // Use dst as temp if it is free.
3058    sub_const_optimized(dst, current, Universe::narrow_klass_base(), R0);
3059    current = dst;
3060  }
3061  if (Universe::narrow_klass_shift() != 0) {
3062    srdi(dst, current, Universe::narrow_klass_shift());
3063    current = dst;
3064  }
3065  return current;
3066}
3067
3068void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
3069  if (UseCompressedClassPointers) {
3070    Register compressedKlass = encode_klass_not_null(ck, klass);
3071    stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
3072  } else {
3073    std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
3074  }
3075}
3076
3077void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
3078  if (UseCompressedClassPointers) {
3079    if (val == noreg) {
3080      val = R0;
3081      li(val, 0);
3082    }
3083    stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
3084  }
3085}
3086
3087int MacroAssembler::instr_size_for_decode_klass_not_null() {
3088  if (!UseCompressedClassPointers) return 0;
3089  int num_instrs = 1;  // shift or move
3090  if (Universe::narrow_klass_base() != 0) num_instrs = 7;  // shift + load const + add
3091  return num_instrs * BytesPerInstWord;
3092}
3093
3094void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3095  assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
3096  if (src == noreg) src = dst;
3097  Register shifted_src = src;
3098  if (Universe::narrow_klass_shift() != 0 ||
3099      Universe::narrow_klass_base() == 0 && src != dst) {  // Move required.
3100    shifted_src = dst;
3101    sldi(shifted_src, src, Universe::narrow_klass_shift());
3102  }
3103  if (Universe::narrow_klass_base() != 0) {
3104    add_const_optimized(dst, shifted_src, Universe::narrow_klass_base(), R0);
3105  }
3106}
3107
3108void MacroAssembler::load_klass(Register dst, Register src) {
3109  if (UseCompressedClassPointers) {
3110    lwz(dst, oopDesc::klass_offset_in_bytes(), src);
3111    // Attention: no null check here!
3112    decode_klass_not_null(dst, dst);
3113  } else {
3114    ld(dst, oopDesc::klass_offset_in_bytes(), src);
3115  }
3116}
3117
3118// Clear Array
3119// Kills both input registers. tmp == R0 is allowed.
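// Strategy sketch: clear single doublewords until base_ptr is cache-line aligned,
// clear whole cache lines with dcbz, then clear the remaining (< cl_dwords)
// doublewords with a plain std loop. Counts below the dcbz threshold skip
// straight to that final std loop.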
3120void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp) {
3121  // Procedure for large arrays (uses data cache block zero instruction).
3122    Label startloop, fast, fastloop, small_rest, restloop, done;
3123    const int cl_size         = VM_Version::L1_data_cache_line_size(),
3124              cl_dwords       = cl_size>>3,
3125              cl_dw_addr_bits = exact_log2(cl_dwords),
3126              dcbz_min        = 1;                     // Min count of dcbz executions, needs to be >0.
3127
3128//2:
3129    cmpdi(CCR1, cnt_dwords, ((dcbz_min+1)<<cl_dw_addr_bits)-1); // Big enough? (ensure >=dcbz_min lines included).
3130    blt(CCR1, small_rest);                                      // Too small.
3131    rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits);           // Extract dword offset within first cache line.
3132    beq(CCR0, fast);                                            // Already 128byte aligned.
3133
3134    subfic(tmp, tmp, cl_dwords);
3135    mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
3136    subf(cnt_dwords, tmp, cnt_dwords); // rest.
3137    li(tmp, 0);
3138//10:
3139  bind(startloop);                     // Clear at the beginning to reach 128byte boundary.
3140    std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3141    addi(base_ptr, base_ptr, 8);
3142    bdnz(startloop);
3143//13:
3144  bind(fast);                                  // Clear 128byte blocks.
3145    srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128byte loop (>0).
3146    andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
3147    mtctr(tmp);                                // Load counter.
3148//16:
3149  bind(fastloop);
3150    dcbz(base_ptr);                    // Clear 128byte aligned block.
3151    addi(base_ptr, base_ptr, cl_size);
3152    bdnz(fastloop);
3153    if (InsertEndGroupPPC64) { endgroup(); } else { nop(); }
3154//20:
3155  bind(small_rest);
3156    cmpdi(CCR0, cnt_dwords, 0);        // size 0?
3157    beq(CCR0, done);                   // rest == 0
3158    li(tmp, 0);
3159    mtctr(cnt_dwords);                 // Load counter.
3160//24:
3161  bind(restloop);                      // Clear rest.
3162    std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3163    addi(base_ptr, base_ptr, 8);
3164    bdnz(restloop);
3165//27:
3166  bind(done);
3167}
3168
3169/////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3170
3171// Search for a single jchar in a jchar[].
3172//
3173// Assumes that result differs from all other registers.
3174//
3175// Haystack, needle are the addresses of jchar-arrays.
3176// NeedleChar is needle[0] if it is known at compile time.
3177// Haycnt is the length of the haystack. We assume haycnt >=1.
3178//
3179// Preserves haystack, haycnt, kills all other registers.
3180//
3181// If needle == R0, we search for the constant needleChar.
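// Functional sketch (illustrative): the code below is a 2x unrolled version of
//   jchar ch = (needle != R0) ? needle[0] : needleChar;
//   for (int i = 0; i < haycnt; i++) { if (haystack[i] == ch) return i; }
//   return -1;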
3182void MacroAssembler::string_indexof_1(Register result, Register haystack, Register haycnt,
3183                                      Register needle, jchar needleChar,
3184                                      Register tmp1, Register tmp2) {
3185
3186  assert_different_registers(result, haystack, haycnt, needle, tmp1, tmp2);
3187
3188  Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_Found3, L_NotFound, L_End;
3189  Register needle0 = needle, // Contains needle[0].
3190           addr = tmp1,
3191           ch1 = tmp2,
3192           ch2 = R0;
3193
3194//2 (variable) or 3 (const):
3195   if (needle != R0) lhz(needle0, 0, needle); // Preload needle character, needle has len==1.
3196   dcbtct(haystack, 0x00);                        // Indicate R/O access to haystack.
3197
3198   srwi_(tmp2, haycnt, 1);   // Shift right by exact_log2(UNROLL_FACTOR).
3199   mr(addr, haystack);
3200   beq(CCR0, L_FinalCheck);
3201   mtctr(tmp2);              // Move to count register.
3202//8:
3203  bind(L_InnerLoop);             // Main work horse (2x unrolled search loop).
3204   lhz(ch1, 0, addr);        // Load characters from haystack.
3205   lhz(ch2, 2, addr);
3206   (needle != R0) ? cmpw(CCR0, ch1, needle0) : cmplwi(CCR0, ch1, needleChar);
3207   (needle != R0) ? cmpw(CCR1, ch2, needle0) : cmplwi(CCR1, ch2, needleChar);
3208   beq(CCR0, L_Found1);   // Did we find the needle?
3209   beq(CCR1, L_Found2);
3210   addi(addr, addr, 4);
3211   bdnz(L_InnerLoop);
3212//16:
3213  bind(L_FinalCheck);
3214   andi_(R0, haycnt, 1);
3215   beq(CCR0, L_NotFound);
3216   lhz(ch1, 0, addr);        // One position left at which we have to compare.
3217   (needle != R0) ? cmpw(CCR1, ch1, needle0) : cmplwi(CCR1, ch1, needleChar);
3218   beq(CCR1, L_Found3);
3219//21:
3220  bind(L_NotFound);
3221   li(result, -1);           // Not found.
3222   b(L_End);
3223
3224  bind(L_Found2);
3225   addi(addr, addr, 2);
3226//24:
3227  bind(L_Found1);
3228  bind(L_Found3);                  // Return index ...
3229   subf(addr, haystack, addr); // relative to haystack,
3230   srdi(result, addr, 1);      // in characters.
3231  bind(L_End);
3232}
3233
3234
3235// Implementation of IndexOf for jchar arrays.
3236//
3237// The lengths of haystack and needle are not constant, i.e. they are passed in registers.
3238//
3239// Preserves registers haystack, needle.
3240// Kills registers haycnt, needlecnt.
3241// Assumes that result differs from all other registers.
3242// Haystack, needle are the addresses of jchar-arrays.
3243// Haycnt and needlecnt are their respective lengths.
3244//
3245// Needlecntval must be zero or 15-bit unsigned immediate and > 1.
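// Functional sketch of the regular case (needlecnt >= 2, illustrative only):
//   for (int i = 0; i + needlecnt <= haycnt; i++) {
//     if (haystack[i] == needle[0] && haystack[i+1] == needle[1]) {
//       int j = 2;
//       while (j < needlecnt && haystack[i+j] == needle[j]) j++;
//       if (j == needlecnt) return i;
//     }
//   }
//   return -1;
// The emitted code checks the first two needle characters in a 2x unrolled outer
// loop and compares the remaining characters in an inner loop.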
3246void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt,
3247                                    Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval,
3248                                    Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
3249
3250  // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite!
3251  Label L_TooShort, L_Found, L_NotFound, L_End;
3252  Register last_addr = haycnt, // Kill haycnt at the beginning.
3253           addr      = tmp1,
3254           n_start   = tmp2,
3255           ch1       = tmp3,
3256           ch2       = R0;
3257
3258  // **************************************************************************************************
3259  // Prepare for main loop: optimized for needle count >=2, bail out otherwise.
3260  // **************************************************************************************************
3261
3262//1 (variable) or 3 (const):
3263   dcbtct(needle, 0x00);    // Indicate R/O access to str1.
3264   dcbtct(haystack, 0x00);  // Indicate R/O access to str2.
3265
3266  // Compute last haystack addr to use if no match gets found.
3267  if (needlecntval == 0) { // variable needlecnt
3268//3:
3269   subf(ch1, needlecnt, haycnt);      // Last character index to compare is haycnt-needlecnt.
3270   addi(addr, haystack, -2);          // Accesses use pre-increment.
3271   cmpwi(CCR6, needlecnt, 2);
3272   blt(CCR6, L_TooShort);          // Variable needlecnt: handle short needle separately.
3273   slwi(ch1, ch1, 1);                 // Scale to number of bytes.
3274   lwz(n_start, 0, needle);           // Load first 2 characters of needle.
3275   add(last_addr, haystack, ch1);     // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
3276   addi(needlecnt, needlecnt, -2);    // Rest of needle.
3277  } else { // constant needlecnt
3278  guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately");
3279  assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate");
3280//5:
3281   addi(ch1, haycnt, -needlecntval);  // Last character index to compare is haycnt-needlecnt.
3282   lwz(n_start, 0, needle);           // Load first 2 characters of needle.
3283   addi(addr, haystack, -2);          // Accesses use pre-increment.
3284   slwi(ch1, ch1, 1);                 // Scale to number of bytes.
3285   add(last_addr, haystack, ch1);     // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
3286   li(needlecnt, needlecntval-2);     // Rest of needle.
3287  }
3288
3289  // Main Loop (now we have at least 3 characters).
3290//11:
3291  Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2, L_Comp3;
3292  bind(L_OuterLoop); // Search for 1st 2 characters.
3293  Register addr_diff = tmp4;
3294   subf(addr_diff, addr, last_addr); // Difference between already checked address and last address to check.
3295   addi(addr, addr, 2);              // This is the new address we want to use for comparing.
3296   srdi_(ch2, addr_diff, 2);
3297   beq(CCR0, L_FinalCheck);       // 2 characters left?
3298   mtctr(ch2);                       // addr_diff/4
3299//16:
3300  bind(L_InnerLoop);                // Main work horse (2x unrolled search loop)
3301   lwz(ch1, 0, addr);           // Load 2 characters of haystack (ignore alignment).
3302   lwz(ch2, 2, addr);
3303   cmpw(CCR0, ch1, n_start); // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop).
3304   cmpw(CCR1, ch2, n_start);
3305   beq(CCR0, L_Comp1);       // Did we find the needle start?
3306   beq(CCR1, L_Comp2);
3307   addi(addr, addr, 4);
3308   bdnz(L_InnerLoop);
3309//24:
3310  bind(L_FinalCheck);
3311   rldicl_(addr_diff, addr_diff, 64-1, 63); // Remaining characters not covered by InnerLoop: (addr_diff>>1)&1.
3312   beq(CCR0, L_NotFound);
3313   lwz(ch1, 0, addr);                       // One position left at which we have to compare.
3314   cmpw(CCR1, ch1, n_start);
3315   beq(CCR1, L_Comp3);
3316//29:
3317  bind(L_NotFound);
3318   li(result, -1); // not found
3319   b(L_End);
3320
3321
3322   // **************************************************************************************************
3323   // Special Case: unfortunately, the variable needle case can be called with needlecnt<2
3324   // **************************************************************************************************
3325//31:
3326 if ((needlecntval>>1) !=1 ) { // Skip this block if const needlecnt is 2 or 3: reduces code size.
3327  int nopcnt = 5;
3328  if (needlecntval !=0 ) ++nopcnt; // Balance alignment (other case: see below).
3329  if (needlecntval == 0) {         // We have to handle these cases separately.
3330  Label L_OneCharLoop;
3331  bind(L_TooShort);
3332   mtctr(haycnt);
3333   lhz(n_start, 0, needle);    // First character of needle
3334  bind(L_OneCharLoop);
3335   lhzu(ch1, 2, addr);
3336   cmpw(CCR1, ch1, n_start);
3337   beq(CCR1, L_Found);      // Did we find the one character needle?
3338   bdnz(L_OneCharLoop);
3339   li(result, -1);             // Not found.
3340   b(L_End);
3341  } // 8 instructions, so no impact on alignment.
3342  for (int x = 0; x < nopcnt; ++x) nop();
3343 }
3344
3345  // **************************************************************************************************
3346  // Regular Case Part II: compare rest of needle (first 2 characters have been compared already)
3347  // **************************************************************************************************
3348
3349  // Compare the rest
3350//36 if needlecntval==0, else 37:
3351  bind(L_Comp2);
3352   addi(addr, addr, 2); // First comparison has failed, 2nd one hit.
3353  bind(L_Comp1);            // Addr points to possible needle start.
3354  bind(L_Comp3);            // Could have created a copy and used a different return address, but we save code size here.
3355  if (needlecntval != 2) {  // Const needlecnt==2?
3356   if (needlecntval != 3) {
3357    if (needlecntval == 0) beq(CCR6, L_Found); // Variable needlecnt==2?
3358    Register ind_reg = tmp4;
3359    li(ind_reg, 2*2);   // First 2 characters are already compared, use index 2.
3360    mtctr(needlecnt);   // Decremented by 2, still > 0.
3361//40:
3362   Label L_CompLoop;
3363   bind(L_CompLoop);
3364    lhzx(ch2, needle, ind_reg);
3365    lhzx(ch1, addr, ind_reg);
3366    cmpw(CCR1, ch1, ch2);
3367    bne(CCR1, L_OuterLoop);
3368    addi(ind_reg, ind_reg, 2);
3369    bdnz(L_CompLoop);
3370   } else { // No loop required if there's only one needle character left.
3371    lhz(ch2, 2*2, needle);
3372    lhz(ch1, 2*2, addr);
3373    cmpw(CCR1, ch1, ch2);
3374    bne(CCR1, L_OuterLoop);
3375   }
3376  }
3377  // Return index ...
3378//46:
3379  bind(L_Found);
3380   subf(addr, haystack, addr); // relative to haystack, ...
3381   srdi(result, addr, 1);      // in characters.
3382//48:
3383  bind(L_End);
3384}
3385
3386// Implementation of Compare for jchar arrays.
3387//
3388// Kills the registers str1, str2, cnt1, cnt2.
3389// Kills cr0, ctr.
3390// Assumes that result differs from the input registers.
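// Functional sketch (illustrative): returns the difference of the first mismatching
// characters, or cnt1 - cnt2 if one string is a prefix of the other:
//   int min = (cnt1 < cnt2) ? cnt1 : cnt2;
//   for (int i = 0; i < min; i++) {
//     if (str1[i] != str2[i]) return str1[i] - str2[i];
//   }
//   return cnt1 - cnt2;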
3391void MacroAssembler::string_compare(Register str1_reg, Register str2_reg, Register cnt1_reg, Register cnt2_reg,
3392                                    Register result_reg, Register tmp_reg) {
3393   assert_different_registers(result_reg, str1_reg, str2_reg, cnt1_reg, cnt2_reg, tmp_reg);
3394
3395   Label Ldone, Lslow_case, Lslow_loop, Lfast_loop;
3396   Register cnt_diff = R0,
3397            limit_reg = cnt1_reg,
3398            chr1_reg = result_reg,
3399            chr2_reg = cnt2_reg,
3400            addr_diff = str2_reg;
3401
3402   // Offset 0 should be 32 byte aligned.
3403//-4:
3404    dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
3405    dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
3406//-2:
3407   // Compute min(cnt1, cnt2) and check if 0 (bail out if we don't need to compare characters).
3408    subf(result_reg, cnt2_reg, cnt1_reg);  // difference between cnt1/2
3409    subf_(addr_diff, str1_reg, str2_reg);  // alias?
3410    beq(CCR0, Ldone);                   // return cnt difference if both point to the same string
3411    srawi(limit_reg, result_reg, 31);      // generate signmask (cnt1/2 must be non-negative so cnt_diff can't overflow)
3412    mr(cnt_diff, result_reg);
3413    andr(limit_reg, result_reg, limit_reg); // difference or zero (negative): cnt1<cnt2 ? cnt1-cnt2 : 0
3414    add_(limit_reg, cnt2_reg, limit_reg);  // min(cnt1, cnt2)==0?
3415    beq(CCR0, Ldone);                   // return cnt difference if one has 0 length
3416
3417    lhz(chr1_reg, 0, str1_reg);            // optional: early out if first characters mismatch
3418    lhzx(chr2_reg, str1_reg, addr_diff);   // optional: early out if first characters mismatch
3419    addi(tmp_reg, limit_reg, -1);          // min(cnt1, cnt2)-1
3420    subf_(result_reg, chr2_reg, chr1_reg); // optional: early out if first characters mismatch
3421    bne(CCR0, Ldone);                   // optional: early out if first characters mismatch
3422
3423   // Set loop counter by scaling down tmp_reg
3424    srawi_(chr2_reg, tmp_reg, exact_log2(4)); // (min(cnt1, cnt2)-1)/4
3425    ble(CCR0, Lslow_case);                 // need >4 characters for fast loop
3426    andi(limit_reg, tmp_reg, 4-1);            // remaining characters
3427
3428   // Adapt str1_reg str2_reg for the first loop iteration
3429    mtctr(chr2_reg);                 // (min(cnt1, cnt2)-1)/4
3430    addi(limit_reg, limit_reg, 4+1); // compare last 5-8 characters in slow_case if mismatch found in fast_loop
3431//16:
3432   // Compare the rest of the characters
3433   bind(Lfast_loop);
3434    ld(chr1_reg, 0, str1_reg);
3435    ldx(chr2_reg, str1_reg, addr_diff);
3436    cmpd(CCR0, chr2_reg, chr1_reg);
3437    bne(CCR0, Lslow_case); // return chr1_reg
3438    addi(str1_reg, str1_reg, 4*2);
3439    bdnz(Lfast_loop);
3440    addi(limit_reg, limit_reg, -4); // no mismatch found in fast_loop, only 1-4 characters missing
3441//23:
3442   bind(Lslow_case);
3443    mtctr(limit_reg);
3444//24:
3445   bind(Lslow_loop);
3446    lhz(chr1_reg, 0, str1_reg);
3447    lhzx(chr2_reg, str1_reg, addr_diff);
3448    subf_(result_reg, chr2_reg, chr1_reg);
3449    bne(CCR0, Ldone); // return chr1_reg
3450    addi(str1_reg, str1_reg, 1*2);
3451    bdnz(Lslow_loop);
3452//30:
3453   // If strings are equal up to min length, return the length difference.
3454    mr(result_reg, cnt_diff);
3455    nop(); // alignment
3456//32:
3457   // Otherwise, return the difference between the first mismatched chars.
3458   bind(Ldone);
3459}
3460
3461
3462// Compare char[] arrays.
3463//
3464// str1_reg   USE only
3465// str2_reg   USE only
3466// cnt_reg    USE_DEF, due to tmp reg shortage
3467// result_reg DEF only, might compromise USE only registers
3468void MacroAssembler::char_arrays_equals(Register str1_reg, Register str2_reg, Register cnt_reg, Register result_reg,
3469                                        Register tmp1_reg, Register tmp2_reg, Register tmp3_reg, Register tmp4_reg,
3470                                        Register tmp5_reg) {
3471
3472  // Str1 may be the same register as str2 which can occur e.g. after scalar replacement.
3473  assert_different_registers(result_reg, str1_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg);
3474  assert_different_registers(result_reg, str2_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg);
3475
3476  // Offset 0 should be 32 byte aligned.
3477  Label Linit_cbc, Lcbc, Lloop, Ldone_true, Ldone_false;
3478  Register index_reg = tmp5_reg;
3479  Register cbc_iter  = tmp4_reg;
3480
3481//-1:
3482  dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
3483  dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
3484//1:
3485  andi(cbc_iter, cnt_reg, 4-1);            // Remaining iterations after 4 java characters per iteration loop.
3486  li(index_reg, 0); // init
3487  li(result_reg, 0); // assume false
3488  srwi_(tmp2_reg, cnt_reg, exact_log2(4)); // Div: 4 java characters per iteration (main loop).
3489
3490  cmpwi(CCR1, cbc_iter, 0);             // CCR1 = (cbc_iter==0)
3491  beq(CCR0, Linit_cbc);                 // too short
3492    mtctr(tmp2_reg);
3493//8:
3494    bind(Lloop);
3495      ldx(tmp1_reg, str1_reg, index_reg);
3496      ldx(tmp2_reg, str2_reg, index_reg);
3497      cmpd(CCR0, tmp1_reg, tmp2_reg);
3498      bne(CCR0, Ldone_false);  // Unequal char pair found -> done.
3499      addi(index_reg, index_reg, 4*sizeof(jchar));
3500      bdnz(Lloop);
3501//14:
3502  bind(Linit_cbc);
3503  beq(CCR1, Ldone_true);
3504    mtctr(cbc_iter);
3505//16:
3506    bind(Lcbc);
3507      lhzx(tmp1_reg, str1_reg, index_reg);
3508      lhzx(tmp2_reg, str2_reg, index_reg);
3509      cmpw(CCR0, tmp1_reg, tmp2_reg);
3510      bne(CCR0, Ldone_false);  // Unequal char pair found -> done.
3511      addi(index_reg, index_reg, 1*sizeof(jchar));
3512      bdnz(Lcbc);
3513    nop();
3514  bind(Ldone_true);
3515  li(result_reg, 1);
3516//24:
3517  bind(Ldone_false);
3518}
3519
3520
3521void MacroAssembler::char_arrays_equalsImm(Register str1_reg, Register str2_reg, int cntval, Register result_reg,
3522                                           Register tmp1_reg, Register tmp2_reg) {
3523  // Str1 may be the same register as str2 which can occur e.g. after scalar replacement.
3524  assert_different_registers(result_reg, str1_reg, tmp1_reg, tmp2_reg);
3525  assert_different_registers(result_reg, str2_reg, tmp1_reg, tmp2_reg);
3526  assert(sizeof(jchar) == 2, "must be");
3527  assert(cntval >= 0 && ((cntval & 0x7fff) == cntval), "wrong immediate");
3528
3529  Label Ldone_false;
3530
3531  if (cntval < 16) { // short case
3532    if (cntval != 0) li(result_reg, 0); // assume false
3533
3534    const int num_bytes = cntval*sizeof(jchar);
3535    int index = 0;
3536    for (int next_index; (next_index = index + 8) <= num_bytes; index = next_index) {
3537      ld(tmp1_reg, index, str1_reg);
3538      ld(tmp2_reg, index, str2_reg);
3539      cmpd(CCR0, tmp1_reg, tmp2_reg);
3540      bne(CCR0, Ldone_false);
3541    }
3542    if (cntval & 2) {
3543      lwz(tmp1_reg, index, str1_reg);
3544      lwz(tmp2_reg, index, str2_reg);
3545      cmpw(CCR0, tmp1_reg, tmp2_reg);
3546      bne(CCR0, Ldone_false);
3547      index += 4;
3548    }
3549    if (cntval & 1) {
3550      lhz(tmp1_reg, index, str1_reg);
3551      lhz(tmp2_reg, index, str2_reg);
3552      cmpw(CCR0, tmp1_reg, tmp2_reg);
3553      bne(CCR0, Ldone_false);
3554    }
3555    // fallthrough: true
3556  } else {
3557    Label Lloop;
3558    Register index_reg = tmp1_reg;
3559    const int loopcnt = cntval/4;
3560    assert(loopcnt > 0, "must be");
3561    // Offset 0 should be 32 byte aligned.
3562    //2:
3563    dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
3564    dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
3565    li(tmp2_reg, loopcnt);
3566    li(index_reg, 0); // init
3567    li(result_reg, 0); // assume false
3568    mtctr(tmp2_reg);
3569    //8:
3570    bind(Lloop);
3571    ldx(R0, str1_reg, index_reg);
3572    ldx(tmp2_reg, str2_reg, index_reg);
3573    cmpd(CCR0, R0, tmp2_reg);
3574    bne(CCR0, Ldone_false);  // Unequal char pair found -> done.
3575    addi(index_reg, index_reg, 4*sizeof(jchar));
3576    bdnz(Lloop);
3577    //14:
3578    if (cntval & 2) {
3579      lwzx(R0, str1_reg, index_reg);
3580      lwzx(tmp2_reg, str2_reg, index_reg);
3581      cmpw(CCR0, R0, tmp2_reg);
3582      bne(CCR0, Ldone_false);
3583      if (cntval & 1) addi(index_reg, index_reg, 2*sizeof(jchar));
3584    }
3585    if (cntval & 1) {
3586      lhzx(R0, str1_reg, index_reg);
3587      lhzx(tmp2_reg, str2_reg, index_reg);
3588      cmpw(CCR0, R0, tmp2_reg);
3589      bne(CCR0, Ldone_false);
3590    }
3591    // fallthru: true
3592  }
3593  li(result_reg, 1);
3594  bind(Ldone_false);
3595}
3596
3597// Helpers for Intrinsic Emitters
3598//
3599// Revert the byte order of a 32bit value in a register
3600//   src: 0x44556677
3601//   dst: 0x77665544
3602// Three steps to obtain the result:
3603//  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
3604//     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
3605//     This value initializes dst.
3606//  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
3607//     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
3608//     This value is mask inserted into dst with a [0..23] mask of 1s.
3609//  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
3610//     This value is mask inserted into dst with a [8..15] mask of 1s.
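// Worked example (illustrative), src = 0x44556677:
//   after step 1 (rldicl): dst = 0x00000044
//   after step 2 (rlwimi): dst = 0x77445544
//   after step 3 (rlwimi): dst = 0x77665544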
3611void MacroAssembler::load_reverse_32(Register dst, Register src) {
3612  assert_different_registers(dst, src);
3613
3614  rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
3615  rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
3616  rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
3617}
3618
3619// Calculate the column addresses of the crc32 lookup table into distinct registers.
3620// This loop-invariant calculation is moved out of the loop body, reducing the loop
3621// body size from 20 to 16 instructions.
3622// Returns the offset that was used to calculate the address of column tc3.
3623// Due to register shortage, setting tc3 may overwrite table. With the return offset
3624// at hand, the original table address can be easily reconstructed.
3625int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
3626
3627#ifdef VM_LITTLE_ENDIAN
3628  // This is what we implement (the DOLIT4 part):
3629  // ========================================================================= */
3630  // #define DOLIT4 c ^= *buf4++; \
3631  //         c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
3632  //             crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
3633  // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
3634  // ========================================================================= */
3635  const int ix0 = 3*(4*CRC32_COLUMN_SIZE);
3636  const int ix1 = 2*(4*CRC32_COLUMN_SIZE);
3637  const int ix2 = 1*(4*CRC32_COLUMN_SIZE);
3638  const int ix3 = 0*(4*CRC32_COLUMN_SIZE);
3639#else
3640  // This is what we implement (the DOBIG4 part):
3641  // =========================================================================
3642  // #define DOBIG4 c ^= *++buf4; \
3643  //         c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
3644  //             crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
3645  // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
3646  // =========================================================================
3647  const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
3648  const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
3649  const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
3650  const int ix3 = 7*(4*CRC32_COLUMN_SIZE);
3651#endif
3652  assert_different_registers(table, tc0, tc1, tc2);
3653  assert(table == tc3, "must be!");
3654
3655  if (ix0 != 0) addi(tc0, table, ix0);
3656  if (ix1 != 0) addi(tc1, table, ix1);
3657  if (ix2 != 0) addi(tc2, table, ix2);
3658  if (ix3 != 0) addi(tc3, table, ix3);
3659
3660  return ix3;
3661}
3662
3663/**
3664 * uint32_t crc;
3665 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
3666 */
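// In C terms (sketch): crc = table[val & 0xff] ^ (crc >> 8).
// The rlwinm below forms ((val & 0xff) << 2), i.e. the byte offset of the 32-bit
// table entry, so lwzx can index the jint table directly.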
3667void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
3668  assert_different_registers(crc, table, tmp);
3669  assert_different_registers(val, table);
3670
3671  if (crc == val) {                   // Must rotate first to use the unmodified value.
3672    rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3673                                      // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
3674    srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3675  } else {
3676    srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3677    rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3678  }
3679  lwzx(tmp, table, tmp);
3680  xorr(crc, crc, tmp);
3681}
3682
3683/**
3684 * uint32_t crc;
3685 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
3686 */
3687void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
3688  fold_byte_crc32(crc, crc, table, tmp);
3689}
3690
3691/**
3692 * Emits code to update CRC-32 with a byte value according to constants in table.
3693 *
3694 * @param [in,out]crc   Register containing the crc.
3695 * @param [in]val       Register containing the byte to fold into the CRC.
3696 * @param [in]table     Register containing the table of crc constants.
3697 *
3698 * uint32_t crc;
3699 * val = crc_table[(val ^ crc) & 0xFF];
3700 * crc = val ^ (crc >> 8);
3701 */
3702void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3703  BLOCK_COMMENT("update_byte_crc32:");
3704  xorr(val, val, crc);
3705  fold_byte_crc32(crc, val, table, val);
3706}
3707
3708/**
3709 * @param crc   register containing existing CRC (32-bit)
3710 * @param buf   register pointing to input byte buffer (byte*)
3711 * @param len   register containing number of bytes
3712 * @param table register pointing to CRC table
3713 */
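// Sketch of the generated loop (illustrative):
//   if (invertCRC) crc = ~crc;
//   while (len-- > 0) { crc = update_byte_crc32(crc, *buf++, table); }
//   if (invertCRC) crc = ~crc;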
3714void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
3715                                           Register data, bool loopAlignment, bool invertCRC) {
3716  assert_different_registers(crc, buf, len, table, data);
3717
3718  Label L_mainLoop, L_done;
3719  const int mainLoop_stepping  = 1;
3720  const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
3721
3722  // Process all bytes in a single-byte loop.
3723  cmpdi(CCR0, len, 0);                           // Anything to do?
3724  mtctr(len);
3725  beq(CCR0, L_done);
3726
3727  if (invertCRC) {
3728    nand(crc, crc, crc);                         // ~c
3729  }
3730
3731  align(mainLoop_alignment);
3732  BIND(L_mainLoop);
3733    lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
3734    addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
3735    update_byte_crc32(crc, data, table);
3736    bdnz(L_mainLoop);                            // Iterate.
3737
3738  if (invertCRC) {
3739    nand(crc, crc, crc);                         // ~c
3740  }
3741
3742  bind(L_done);
3743}
3744
3745/**
3746 * Emits code to update CRC-32 with a 4-byte value according to constants in table
3747 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
3748 */
3749// A note on the lookup table address(es):
3750// The lookup table consists of two sets of four columns each.
3751// The columns {0..3} are used for little-endian machines.
3752// The columns {4..7} are used for big-endian machines.
3753// To save the effort of adding the column offset to the table address each time
3754// a table element is looked up, it is possible to pass the pre-calculated
3755// column addresses.
3756// Uses R9..R12 as work registers. They must be saved/restored by the caller, if necessary.
3757void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
3758                                        Register t0,  Register t1,  Register t2,  Register t3,
3759                                        Register tc0, Register tc1, Register tc2, Register tc3) {
3760  assert_different_registers(crc, t3);
3761
3762  // XOR crc with next four bytes of buffer.
3763  lwz(t3, bufDisp, buf);
3764  if (bufInc != 0) {
3765    addi(buf, buf, bufInc);
3766  }
3767  xorr(t3, t3, crc);
3768
3769  // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
3770  rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t1 >>  0) & 0xff) << 2
3771  rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t1 >>  8) & 0xff) << 2
3772  rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t1 >> 16) & 0xff) << 2
3773  rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t1 >> 24) & 0xff) << 2
3774
3775  // Use the pre-calculated column addresses.
3776  // Load pre-calculated table values.
3777  lwzx(t0, tc0, t0);
3778  lwzx(t1, tc1, t1);
3779  lwzx(t2, tc2, t2);
3780  lwzx(t3, tc3, t3);
3781
3782  // Calculate new crc from table values.
3783  xorr(t0,  t0, t1);
3784  xorr(t2,  t2, t3);
3785  xorr(crc, t0, t2);  // Now crc contains the final checksum value.
3786}
3787
3788/**
3789 * @param crc   register containing existing CRC (32-bit)
3790 * @param buf   register pointing to input byte buffer (byte*)
3791 * @param len   register containing number of bytes
3792 * @param table register pointing to CRC table
3793 *
3794 * Uses R9..R12 as work registers. They must be saved/restored by the caller!
3795 */
3796void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
3797                                        Register t0,  Register t1,  Register t2,  Register t3,
3798                                        Register tc0, Register tc1, Register tc2, Register tc3) {
3799  assert_different_registers(crc, buf, len, table);
3800
3801  Label L_mainLoop, L_tail;
3802  Register  tmp  = t0;
3803  Register  data = t0;
3804  Register  tmp2 = t1;
3805  const int mainLoop_stepping  = 8;
3806  const int tailLoop_stepping  = 1;
3807  const int log_stepping       = exact_log2(mainLoop_stepping);
3808  const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
3809  const int complexThreshold   = 2*mainLoop_stepping;
3810
3811  // Don't test for len <= 0 here. This pathological case should not occur anyway.
3812  // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
3813  // The situation itself is detected and handled correctly by the conditional branches
3814  // following  aghi(len, -stepping) and aghi(len, +stepping).
3815  assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
3816
3817  BLOCK_COMMENT("kernel_crc32_2word {");
3818
3819  nand(crc, crc, crc);                           // ~c
3820
3821  // Check for short (<mainLoop_stepping) buffer.
3822  cmpdi(CCR0, len, complexThreshold);
3823  blt(CCR0, L_tail);
3824
3825  // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
3826  // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
3827  {
3828    // Align buf addr to mainLoop_stepping boundary.
3829    neg(tmp2, buf);                           // Calculate # preLoop iterations for alignment.
3830    rldicl(tmp2, tmp2, 0, 64-log_stepping);   // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63.
3831
3832    if (complexThreshold > mainLoop_stepping) {
3833      sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3834    } else {
3835      sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
3836      cmpdi(CCR0, tmp, mainLoop_stepping);
3837      blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
3838      mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3839    }
3840    update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
3841  }
3842
3843  srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
3844  andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
3845  mtctr(tmp2);
3846
3847#ifdef VM_LITTLE_ENDIAN
3848  Register crc_rv = crc;
3849#else
3850  Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
3851                                                 // Occupies tmp, but frees up crc.
3852  load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
3853  tmp = crc;
3854#endif
3855
3856  int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3857
3858  align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
3859  BIND(L_mainLoop);
3860    update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3861    update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3862    bdnz(L_mainLoop);
3863
3864#ifndef VM_LITTLE_ENDIAN
3865  load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
3866  tmp = crc_rv;                                  // Tmp uses its original register again.
3867#endif
3868
3869  // Restore original table address for tailLoop.
3870  if (reconstructTableOffset != 0) {
3871    addi(table, table, -reconstructTableOffset);
3872  }
3873
3874  // Process last few (<complexThreshold) bytes of buffer.
3875  BIND(L_tail);
3876  update_byteLoop_crc32(crc, buf, len, table, data, false, false);
3877
3878  nand(crc, crc, crc);                           // ~c
3879  BLOCK_COMMENT("} kernel_crc32_2word");
3880}
3881
3882/**
3883 * @param crc   register containing existing CRC (32-bit)
3884 * @param buf   register pointing to input byte buffer (byte*)
3885 * @param len   register containing number of bytes
3886 * @param table register pointing to CRC table
3887 *
3888 * Uses R9..R12 as work registers. They must be saved/restored by the caller!
3889 */
3890void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
3891                                        Register t0,  Register t1,  Register t2,  Register t3,
3892                                        Register tc0, Register tc1, Register tc2, Register tc3) {
3893  assert_different_registers(crc, buf, len, table);
3894
3895  Label L_mainLoop, L_tail;
3896  Register  tmp          = t0;
3897  Register  data         = t0;
3898  Register  tmp2         = t1;
3899  const int mainLoop_stepping  = 4;
3900  const int tailLoop_stepping  = 1;
3901  const int log_stepping       = exact_log2(mainLoop_stepping);
3902  const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
3903  const int complexThreshold   = 2*mainLoop_stepping;
3904
3905  // Don't test for len <= 0 here. This pathological case should not occur anyway.
3906  // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
3907  // The situation itself is detected and handled correctly by the conditional branches
3908  // following  aghi(len, -stepping) and aghi(len, +stepping).
3909  assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
3910
3911  BLOCK_COMMENT("kernel_crc32_1word {");
3912
3913  nand(crc, crc, crc);                           // ~c
3914
3915  // Check for short (<mainLoop_stepping) buffer.
3916  cmpdi(CCR0, len, complexThreshold);
3917  blt(CCR0, L_tail);
3918
3919  // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
3920  // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
3921  {
3922    // Align buf addr to mainLoop_stepping boundary.
3923    neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
3924    rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63.
3925
3926    if (complexThreshold > mainLoop_stepping) {
3927      sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3928    } else {
3929      sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
3930      cmpdi(CCR0, tmp, mainLoop_stepping);
3931      blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
3932      mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3933    }
3934    update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
3935  }
3936
3937  srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
3938  andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
3939  mtctr(tmp2);
3940
3941#ifdef VM_LITTLE_ENDIAN
3942  Register crc_rv = crc;
3943#else
3944  Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
3945                                                 // Occupies tmp, but frees up crc.
3946  load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
3947  tmp = crc;
3948#endif
3949
3950  int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3951
3952  align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
3953  BIND(L_mainLoop);
3954    update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3955    bdnz(L_mainLoop);
3956
3957#ifndef VM_LITTLE_ENDIAN
3958  load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
3959  tmp = crc_rv;                                  // Tmp uses its original register again.
3960#endif
3961
3962  // Restore original table address for tailLoop.
3963  if (reconstructTableOffset != 0) {
3964    addi(table, table, -reconstructTableOffset);
3965  }
3966
3967  // Process last few (<complexThreshold) bytes of buffer.
3968  BIND(L_tail);
3969  update_byteLoop_crc32(crc, buf, len, table, data, false, false);
3970
3971  nand(crc, crc, crc);                           // ~c
3972  BLOCK_COMMENT("} kernel_crc32_1word");
3973}
3974
3975/**
3976 * @param crc   register containing existing CRC (32-bit)
3977 * @param buf   register pointing to input byte buffer (byte*)
3978 * @param len   register containing number of bytes
3979 * @param table register pointing to CRC table
3980 *
3981 * Uses R7_ARG5, R8_ARG6 as work registers.
3982 */
3983void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
3984                                        Register t0,  Register t1,  Register t2,  Register t3) {
3985  assert_different_registers(crc, buf, len, table);
3986
3987  Register  data = t0;                   // Holds the current byte to be folded into crc.
3988
3989  BLOCK_COMMENT("kernel_crc32_1byte {");
3990
3991  // Process all bytes in a single-byte loop.
3992  update_byteLoop_crc32(crc, buf, len, table, data, true, true);
3993
3994  BLOCK_COMMENT("} kernel_crc32_1byte");
3995}
3996
3997void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp) {
3998  assert_different_registers(crc, buf, /* len,  not used!! */ table, tmp);
3999
4000  BLOCK_COMMENT("kernel_crc32_singleByte:");
4001  nand(crc, crc, crc);       // ~c
4002
4003  lbz(tmp, 0, buf);          // Byte from buffer, zero-extended.
4004  update_byte_crc32(crc, tmp, table);
4005
4006  nand(crc, crc, crc);       // ~c
4007}
4008
4009// dest_lo += src1 + src2
4010// dest_hi += carry1 + carry2
4011void MacroAssembler::add2_with_carry(Register dest_hi,
4012                                     Register dest_lo,
4013                                     Register src1, Register src2) {
4014  li(R0, 0);
4015  addc(dest_lo, dest_lo, src1);
4016  adde(dest_hi, dest_hi, R0);
4017  addc(dest_lo, dest_lo, src2);
4018  adde(dest_hi, dest_hi, R0);
4019}
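// Illustrative sketch (editorial addition): the addc/adde sequence above
// behaves like a 128-bit accumulation with dest_hi:dest_lo as one unsigned
// value, shown here using GCC's unsigned __int128 purely for illustration:
//
//   unsigned __int128 acc = ((unsigned __int128)dest_hi << 64) | dest_lo;
//   acc += src1;                       // first addc/adde pair
//   acc += src2;                       // second addc/adde pair
//   dest_lo = (uint64_t)acc;
//   dest_hi = (uint64_t)(acc >> 64);   // carries of both additions land here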
4020
4021// Multiply 64 bit by 64 bit first loop.
4022void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
4023                                           Register x_xstart,
4024                                           Register y, Register y_idx,
4025                                           Register z,
4026                                           Register carry,
4027                                           Register product_high, Register product,
4028                                           Register idx, Register kdx,
4029                                           Register tmp) {
4030  //  jlong carry, x[], y[], z[];
4031  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4032  //    huge_128 product = y[idx] * x[xstart] + carry;
4033  //    z[kdx] = (jlong)product;
4034  //    carry  = (jlong)(product >>> 64);
4035  //  }
4036  //  z[xstart] = carry;
4037
4038  Label L_first_loop, L_first_loop_exit;
4039  Label L_one_x, L_one_y, L_multiply;
4040
4041  addic_(xstart, xstart, -1);
4042  blt(CCR0, L_one_x);   // Special case: length of x is 1.
4043
4044  // Load next two integers of x.
4045  sldi(tmp, xstart, LogBytesPerInt);
4046  ldx(x_xstart, x, tmp);
4047#ifdef VM_LITTLE_ENDIAN
4048  rldicl(x_xstart, x_xstart, 32, 0);
4049#endif
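  // Editorial note: x is an array of 32-bit ints with the more significant int
  // at the lower index. On little-endian the 64-bit load above puts the
  // lower-index int into the low half, so rotating by 32 swaps the halves and
  // restores the big-endian ordering the multiply expects.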
4050
4051  align(32, 16);
4052  bind(L_first_loop);
4053
4054  cmpdi(CCR0, idx, 1);
4055  blt(CCR0, L_first_loop_exit);
4056  addi(idx, idx, -2);
4057  beq(CCR0, L_one_y);
4058
4059  // Load next two integers of y.
4060  sldi(tmp, idx, LogBytesPerInt);
4061  ldx(y_idx, y, tmp);
4062#ifdef VM_LITTLE_ENDIAN
4063  rldicl(y_idx, y_idx, 32, 0);
4064#endif
4065
4066
4067  bind(L_multiply);
4068  multiply64(product_high, product, x_xstart, y_idx);
4069
4070  li(tmp, 0);
4071  addc(product, product, carry);         // Add carry to result.
4072  adde(product_high, product_high, tmp); // Add carry of the last addition.
4073  addi(kdx, kdx, -2);
4074
4075  // Store result.
4076#ifdef VM_LITTLE_ENDIAN
4077  rldicl(product, product, 32, 0);
4078#endif
4079  sldi(tmp, kdx, LogBytesPerInt);
4080  stdx(product, z, tmp);
4081  mr_if_needed(carry, product_high);
4082  b(L_first_loop);
4083
4084
4085  bind(L_one_y); // Load one 32 bit portion of y as (0,value).
4086
4087  lwz(y_idx, 0, y);
4088  b(L_multiply);
4089
4090
4091  bind(L_one_x); // Load one 32 bit portion of x as (0,value).
4092
4093  lwz(x_xstart, 0, x);
4094  b(L_first_loop);
4095
4096  bind(L_first_loop_exit);
4097}
4098
4099// Multiply 64 bit by 64 bit and add 128 bit.
4100void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
4101                                            Register z, Register yz_idx,
4102                                            Register idx, Register carry,
4103                                            Register product_high, Register product,
4104                                            Register tmp, int offset) {
4105
4106  //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
4107  //  z[kdx] = (jlong)product;
4108
4109  sldi(tmp, idx, LogBytesPerInt);
4110  if (offset) {
4111    addi(tmp, tmp, offset);
4112  }
4113  ldx(yz_idx, y, tmp);
4114#ifdef VM_LITTLE_ENDIAN
4115  rldicl(yz_idx, yz_idx, 32, 0);
4116#endif
4117
4118  multiply64(product_high, product, x_xstart, yz_idx);
4119  ldx(yz_idx, z, tmp);
4120#ifdef VM_LITTLE_ENDIAN
4121  rldicl(yz_idx, yz_idx, 32, 0);
4122#endif
4123
4124  add2_with_carry(product_high, product, carry, yz_idx);
4125
4126  sldi(tmp, idx, LogBytesPerInt);
4127  if (offset) {
4128    addi(tmp, tmp, offset);
4129  }
4130#ifdef VM_LITTLE_ENDIAN
4131  rldicl(product, product, 32, 0);
4132#endif
4133  stdx(product, z, tmp);
4134}
4135
4136// Multiply 128 bit by 128 bit. Unrolled inner loop.
4137void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
4138                                             Register y, Register z,
4139                                             Register yz_idx, Register idx, Register carry,
4140                                             Register product_high, Register product,
4141                                             Register carry2, Register tmp) {
4142
4143  //  jlong carry, x[], y[], z[];
4144  //  int kdx = ystart+1;
4145  //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
4146  //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
4147  //    z[kdx+idx+1] = (jlong)product;
4148  //    jlong carry2 = (jlong)(product >>> 64);
4149  //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
4150  //    z[kdx+idx] = (jlong)product;
4151  //    carry = (jlong)(product >>> 64);
4152  //  }
4153  //  idx += 2;
4154  //  if (idx > 0) {
4155  //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
4156  //    z[kdx+idx] = (jlong)product;
4157  //    carry = (jlong)(product >>> 64);
4158  //  }
4159
4160  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
4161  const Register jdx = R0;
4162
4163  // Scale the index.
4164  srdi_(jdx, idx, 2);
4165  beq(CCR0, L_third_loop_exit);
4166  mtctr(jdx);
4167
4168  align(32, 16);
4169  bind(L_third_loop);
4170
4171  addi(idx, idx, -4);
4172
4173  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
4174  mr_if_needed(carry2, product_high);
4175
4176  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
4177  mr_if_needed(carry, product_high);
4178  bdnz(L_third_loop);
4179
4180  bind(L_third_loop_exit);  // Handle any left-over operand parts.
4181
4182  andi_(idx, idx, 0x3);
4183  beq(CCR0, L_post_third_loop_done);
4184
4185  Label L_check_1;
4186
4187  addic_(idx, idx, -2);
4188  blt(CCR0, L_check_1);
4189
4190  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
4191  mr_if_needed(carry, product_high);
4192
4193  bind(L_check_1);
4194
4195  addi(idx, idx, 0x2);
4196  andi_(idx, idx, 0x1);
4197  addic_(idx, idx, -1);
4198  blt(CCR0, L_post_third_loop_done);
4199
4200  sldi(tmp, idx, LogBytesPerInt);
4201  lwzx(yz_idx, y, tmp);
4202  multiply64(product_high, product, x_xstart, yz_idx);
4203  lwzx(yz_idx, z, tmp);
4204
4205  add2_with_carry(product_high, product, yz_idx, carry);
4206
4207  sldi(tmp, idx, LogBytesPerInt);
4208  stwx(product, z, tmp);
4209  srdi(product, product, 32);
4210
4211  sldi(product_high, product_high, 32);
4212  orr(product, product, product_high);
4213  mr_if_needed(carry, product);
4214
4215  bind(L_post_third_loop_done);
4216}   // multiply_128_x_128_loop
4217
4218void MacroAssembler::multiply_to_len(Register x, Register xlen,
4219                                     Register y, Register ylen,
4220                                     Register z, Register zlen,
4221                                     Register tmp1, Register tmp2,
4222                                     Register tmp3, Register tmp4,
4223                                     Register tmp5, Register tmp6,
4224                                     Register tmp7, Register tmp8,
4225                                     Register tmp9, Register tmp10,
4226                                     Register tmp11, Register tmp12,
4227                                     Register tmp13) {
4228
4229  ShortBranchVerifier sbv(this);
4230
4231  assert_different_registers(x, xlen, y, ylen, z, zlen,
4232                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
4233  assert_different_registers(x, xlen, y, ylen, z, zlen,
4234                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
4235  assert_different_registers(x, xlen, y, ylen, z, zlen,
4236                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
4237
4238  const Register idx = tmp1;
4239  const Register kdx = tmp2;
4240  const Register xstart = tmp3;
4241
4242  const Register y_idx = tmp4;
4243  const Register carry = tmp5;
4244  const Register product = tmp6;
4245  const Register product_high = tmp7;
4246  const Register x_xstart = tmp8;
4247  const Register tmp = tmp9;
4248
4249  // First Loop.
4250  //
4251  //  final static long LONG_MASK = 0xffffffffL;
4252  //  int xstart = xlen - 1;
4253  //  int ystart = ylen - 1;
4254  //  long carry = 0;
4255  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4256  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
4257  //    z[kdx] = (int)product;
4258  //    carry = product >>> 32;
4259  //  }
4260  //  z[xstart] = (int)carry;
4261
4262  mr_if_needed(idx, ylen);        // idx = ylen
4263  mr_if_needed(kdx, zlen);        // kdx = xlen + ylen
4264  li(carry, 0);                   // carry = 0
4265
4266  Label L_done;
4267
4268  addic_(xstart, xlen, -1);
4269  blt(CCR0, L_done);
4270
4271  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
4272                        carry, product_high, product, idx, kdx, tmp);
4273
4274  Label L_second_loop;
4275
4276  cmpdi(CCR0, kdx, 0);
4277  beq(CCR0, L_second_loop);
4278
4279  Label L_carry;
4280
4281  addic_(kdx, kdx, -1);
4282  beq(CCR0, L_carry);
4283
4284  // Store lower 32 bits of carry.
4285  sldi(tmp, kdx, LogBytesPerInt);
4286  stwx(carry, z, tmp);
4287  srdi(carry, carry, 32);
4288  addi(kdx, kdx, -1);
4289
4290
4291  bind(L_carry);
4292
4293  // Store upper 32 bits of carry.
4294  sldi(tmp, kdx, LogBytesPerInt);
4295  stwx(carry, z, tmp);
4296
4297  // Second and third (nested) loops.
4298  //
4299  //  for (int i = xstart-1; i >= 0; i--) { // Second loop
4300  //    carry = 0;
4301  //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
4302  //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
4303  //                     (z[k] & LONG_MASK) + carry;
4304  //      z[k] = (int)product;
4305  //      carry = product >>> 32;
4306  //    }
4307  //    z[i] = (int)carry;
4308  //  }
4309  //
4310  //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = x_xstart
4311
4312  bind(L_second_loop);
4313
4314  li(carry, 0);                   // carry = 0;
4315
4316  addic_(xstart, xstart, -1);     // i = xstart-1;
4317  blt(CCR0, L_done);
4318
4319  Register zsave = tmp10;
4320
4321  mr(zsave, z);
4322
4323
4324  Label L_last_x;
4325
4326  sldi(tmp, xstart, LogBytesPerInt);
4327  add(z, z, tmp);                 // z = z + k - j
4328  addi(z, z, 4);
4329  addic_(xstart, xstart, -1);     // i = xstart-1;
4330  blt(CCR0, L_last_x);
4331
4332  sldi(tmp, xstart, LogBytesPerInt);
4333  ldx(x_xstart, x, tmp);
4334#ifdef VM_LITTLE_ENDIAN
4335  rldicl(x_xstart, x_xstart, 32, 0);
4336#endif
4337
4338
4339  Label L_third_loop_prologue;
4340
4341  bind(L_third_loop_prologue);
4342
4343  Register xsave = tmp11;
4344  Register xlensave = tmp12;
4345  Register ylensave = tmp13;
4346
4347  mr(xsave, x);
4348  mr(xlensave, xstart);
4349  mr(ylensave, ylen);
4350
4351
4352  multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
4353                          carry, product_high, product, x, tmp);
4354
4355  mr(z, zsave);
4356  mr(x, xsave);
4357  mr(xlen, xlensave);   // This is the decrement of the loop counter!
4358  mr(ylen, ylensave);
4359
4360  addi(tmp3, xlen, 1);
4361  sldi(tmp, tmp3, LogBytesPerInt);
4362  stwx(carry, z, tmp);
4363  addic_(tmp3, tmp3, -1);
4364  blt(CCR0, L_done);
4365
4366  srdi(carry, carry, 32);
4367  sldi(tmp, tmp3, LogBytesPerInt);
4368  stwx(carry, z, tmp);
4369  b(L_second_loop);
4370
4371  // The following infrequently executed code is moved outside the loops.
4372  bind(L_last_x);
4373
4374  lwz(x_xstart, 0, x);
4375  b(L_third_loop_prologue);
4376
4377  bind(L_done);
4378}   // multiply_to_len
4379
4380void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {
4381#ifdef ASSERT
4382  Label ok;
4383  if (check_equal) {
4384    beq(CCR0, ok);
4385  } else {
4386    bne(CCR0, ok);
4387  }
4388  stop(msg, id);
4389  bind(ok);
4390#endif
4391}
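// Illustrative usage (editorial addition; register and values are hypothetical):
// asm_assert expects a preceding compare that set CCR0, e.g.
//
//   cmpdi(CCR0, reg, 0);
//   asm_assert(/*check_equal=*/true, "reg must be zero", 0x100);
//
// In builds without ASSERT the branch/stop sequence is not emitted.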
4392
4393void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
4394                                          Register mem_base, const char* msg, int id) {
4395#ifdef ASSERT
4396  switch (size) {
4397    case 4:
4398      lwz(R0, mem_offset, mem_base);
4399      cmpwi(CCR0, R0, 0);
4400      break;
4401    case 8:
4402      ld(R0, mem_offset, mem_base);
4403      cmpdi(CCR0, R0, 0);
4404      break;
4405    default:
4406      ShouldNotReachHere();
4407  }
4408  asm_assert(check_equal, msg, id);
4409#endif // ASSERT
4410}
4411
4412void MacroAssembler::verify_thread() {
4413  if (VerifyThread) {
4414    unimplemented("'VerifyThread' currently not implemented on PPC");
4415  }
4416}
4417
4418// READ: oop. KILL: R0. May also clobber volatile floating-point registers.
4419void MacroAssembler::verify_oop(Register oop, const char* msg) {
4420  if (!VerifyOops) {
4421    return;
4422  }
4423
4424  address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4425  const Register tmp = R11; // Will be preserved.
4426  const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4427  save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4428
4429  mr_if_needed(R4_ARG2, oop);
4430  save_LR_CR(tmp); // save in old frame
4431  push_frame_reg_args(nbytes_save, tmp);
4432  // load FunctionDescriptor** / entry_address *
4433  load_const_optimized(tmp, fd, R0);
4434  // load FunctionDescriptor* / entry_address
4435  ld(tmp, 0, tmp);
4436  load_const_optimized(R3_ARG1, (address)msg, R0);
4437  // Call destination for its side effect.
4438  call_c(tmp);
4439
4440  pop_frame();
4441  restore_LR_CR(tmp);
4442  restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4443}
4444
4445void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
4446  if (!VerifyOops) {
4447    return;
4448  }
4449
4450  address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4451  const Register tmp = R11; // Will be preserved.
4452  const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4453  save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4454
4455  ld(R4_ARG2, offs, base);
4456  save_LR_CR(tmp); // save in old frame
4457  push_frame_reg_args(nbytes_save, tmp);
4458  // load FunctionDescriptor** / entry_address *
4459  load_const_optimized(tmp, fd, R0);
4460  // load FunctionDescriptor* / entry_address
4461  ld(tmp, 0, tmp);
4462  load_const_optimized(R3_ARG1, (address)msg, R0);
4463  // Call destination for its side effect.
4464  call_c(tmp);
4465
4466  pop_frame();
4467  restore_LR_CR(tmp);
4468  restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4469}
4470
4471const char* stop_types[] = {
4472  "stop",
4473  "untested",
4474  "unimplemented",
4475  "shouldnotreachhere"
4476};
4477
4478static void stop_on_request(int tp, const char* msg) {
4479  tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg);
4480  guarantee(false, "PPC assembly code requires stop: %s", msg);
4481}
4482
4483// Call a C-function that prints output.
4484void MacroAssembler::stop(int type, const char* msg, int id) {
4485#ifndef PRODUCT
4486  block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg));
4487#else
4488  block_comment("stop {");
4489#endif
4490
4491  // setup arguments
4492  load_const_optimized(R3_ARG1, type);
4493  load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0);
4494  call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2);
4495  illtrap();
4496  emit_int32(id);
4497  block_comment("} stop;");
4498}
4499
4500#ifndef PRODUCT
4501// Write pattern 0x0101010101010101 in memory region [low-before, high+after].
4502// Val, addr are temp registers.
4503// If low == addr, addr is killed.
4504// High is preserved.
4505void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
4506  if (!ZapMemory) return;
4507
4508  assert_different_registers(low, val);
4509
4510  BLOCK_COMMENT("zap memory region {");
4511  load_const_optimized(val, 0x0101010101010101);
4512  int size = before + after;
4513  if (low == high && size < 5 && size > 0) {
4514    int offset = -before*BytesPerWord;
4515    for (int i = 0; i < size; ++i) {
4516      std(val, offset, low);
4517      offset += (1*BytesPerWord);
4518    }
4519  } else {
4520    addi(addr, low, -before*BytesPerWord);
4521    assert_different_registers(high, val);
4522    if (after) addi(high, high, after * BytesPerWord);
4523    Label loop;
4524    bind(loop);
4525    std(val, 0, addr);
4526    addi(addr, addr, 8);
4527    cmpd(CCR6, addr, high);
4528    ble(CCR6, loop);
4529    if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
4530  }
4531  BLOCK_COMMENT("} zap memory region");
4532}
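// Illustrative usage (editorial addition; the register names are hypothetical):
// zap one word before and after a region delimited by two pointer registers,
// with two scratch registers for the pattern value and the running address:
//
//   zap_from_to(Rlow, 1, Rhigh, 1, Rval, Raddr);
//
// With low != high this stores the 0x0101... pattern doubleword-wise, starting
// at low - 1*BytesPerWord and continuing up to roughly high + 1*BytesPerWord.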
4533
4534#endif // !PRODUCT
4535
4536SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
4537  int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
4538  assert(sizeof(bool) == 1, "PowerPC ABI");
4539  masm->lbz(temp, simm16_offset, temp);
4540  masm->cmpwi(CCR0, temp, 0);
4541  masm->beq(CCR0, _label);
4542}
4543
4544SkipIfEqualZero::~SkipIfEqualZero() {
4545  _masm->bind(_label);
4546}
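// Illustrative usage (editorial addition; the flag name is hypothetical): the
// constructor emits load/compare/branch code that jumps over the guarded
// instructions when *flag_addr is zero, and the destructor binds the skip label:
//
//   {
//     SkipIfEqualZero skip_if_off(this, R11, &SomeDiagnosticFlag); // hypothetical flag
//     // Instructions emitted here execute at runtime only if the flag is set.
//   }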
4547