1/*
2 * Copyright (c) 1997, 2017, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2012, 2017 SAP SE. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26#include "precompiled.hpp"
27#include "asm/macroAssembler.inline.hpp"
28#include "compiler/disassembler.hpp"
29#include "gc/shared/cardTableModRefBS.hpp"
30#include "gc/shared/collectedHeap.inline.hpp"
31#include "interpreter/interpreter.hpp"
32#include "memory/resourceArea.hpp"
33#include "nativeInst_ppc.hpp"
34#include "prims/methodHandles.hpp"
35#include "runtime/biasedLocking.hpp"
36#include "runtime/icache.hpp"
37#include "runtime/interfaceSupport.hpp"
38#include "runtime/objectMonitor.hpp"
39#include "runtime/os.hpp"
40#include "runtime/sharedRuntime.hpp"
41#include "runtime/stubRoutines.hpp"
42#include "utilities/macros.hpp"
43#if INCLUDE_ALL_GCS
44#include "gc/g1/g1CollectedHeap.inline.hpp"
45#include "gc/g1/g1SATBCardTableModRefBS.hpp"
46#include "gc/g1/heapRegion.hpp"
47#endif // INCLUDE_ALL_GCS
48#ifdef COMPILER2
49#include "opto/intrinsicnode.hpp"
50#endif
51
52#ifdef PRODUCT
53#define BLOCK_COMMENT(str) // nothing
54#else
55#define BLOCK_COMMENT(str) block_comment(str)
56#endif
57#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
58
59#ifdef ASSERT
60// On RISC, there's no benefit to verifying instruction boundaries.
61bool AbstractAssembler::pd_check_instruction_mark() { return false; }
62#endif
63
64void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
65  assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
66  if (Assembler::is_simm(si31, 16)) {
67    ld(d, si31, a);
68    if (emit_filler_nop) nop();
69  } else {
70    const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
71    const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
72    addis(d, a, hi);
73    ld(d, lo, d);
74  }
75}
76
77void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
78  assert_different_registers(d, a);
79  ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
80}
81
82void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
83                                      size_t size_in_bytes, bool is_signed) {
84  switch (size_in_bytes) {
85  case  8:              ld(dst, offs, base);                         break;
86  case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
87  case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
88  case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
89  default:  ShouldNotReachHere();
90  }
91}
92
93void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
94                                       size_t size_in_bytes) {
95  switch (size_in_bytes) {
96  case  8:  std(dst, offs, base); break;
97  case  4:  stw(dst, offs, base); break;
98  case  2:  sth(dst, offs, base); break;
99  case  1:  stb(dst, offs, base); break;
100  default:  ShouldNotReachHere();
101  }
102}
103
104void MacroAssembler::align(int modulus, int max, int rem) {
105  int padding = (rem + modulus - (offset() % modulus)) % modulus;
106  if (padding > max) return;
107  for (int c = (padding >> 2); c > 0; --c) { nop(); }
108}
109
110// Issue instructions that calculate given TOC from global TOC.
111void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
112                                                       bool add_relocation, bool emit_dummy_addr) {
113  int offset = -1;
114  if (emit_dummy_addr) {
115    offset = -128; // dummy address
116  } else if (addr != (address)(intptr_t)-1) {
117    offset = MacroAssembler::offset_to_global_toc(addr);
118  }
119
120  if (hi16) {
121    addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
122  }
123  if (lo16) {
124    if (add_relocation) {
125      // Relocate at the addi to avoid confusion with a load from the method's TOC.
126      relocate(internal_word_Relocation::spec(addr));
127    }
128    addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
129  }
130}
131
132int MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
133  const int offset = MacroAssembler::offset_to_global_toc(addr);
134
135  const address inst2_addr = a;
136  const int inst2 = *(int *)inst2_addr;
137
138  // The relocation points to the second instruction, the addi,
139  // and the addi reads and writes the same register dst.
140  const int dst = inv_rt_field(inst2);
141  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
142
143  // Now, find the preceding addis which writes to dst.
144  int inst1 = 0;
145  address inst1_addr = inst2_addr - BytesPerInstWord;
146  while (inst1_addr >= bound) {
147    inst1 = *(int *) inst1_addr;
148    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
149      // Stop, found the addis which writes dst.
150      break;
151    }
152    inst1_addr -= BytesPerInstWord;
153  }
154
155  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
156  set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
157  set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
158  return (int)((intptr_t)addr - (intptr_t)inst1_addr);
159}
160
161address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
162  const address inst2_addr = a;
163  const int inst2 = *(int *)inst2_addr;
164
165  // The relocation points to the second instruction, the addi,
166  // and the addi reads and writes the same register dst.
167  const int dst = inv_rt_field(inst2);
168  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
169
170  // Now, find the preceding addis which writes to dst.
171  int inst1 = 0;
172  address inst1_addr = inst2_addr - BytesPerInstWord;
173  while (inst1_addr >= bound) {
174    inst1 = *(int *) inst1_addr;
175    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
176      // stop, found the addis which writes dst
177      break;
178    }
179    inst1_addr -= BytesPerInstWord;
180  }
181
182  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
183
184  int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
185  // -1 is a special case
186  if (offset == -1) {
187    return (address)(intptr_t)-1;
188  } else {
189    return global_toc() + offset;
190  }
191}
192
193#ifdef _LP64
194// Patch compressed oops or klass constants.
195// Assembler sequence is
196// 1) compressed oops:
197//    lis  rx = const.hi
198//    ori rx = rx | const.lo
199// 2) compressed klass:
200//    lis  rx = const.hi
201//    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
202//    ori rx = rx | const.lo
203// Clrldi will be passed by.
204int MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
205  assert(UseCompressedOops, "Should only patch compressed oops");
206
207  const address inst2_addr = a;
208  const int inst2 = *(int *)inst2_addr;
209
210  // The relocation points to the second instruction, the ori,
211  // and the ori reads and writes the same register dst.
212  const int dst = inv_rta_field(inst2);
213  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
214  // Now, find the preceding addis which writes to dst.
215  int inst1 = 0;
216  address inst1_addr = inst2_addr - BytesPerInstWord;
217  bool inst1_found = false;
218  while (inst1_addr >= bound) {
219    inst1 = *(int *)inst1_addr;
220    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
221    inst1_addr -= BytesPerInstWord;
222  }
223  assert(inst1_found, "inst is not lis");
224
225  int xc = (data >> 16) & 0xffff;
226  int xd = (data >>  0) & 0xffff;
227
228  set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
229  set_imm((int *)inst2_addr,        (xd)); // unsigned int
230  return (int)((intptr_t)inst2_addr - (intptr_t)inst1_addr);
231}
232
233// Get compressed oop or klass constant.
234narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
235  assert(UseCompressedOops, "Should only patch compressed oops");
236
237  const address inst2_addr = a;
238  const int inst2 = *(int *)inst2_addr;
239
240  // The relocation points to the second instruction, the ori,
241  // and the ori reads and writes the same register dst.
242  const int dst = inv_rta_field(inst2);
243  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
244  // Now, find the preceding lis which writes to dst.
245  int inst1 = 0;
246  address inst1_addr = inst2_addr - BytesPerInstWord;
247  bool inst1_found = false;
248
249  while (inst1_addr >= bound) {
250    inst1 = *(int *) inst1_addr;
251    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
252    inst1_addr -= BytesPerInstWord;
253  }
254  assert(inst1_found, "inst is not lis");
255
256  uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
257  uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);
258
259  return (int) (xl | xh);
260}
261#endif // _LP64
262
263// Returns true if successful.
264bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
265                                                Register toc, bool fixed_size) {
266  int toc_offset = 0;
267  // Use RelocationHolder::none for the constant pool entry, otherwise
268  // we will end up with a failing NativeCall::verify(x) where x is
269  // the address of the constant pool entry.
270  // FIXME: We should insert relocation information for oops at the constant
271  // pool entries instead of inserting it at the loads; patching of a constant
272  // pool entry should be less expensive.
273  address const_address = address_constant((address)a.value(), RelocationHolder::none);
274  if (const_address == NULL) { return false; } // allocation failure
275  // Relocate at the pc of the load.
276  relocate(a.rspec());
277  toc_offset = (int)(const_address - code()->consts()->start());
278  ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
279  return true;
280}
281
282bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
283  const address inst1_addr = a;
284  const int inst1 = *(int *)inst1_addr;
285
286   // The relocation points to the ld or the addis.
287   return (is_ld(inst1)) ||
288          (is_addis(inst1) && inv_ra_field(inst1) != 0);
289}
290
291int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
292  assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");
293
294  const address inst1_addr = a;
295  const int inst1 = *(int *)inst1_addr;
296
297  if (is_ld(inst1)) {
298    return inv_d1_field(inst1);
299  } else if (is_addis(inst1)) {
300    const int dst = inv_rt_field(inst1);
301
302    // Now, find the succeeding ld which reads and writes to dst.
303    address inst2_addr = inst1_addr + BytesPerInstWord;
304    int inst2 = 0;
305    while (true) {
306      inst2 = *(int *) inst2_addr;
307      if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
308        // Stop, found the ld which reads and writes dst.
309        break;
310      }
311      inst2_addr += BytesPerInstWord;
312    }
313    return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
314  }
315  ShouldNotReachHere();
316  return 0;
317}
318
319// Get the constant from a `load_const' sequence.
320long MacroAssembler::get_const(address a) {
321  assert(is_load_const_at(a), "not a load of a constant");
322  const int *p = (const int*) a;
323  unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
324  if (is_ori(*(p+1))) {
325    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
326    x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
327    x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
328  } else if (is_lis(*(p+1))) {
329    x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
330    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
331    x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
332  } else {
333    ShouldNotReachHere();
334    return (long) 0;
335  }
336  return (long) x;
337}
338
339// Patch the 64 bit constant of a `load_const' sequence. This is a low
340// level procedure. It neither flushes the instruction cache nor is it
341// mt safe.
342void MacroAssembler::patch_const(address a, long x) {
343  assert(is_load_const_at(a), "not a load of a constant");
344  int *p = (int*) a;
345  if (is_ori(*(p+1))) {
346    set_imm(0 + p, (x >> 48) & 0xffff);
347    set_imm(1 + p, (x >> 32) & 0xffff);
348    set_imm(3 + p, (x >> 16) & 0xffff);
349    set_imm(4 + p, x & 0xffff);
350  } else if (is_lis(*(p+1))) {
351    set_imm(0 + p, (x >> 48) & 0xffff);
352    set_imm(2 + p, (x >> 32) & 0xffff);
353    set_imm(1 + p, (x >> 16) & 0xffff);
354    set_imm(3 + p, x & 0xffff);
355  } else {
356    ShouldNotReachHere();
357  }
358}
359
360AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
361  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
362  int index = oop_recorder()->allocate_metadata_index(obj);
363  RelocationHolder rspec = metadata_Relocation::spec(index);
364  return AddressLiteral((address)obj, rspec);
365}
366
367AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
368  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
369  int index = oop_recorder()->find_index(obj);
370  RelocationHolder rspec = metadata_Relocation::spec(index);
371  return AddressLiteral((address)obj, rspec);
372}
373
374AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
375  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
376  int oop_index = oop_recorder()->allocate_oop_index(obj);
377  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
378}
379
380AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
381  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
382  int oop_index = oop_recorder()->find_index(obj);
383  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
384}
385
386RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
387                                                      Register tmp, int offset) {
388  intptr_t value = *delayed_value_addr;
389  if (value != 0) {
390    return RegisterOrConstant(value + offset);
391  }
392
393  // Load indirectly to solve generation ordering problem.
394  // static address, no relocation
395  int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);
396  ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)
397
398  if (offset != 0) {
399    addi(tmp, tmp, offset);
400  }
401
402  return RegisterOrConstant(tmp);
403}
404
405#ifndef PRODUCT
406void MacroAssembler::pd_print_patched_instruction(address branch) {
407  Unimplemented(); // TODO: PPC port
408}
409#endif // ndef PRODUCT
410
411// Conditional far branch for destinations encodable in 24+2 bits.
412void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {
413
414  // If requested by flag optimize, relocate the bc_far as a
415  // runtime_call and prepare for optimizing it when the code gets
416  // relocated.
417  if (optimize == bc_far_optimize_on_relocate) {
418    relocate(relocInfo::runtime_call_type);
419  }
420
421  // variant 2:
422  //
423  //    b!cxx SKIP
424  //    bxx   DEST
425  //  SKIP:
426  //
427
428  const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
429                                                opposite_bcond(inv_boint_bcond(boint)));
430
431  // We emit two branches.
432  // First, a conditional branch which jumps around the far branch.
433  const address not_taken_pc = pc() + 2 * BytesPerInstWord;
434  const address bc_pc        = pc();
435  bc(opposite_boint, biint, not_taken_pc);
436
437  const int bc_instr = *(int*)bc_pc;
438  assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
439  assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
440  assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
441                                     opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
442         "postcondition");
443  assert(biint == inv_bi_field(bc_instr), "postcondition");
444
445  // Second, an unconditional far branch which jumps to dest.
446  // Note: target(dest) remembers the current pc (see CodeSection::target)
447  //       and returns the current pc if the label is not bound yet; when
448  //       the label gets bound, the unconditional far branch will be patched.
449  const address target_pc = target(dest);
450  const address b_pc  = pc();
451  b(target_pc);
452
453  assert(not_taken_pc == pc(),                     "postcondition");
454  assert(dest.is_bound() || target_pc == b_pc, "postcondition");
455}
456
457// 1 or 2 instructions
458void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
459  if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
460    bc(boint, biint, dest);
461  } else {
462    bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
463  }
464}
465
466bool MacroAssembler::is_bc_far_at(address instruction_addr) {
467  return is_bc_far_variant1_at(instruction_addr) ||
468         is_bc_far_variant2_at(instruction_addr) ||
469         is_bc_far_variant3_at(instruction_addr);
470}
471
472address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
473  if (is_bc_far_variant1_at(instruction_addr)) {
474    const address instruction_1_addr = instruction_addr;
475    const int instruction_1 = *(int*)instruction_1_addr;
476    return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
477  } else if (is_bc_far_variant2_at(instruction_addr)) {
478    const address instruction_2_addr = instruction_addr + 4;
479    return bxx_destination(instruction_2_addr);
480  } else if (is_bc_far_variant3_at(instruction_addr)) {
481    return instruction_addr + 8;
482  }
483  // variant 4 ???
484  ShouldNotReachHere();
485  return NULL;
486}
487void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {
488
489  if (is_bc_far_variant3_at(instruction_addr)) {
490    // variant 3, far cond branch to the next instruction, already patched to nops:
491    //
492    //    nop
493    //    endgroup
494    //  SKIP/DEST:
495    //
496    return;
497  }
498
499  // first, extract boint and biint from the current branch
500  int boint = 0;
501  int biint = 0;
502
503  ResourceMark rm;
504  const int code_size = 2 * BytesPerInstWord;
505  CodeBuffer buf(instruction_addr, code_size);
506  MacroAssembler masm(&buf);
507  if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
508    // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
509    masm.nop();
510    masm.endgroup();
511  } else {
512    if (is_bc_far_variant1_at(instruction_addr)) {
513      // variant 1, the 1st instruction contains the destination address:
514      //
515      //    bcxx  DEST
516      //    nop
517      //
518      const int instruction_1 = *(int*)(instruction_addr);
519      boint = inv_bo_field(instruction_1);
520      biint = inv_bi_field(instruction_1);
521    } else if (is_bc_far_variant2_at(instruction_addr)) {
522      // variant 2, the 2nd instruction contains the destination address:
523      //
524      //    b!cxx SKIP
525      //    bxx   DEST
526      //  SKIP:
527      //
528      const int instruction_1 = *(int*)(instruction_addr);
529      boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
530          opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
531      biint = inv_bi_field(instruction_1);
532    } else {
533      // variant 4???
534      ShouldNotReachHere();
535    }
536
537    // second, set the new branch destination and optimize the code
538    if (dest != instruction_addr + 4 && // the bc_far is still unbound!
539        masm.is_within_range_of_bcxx(dest, instruction_addr)) {
540      // variant 1:
541      //
542      //    bcxx  DEST
543      //    nop
544      //
545      masm.bc(boint, biint, dest);
546      masm.nop();
547    } else {
548      // variant 2:
549      //
550      //    b!cxx SKIP
551      //    bxx   DEST
552      //  SKIP:
553      //
554      const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
555                                                    opposite_bcond(inv_boint_bcond(boint)));
556      const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
557      masm.bc(opposite_boint, biint, not_taken_pc);
558      masm.b(dest);
559    }
560  }
561  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
562}
563
564// Emit a NOT mt-safe patchable 64 bit absolute call/jump.
565void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
566  // get current pc
567  uint64_t start_pc = (uint64_t) pc();
568
569  const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
570  const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first
571
572  // relocate here
573  if (rt != relocInfo::none) {
574    relocate(rt);
575  }
576
577  if ( ReoptimizeCallSequences &&
578       (( link && is_within_range_of_b(dest, pc_of_bl)) ||
579        (!link && is_within_range_of_b(dest, pc_of_b)))) {
580    // variant 2:
581    // Emit an optimized, pc-relative call/jump.
582
583    if (link) {
584      // some padding
585      nop();
586      nop();
587      nop();
588      nop();
589      nop();
590      nop();
591
592      // do the call
593      assert(pc() == pc_of_bl, "just checking");
594      bl(dest, relocInfo::none);
595    } else {
596      // do the jump
597      assert(pc() == pc_of_b, "just checking");
598      b(dest, relocInfo::none);
599
600      // some padding
601      nop();
602      nop();
603      nop();
604      nop();
605      nop();
606      nop();
607    }
608
609    // Assert that we can identify the emitted call/jump.
610    assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
611           "can't identify emitted call");
612  } else {
613    // variant 1:
614    mr(R0, R11);  // spill R11 -> R0.
615
616    // Load the destination address into CTR,
617    // calculate destination relative to global toc.
618    calculate_address_from_global_toc(R11, dest, true, true, false);
619
620    mtctr(R11);
621    mr(R11, R0);  // spill R11 <- R0.
622    nop();
623
624    // do the call/jump
625    if (link) {
626      bctrl();
627    } else{
628      bctr();
629    }
630    // Assert that we can identify the emitted call/jump.
631    assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
632           "can't identify emitted call");
633  }
634
635  // Assert that we can identify the emitted call/jump.
636  assert(is_bxx64_patchable_at((address)start_pc, link),
637         "can't identify emitted call");
638  assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
639         "wrong encoding of dest address");
640}
641
642// Identify a bxx64_patchable instruction.
643bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
644  return is_bxx64_patchable_variant1b_at(instruction_addr, link)
645    //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
646      || is_bxx64_patchable_variant2_at(instruction_addr, link);
647}
648
649// Does the call64_patchable instruction use a pc-relative encoding of
650// the call destination?
651bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
652  // variant 2 is pc-relative
653  return is_bxx64_patchable_variant2_at(instruction_addr, link);
654}
655
656// Identify variant 1.
657bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
658  unsigned int* instr = (unsigned int*) instruction_addr;
659  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
660      && is_mtctr(instr[5]) // mtctr
661    && is_load_const_at(instruction_addr);
662}
663
664// Identify variant 1b: load destination relative to global toc.
665bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
666  unsigned int* instr = (unsigned int*) instruction_addr;
667  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
668    && is_mtctr(instr[3]) // mtctr
669    && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
670}
671
672// Identify variant 2.
673bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
674  unsigned int* instr = (unsigned int*) instruction_addr;
675  if (link) {
676    return is_bl (instr[6])  // bl dest is last
677      && is_nop(instr[0])  // nop
678      && is_nop(instr[1])  // nop
679      && is_nop(instr[2])  // nop
680      && is_nop(instr[3])  // nop
681      && is_nop(instr[4])  // nop
682      && is_nop(instr[5]); // nop
683  } else {
684    return is_b  (instr[0])  // b  dest is first
685      && is_nop(instr[1])  // nop
686      && is_nop(instr[2])  // nop
687      && is_nop(instr[3])  // nop
688      && is_nop(instr[4])  // nop
689      && is_nop(instr[5])  // nop
690      && is_nop(instr[6]); // nop
691  }
692}
693
694// Set dest address of a bxx64_patchable instruction.
695void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
696  ResourceMark rm;
697  int code_size = MacroAssembler::bxx64_patchable_size;
698  CodeBuffer buf(instruction_addr, code_size);
699  MacroAssembler masm(&buf);
700  masm.bxx64_patchable(dest, relocInfo::none, link);
701  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
702}
703
704// Get dest address of a bxx64_patchable instruction.
705address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
706  if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
707    return (address) (unsigned long) get_const(instruction_addr);
708  } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
709    unsigned int* instr = (unsigned int*) instruction_addr;
710    if (link) {
711      const int instr_idx = 6; // bl is last
712      int branchoffset = branch_destination(instr[instr_idx], 0);
713      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
714    } else {
715      const int instr_idx = 0; // b is first
716      int branchoffset = branch_destination(instr[instr_idx], 0);
717      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
718    }
719  // Load dest relative to global toc.
720  } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
721    return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
722                                                               instruction_addr);
723  } else {
724    ShouldNotReachHere();
725    return NULL;
726  }
727}
728
729// Uses ordering which corresponds to ABI:
730//    _savegpr0_14:  std  r14,-144(r1)
731//    _savegpr0_15:  std  r15,-136(r1)
732//    _savegpr0_16:  std  r16,-128(r1)
733void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
734  std(R14, offset, dst);   offset += 8;
735  std(R15, offset, dst);   offset += 8;
736  std(R16, offset, dst);   offset += 8;
737  std(R17, offset, dst);   offset += 8;
738  std(R18, offset, dst);   offset += 8;
739  std(R19, offset, dst);   offset += 8;
740  std(R20, offset, dst);   offset += 8;
741  std(R21, offset, dst);   offset += 8;
742  std(R22, offset, dst);   offset += 8;
743  std(R23, offset, dst);   offset += 8;
744  std(R24, offset, dst);   offset += 8;
745  std(R25, offset, dst);   offset += 8;
746  std(R26, offset, dst);   offset += 8;
747  std(R27, offset, dst);   offset += 8;
748  std(R28, offset, dst);   offset += 8;
749  std(R29, offset, dst);   offset += 8;
750  std(R30, offset, dst);   offset += 8;
751  std(R31, offset, dst);   offset += 8;
752
753  stfd(F14, offset, dst);   offset += 8;
754  stfd(F15, offset, dst);   offset += 8;
755  stfd(F16, offset, dst);   offset += 8;
756  stfd(F17, offset, dst);   offset += 8;
757  stfd(F18, offset, dst);   offset += 8;
758  stfd(F19, offset, dst);   offset += 8;
759  stfd(F20, offset, dst);   offset += 8;
760  stfd(F21, offset, dst);   offset += 8;
761  stfd(F22, offset, dst);   offset += 8;
762  stfd(F23, offset, dst);   offset += 8;
763  stfd(F24, offset, dst);   offset += 8;
764  stfd(F25, offset, dst);   offset += 8;
765  stfd(F26, offset, dst);   offset += 8;
766  stfd(F27, offset, dst);   offset += 8;
767  stfd(F28, offset, dst);   offset += 8;
768  stfd(F29, offset, dst);   offset += 8;
769  stfd(F30, offset, dst);   offset += 8;
770  stfd(F31, offset, dst);
771}
772
773// Uses ordering which corresponds to ABI:
774//    _restgpr0_14:  ld   r14,-144(r1)
775//    _restgpr0_15:  ld   r15,-136(r1)
776//    _restgpr0_16:  ld   r16,-128(r1)
777void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
778  ld(R14, offset, src);   offset += 8;
779  ld(R15, offset, src);   offset += 8;
780  ld(R16, offset, src);   offset += 8;
781  ld(R17, offset, src);   offset += 8;
782  ld(R18, offset, src);   offset += 8;
783  ld(R19, offset, src);   offset += 8;
784  ld(R20, offset, src);   offset += 8;
785  ld(R21, offset, src);   offset += 8;
786  ld(R22, offset, src);   offset += 8;
787  ld(R23, offset, src);   offset += 8;
788  ld(R24, offset, src);   offset += 8;
789  ld(R25, offset, src);   offset += 8;
790  ld(R26, offset, src);   offset += 8;
791  ld(R27, offset, src);   offset += 8;
792  ld(R28, offset, src);   offset += 8;
793  ld(R29, offset, src);   offset += 8;
794  ld(R30, offset, src);   offset += 8;
795  ld(R31, offset, src);   offset += 8;
796
797  // FP registers
798  lfd(F14, offset, src);   offset += 8;
799  lfd(F15, offset, src);   offset += 8;
800  lfd(F16, offset, src);   offset += 8;
801  lfd(F17, offset, src);   offset += 8;
802  lfd(F18, offset, src);   offset += 8;
803  lfd(F19, offset, src);   offset += 8;
804  lfd(F20, offset, src);   offset += 8;
805  lfd(F21, offset, src);   offset += 8;
806  lfd(F22, offset, src);   offset += 8;
807  lfd(F23, offset, src);   offset += 8;
808  lfd(F24, offset, src);   offset += 8;
809  lfd(F25, offset, src);   offset += 8;
810  lfd(F26, offset, src);   offset += 8;
811  lfd(F27, offset, src);   offset += 8;
812  lfd(F28, offset, src);   offset += 8;
813  lfd(F29, offset, src);   offset += 8;
814  lfd(F30, offset, src);   offset += 8;
815  lfd(F31, offset, src);
816}
817
818// For verify_oops.
819void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
820  std(R2,  offset, dst);   offset += 8;
821  std(R3,  offset, dst);   offset += 8;
822  std(R4,  offset, dst);   offset += 8;
823  std(R5,  offset, dst);   offset += 8;
824  std(R6,  offset, dst);   offset += 8;
825  std(R7,  offset, dst);   offset += 8;
826  std(R8,  offset, dst);   offset += 8;
827  std(R9,  offset, dst);   offset += 8;
828  std(R10, offset, dst);   offset += 8;
829  std(R11, offset, dst);   offset += 8;
830  std(R12, offset, dst);   offset += 8;
831
832  stfd(F0, offset, dst);   offset += 8;
833  stfd(F1, offset, dst);   offset += 8;
834  stfd(F2, offset, dst);   offset += 8;
835  stfd(F3, offset, dst);   offset += 8;
836  stfd(F4, offset, dst);   offset += 8;
837  stfd(F5, offset, dst);   offset += 8;
838  stfd(F6, offset, dst);   offset += 8;
839  stfd(F7, offset, dst);   offset += 8;
840  stfd(F8, offset, dst);   offset += 8;
841  stfd(F9, offset, dst);   offset += 8;
842  stfd(F10, offset, dst);  offset += 8;
843  stfd(F11, offset, dst);  offset += 8;
844  stfd(F12, offset, dst);  offset += 8;
845  stfd(F13, offset, dst);
846}
847
848// For verify_oops.
849void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
850  ld(R2,  offset, src);   offset += 8;
851  ld(R3,  offset, src);   offset += 8;
852  ld(R4,  offset, src);   offset += 8;
853  ld(R5,  offset, src);   offset += 8;
854  ld(R6,  offset, src);   offset += 8;
855  ld(R7,  offset, src);   offset += 8;
856  ld(R8,  offset, src);   offset += 8;
857  ld(R9,  offset, src);   offset += 8;
858  ld(R10, offset, src);   offset += 8;
859  ld(R11, offset, src);   offset += 8;
860  ld(R12, offset, src);   offset += 8;
861
862  lfd(F0, offset, src);   offset += 8;
863  lfd(F1, offset, src);   offset += 8;
864  lfd(F2, offset, src);   offset += 8;
865  lfd(F3, offset, src);   offset += 8;
866  lfd(F4, offset, src);   offset += 8;
867  lfd(F5, offset, src);   offset += 8;
868  lfd(F6, offset, src);   offset += 8;
869  lfd(F7, offset, src);   offset += 8;
870  lfd(F8, offset, src);   offset += 8;
871  lfd(F9, offset, src);   offset += 8;
872  lfd(F10, offset, src);  offset += 8;
873  lfd(F11, offset, src);  offset += 8;
874  lfd(F12, offset, src);  offset += 8;
875  lfd(F13, offset, src);
876}
877
878void MacroAssembler::save_LR_CR(Register tmp) {
879  mfcr(tmp);
880  std(tmp, _abi(cr), R1_SP);
881  mflr(tmp);
882  std(tmp, _abi(lr), R1_SP);
883  // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
884}
885
886void MacroAssembler::restore_LR_CR(Register tmp) {
887  assert(tmp != R1_SP, "must be distinct");
888  ld(tmp, _abi(lr), R1_SP);
889  mtlr(tmp);
890  ld(tmp, _abi(cr), R1_SP);
891  mtcr(tmp);
892}
893
894address MacroAssembler::get_PC_trash_LR(Register result) {
895  Label L;
896  bl(L);
897  bind(L);
898  address lr_pc = pc();
899  mflr(result);
900  return lr_pc;
901}
902
903void MacroAssembler::resize_frame(Register offset, Register tmp) {
904#ifdef ASSERT
905  assert_different_registers(offset, tmp, R1_SP);
906  andi_(tmp, offset, frame::alignment_in_bytes-1);
907  asm_assert_eq("resize_frame: unaligned", 0x204);
908#endif
909
910  // tmp <- *(SP)
911  ld(tmp, _abi(callers_sp), R1_SP);
912  // addr <- SP + offset;
913  // *(addr) <- tmp;
914  // SP <- addr
915  stdux(tmp, R1_SP, offset);
916}
917
918void MacroAssembler::resize_frame(int offset, Register tmp) {
919  assert(is_simm(offset, 16), "too big an offset");
920  assert_different_registers(tmp, R1_SP);
921  assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
922  // tmp <- *(SP)
923  ld(tmp, _abi(callers_sp), R1_SP);
924  // addr <- SP + offset;
925  // *(addr) <- tmp;
926  // SP <- addr
927  stdu(tmp, offset, R1_SP);
928}
929
930void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
931  // (addr == tmp1) || (addr == tmp2) is allowed here!
932  assert(tmp1 != tmp2, "must be distinct");
933
934  // compute offset w.r.t. current stack pointer
935  // tmp_1 <- addr - SP (!)
936  subf(tmp1, R1_SP, addr);
937
938  // atomically update SP keeping back link.
939  resize_frame(tmp1/* offset */, tmp2/* tmp */);
940}
941
942void MacroAssembler::push_frame(Register bytes, Register tmp) {
943#ifdef ASSERT
944  assert(bytes != R0, "r0 not allowed here");
945  andi_(R0, bytes, frame::alignment_in_bytes-1);
946  asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203);
947#endif
948  neg(tmp, bytes);
949  stdux(R1_SP, R1_SP, tmp);
950}
951
952// Push a frame of size `bytes'.
953void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
954  long offset = align_addr(bytes, frame::alignment_in_bytes);
955  if (is_simm(-offset, 16)) {
956    stdu(R1_SP, -offset, R1_SP);
957  } else {
958    load_const_optimized(tmp, -offset);
959    stdux(R1_SP, R1_SP, tmp);
960  }
961}
962
963// Push a frame of size `bytes' plus abi_reg_args on top.
964void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
965  push_frame(bytes + frame::abi_reg_args_size, tmp);
966}
967
968// Setup up a new C frame with a spill area for non-volatile GPRs and
969// additional space for local variables.
970void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
971                                                      Register tmp) {
972  push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
973}
974
975// Pop current C frame.
976void MacroAssembler::pop_frame() {
977  ld(R1_SP, _abi(callers_sp), R1_SP);
978}
979
980#if defined(ABI_ELFv2)
981address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
982  // TODO(asmundak): make sure the caller uses R12 as function descriptor
983  // most of the times.
984  if (R12 != r_function_entry) {
985    mr(R12, r_function_entry);
986  }
987  mtctr(R12);
988  // Do a call or a branch.
989  if (and_link) {
990    bctrl();
991  } else {
992    bctr();
993  }
994  _last_calls_return_pc = pc();
995
996  return _last_calls_return_pc;
997}
998
999// Call a C function via a function descriptor and use full C
1000// calling conventions. Updates and returns _last_calls_return_pc.
1001address MacroAssembler::call_c(Register r_function_entry) {
1002  return branch_to(r_function_entry, /*and_link=*/true);
1003}
1004
1005// For tail calls: only branch, don't link, so callee returns to caller of this function.
1006address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
1007  return branch_to(r_function_entry, /*and_link=*/false);
1008}
1009
1010address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
1011  load_const(R12, function_entry, R0);
1012  return branch_to(R12,  /*and_link=*/true);
1013}
1014
1015#else
1016// Generic version of a call to C function via a function descriptor
1017// with variable support for C calling conventions (TOC, ENV, etc.).
1018// Updates and returns _last_calls_return_pc.
1019address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
1020                                  bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
1021  // we emit standard ptrgl glue code here
1022  assert((function_descriptor != R0), "function_descriptor cannot be R0");
1023
1024  // retrieve necessary entries from the function descriptor
1025  ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
1026  mtctr(R0);
1027
1028  if (load_toc_of_callee) {
1029    ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
1030  }
1031  if (load_env_of_callee) {
1032    ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
1033  } else if (load_toc_of_callee) {
1034    li(R11, 0);
1035  }
1036
1037  // do a call or a branch
1038  if (and_link) {
1039    bctrl();
1040  } else {
1041    bctr();
1042  }
1043  _last_calls_return_pc = pc();
1044
1045  return _last_calls_return_pc;
1046}
1047
1048// Call a C function via a function descriptor and use full C calling
1049// conventions.
1050// We don't use the TOC in generated code, so there is no need to save
1051// and restore its value.
1052address MacroAssembler::call_c(Register fd) {
1053  return branch_to(fd, /*and_link=*/true,
1054                       /*save toc=*/false,
1055                       /*restore toc=*/false,
1056                       /*load toc=*/true,
1057                       /*load env=*/true);
1058}
1059
1060address MacroAssembler::call_c_and_return_to_caller(Register fd) {
1061  return branch_to(fd, /*and_link=*/false,
1062                       /*save toc=*/false,
1063                       /*restore toc=*/false,
1064                       /*load toc=*/true,
1065                       /*load env=*/true);
1066}
1067
1068address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
1069  if (rt != relocInfo::none) {
1070    // this call needs to be relocatable
1071    if (!ReoptimizeCallSequences
1072        || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1073        || fd == NULL   // support code-size estimation
1074        || !fd->is_friend_function()
1075        || fd->entry() == NULL) {
1076      // it's not a friend function as defined by class FunctionDescriptor,
1077      // so do a full call-c here.
1078      load_const(R11, (address)fd, R0);
1079
1080      bool has_env = (fd != NULL && fd->env() != NULL);
1081      return branch_to(R11, /*and_link=*/true,
1082                            /*save toc=*/false,
1083                            /*restore toc=*/false,
1084                            /*load toc=*/true,
1085                            /*load env=*/has_env);
1086    } else {
1087      // It's a friend function. Load the entry point and don't care about
1088      // toc and env. Use an optimizable call instruction, but ensure the
1089      // same code-size as in the case of a non-friend function.
1090      nop();
1091      nop();
1092      nop();
1093      bl64_patchable(fd->entry(), rt);
1094      _last_calls_return_pc = pc();
1095      return _last_calls_return_pc;
1096    }
1097  } else {
1098    // This call does not need to be relocatable, do more aggressive
1099    // optimizations.
1100    if (!ReoptimizeCallSequences
1101      || !fd->is_friend_function()) {
1102      // It's not a friend function as defined by class FunctionDescriptor,
1103      // so do a full call-c here.
1104      load_const(R11, (address)fd, R0);
1105      return branch_to(R11, /*and_link=*/true,
1106                            /*save toc=*/false,
1107                            /*restore toc=*/false,
1108                            /*load toc=*/true,
1109                            /*load env=*/true);
1110    } else {
1111      // it's a friend function, load the entry point and don't care about
1112      // toc and env.
1113      address dest = fd->entry();
1114      if (is_within_range_of_b(dest, pc())) {
1115        bl(dest);
1116      } else {
1117        bl64_patchable(dest, rt);
1118      }
1119      _last_calls_return_pc = pc();
1120      return _last_calls_return_pc;
1121    }
1122  }
1123}
1124
1125// Call a C function.  All constants needed reside in TOC.
1126//
1127// Read the address to call from the TOC.
1128// Read env from TOC, if fd specifies an env.
1129// Read new TOC from TOC.
1130address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
1131                                         relocInfo::relocType rt, Register toc) {
1132  if (!ReoptimizeCallSequences
1133    || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1134    || !fd->is_friend_function()) {
1135    // It's not a friend function as defined by class FunctionDescriptor,
1136    // so do a full call-c here.
1137    assert(fd->entry() != NULL, "function must be linked");
1138
1139    AddressLiteral fd_entry(fd->entry());
1140    bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
1141    mtctr(R11);
1142    if (fd->env() == NULL) {
1143      li(R11, 0);
1144      nop();
1145    } else {
1146      AddressLiteral fd_env(fd->env());
1147      success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
1148    }
1149    AddressLiteral fd_toc(fd->toc());
1150    // Set R2_TOC (load from toc)
1151    success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
1152    bctrl();
1153    _last_calls_return_pc = pc();
1154    if (!success) { return NULL; }
1155  } else {
1156    // It's a friend function, load the entry point and don't care about
1157    // toc and env. Use an optimizable call instruction, but ensure the
1158    // same code-size as in the case of a non-friend function.
1159    nop();
1160    bl64_patchable(fd->entry(), rt);
1161    _last_calls_return_pc = pc();
1162  }
1163  return _last_calls_return_pc;
1164}
1165#endif // ABI_ELFv2
1166
1167void MacroAssembler::call_VM_base(Register oop_result,
1168                                  Register last_java_sp,
1169                                  address  entry_point,
1170                                  bool     check_exceptions) {
1171  BLOCK_COMMENT("call_VM {");
1172  // Determine last_java_sp register.
1173  if (!last_java_sp->is_valid()) {
1174    last_java_sp = R1_SP;
1175  }
1176  set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);
1177
1178  // ARG1 must hold thread address.
1179  mr(R3_ARG1, R16_thread);
1180#if defined(ABI_ELFv2)
1181  address return_pc = call_c(entry_point, relocInfo::none);
1182#else
1183  address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
1184#endif
1185
1186  reset_last_Java_frame();
1187
1188  // Check for pending exceptions.
1189  if (check_exceptions) {
1190    // We don't check for exceptions here.
1191    ShouldNotReachHere();
1192  }
1193
1194  // Get oop result if there is one and reset the value in the thread.
1195  if (oop_result->is_valid()) {
1196    get_vm_result(oop_result);
1197  }
1198
1199  _last_calls_return_pc = return_pc;
1200  BLOCK_COMMENT("} call_VM");
1201}
1202
1203void MacroAssembler::call_VM_leaf_base(address entry_point) {
1204  BLOCK_COMMENT("call_VM_leaf {");
1205#if defined(ABI_ELFv2)
1206  call_c(entry_point, relocInfo::none);
1207#else
1208  call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
1209#endif
1210  BLOCK_COMMENT("} call_VM_leaf");
1211}
1212
1213void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
1214  call_VM_base(oop_result, noreg, entry_point, check_exceptions);
1215}
1216
1217void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
1218                             bool check_exceptions) {
1219  // R3_ARG1 is reserved for the thread.
1220  mr_if_needed(R4_ARG2, arg_1);
1221  call_VM(oop_result, entry_point, check_exceptions);
1222}
1223
1224void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
1225                             bool check_exceptions) {
1226  // R3_ARG1 is reserved for the thread
1227  mr_if_needed(R4_ARG2, arg_1);
1228  assert(arg_2 != R4_ARG2, "smashed argument");
1229  mr_if_needed(R5_ARG3, arg_2);
1230  call_VM(oop_result, entry_point, check_exceptions);
1231}
1232
1233void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
1234                             bool check_exceptions) {
1235  // R3_ARG1 is reserved for the thread
1236  mr_if_needed(R4_ARG2, arg_1);
1237  assert(arg_2 != R4_ARG2, "smashed argument");
1238  mr_if_needed(R5_ARG3, arg_2);
1239  mr_if_needed(R6_ARG4, arg_3);
1240  call_VM(oop_result, entry_point, check_exceptions);
1241}
1242
1243void MacroAssembler::call_VM_leaf(address entry_point) {
1244  call_VM_leaf_base(entry_point);
1245}
1246
1247void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
1248  mr_if_needed(R3_ARG1, arg_1);
1249  call_VM_leaf(entry_point);
1250}
1251
1252void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
1253  mr_if_needed(R3_ARG1, arg_1);
1254  assert(arg_2 != R3_ARG1, "smashed argument");
1255  mr_if_needed(R4_ARG2, arg_2);
1256  call_VM_leaf(entry_point);
1257}
1258
1259void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
1260  mr_if_needed(R3_ARG1, arg_1);
1261  assert(arg_2 != R3_ARG1, "smashed argument");
1262  mr_if_needed(R4_ARG2, arg_2);
1263  assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
1264  mr_if_needed(R5_ARG3, arg_3);
1265  call_VM_leaf(entry_point);
1266}
1267
1268// Check whether instruction is a read access to the polling page
1269// which was emitted by load_from_polling_page(..).
1270bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
1271                                               address* polling_address_ptr) {
1272  if (!is_ld(instruction))
1273    return false; // It's not a ld. Fail.
1274
1275  int rt = inv_rt_field(instruction);
1276  int ra = inv_ra_field(instruction);
1277  int ds = inv_ds_field(instruction);
1278  if (!(ds == 0 && ra != 0 && rt == 0)) {
1279    return false; // It's not a ld(r0, X, ra). Fail.
1280  }
1281
1282  if (!ucontext) {
1283    // Set polling address.
1284    if (polling_address_ptr != NULL) {
1285      *polling_address_ptr = NULL;
1286    }
1287    return true; // No ucontext given. Can't check value of ra. Assume true.
1288  }
1289
1290#ifdef LINUX
1291  // Ucontext given. Check that register ra contains the address of
1292  // the safepoing polling page.
1293  ucontext_t* uc = (ucontext_t*) ucontext;
1294  // Set polling address.
1295  address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1296  if (polling_address_ptr != NULL) {
1297    *polling_address_ptr = addr;
1298  }
1299  return os::is_poll_address(addr);
1300#else
1301  // Not on Linux, ucontext must be NULL.
1302  ShouldNotReachHere();
1303  return false;
1304#endif
1305}
1306
1307bool MacroAssembler::is_memory_serialization(int instruction, JavaThread* thread, void* ucontext) {
1308#ifdef LINUX
1309  ucontext_t* uc = (ucontext_t*) ucontext;
1310
1311  if (is_stwx(instruction) || is_stwux(instruction)) {
1312    int ra = inv_ra_field(instruction);
1313    int rb = inv_rb_field(instruction);
1314
1315    // look up content of ra and rb in ucontext
1316    address ra_val=(address)uc->uc_mcontext.regs->gpr[ra];
1317    long rb_val=(long)uc->uc_mcontext.regs->gpr[rb];
1318    return os::is_memory_serialize_page(thread, ra_val+rb_val);
1319  } else if (is_stw(instruction) || is_stwu(instruction)) {
1320    int ra = inv_ra_field(instruction);
1321    int d1 = inv_d1_field(instruction);
1322
1323    // look up content of ra in ucontext
1324    address ra_val=(address)uc->uc_mcontext.regs->gpr[ra];
1325    return os::is_memory_serialize_page(thread, ra_val+d1);
1326  } else {
1327    return false;
1328  }
1329#else
1330  // workaround not needed on !LINUX :-)
1331  ShouldNotCallThis();
1332  return false;
1333#endif
1334}
1335
1336void MacroAssembler::bang_stack_with_offset(int offset) {
1337  // When increasing the stack, the old stack pointer will be written
1338  // to the new top of stack according to the PPC64 abi.
1339  // Therefore, stack banging is not necessary when increasing
1340  // the stack by <= os::vm_page_size() bytes.
1341  // When increasing the stack by a larger amount, this method is
1342  // called repeatedly to bang the intermediate pages.
1343
1344  // Stack grows down, caller passes positive offset.
1345  assert(offset > 0, "must bang with positive offset");
1346
1347  long stdoffset = -offset;
1348
1349  if (is_simm(stdoffset, 16)) {
1350    // Signed 16 bit offset, a simple std is ok.
1351    if (UseLoadInstructionsForStackBangingPPC64) {
1352      ld(R0, (int)(signed short)stdoffset, R1_SP);
1353    } else {
1354      std(R0,(int)(signed short)stdoffset, R1_SP);
1355    }
1356  } else if (is_simm(stdoffset, 31)) {
1357    const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
1358    const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
1359
1360    Register tmp = R11;
1361    addis(tmp, R1_SP, hi);
1362    if (UseLoadInstructionsForStackBangingPPC64) {
1363      ld(R0,  lo, tmp);
1364    } else {
1365      std(R0, lo, tmp);
1366    }
1367  } else {
1368    ShouldNotReachHere();
1369  }
1370}
1371
1372// If instruction is a stack bang of the form
1373//    std    R0,    x(Ry),       (see bang_stack_with_offset())
1374//    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
1375// or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
1376// return the banged address. Otherwise, return 0.
1377address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
1378#ifdef LINUX
1379  ucontext_t* uc = (ucontext_t*) ucontext;
1380  int rs = inv_rs_field(instruction);
1381  int ra = inv_ra_field(instruction);
1382  if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
1383      || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
1384      || (is_stdu(instruction) && rs == 1)) {
1385    int ds = inv_ds_field(instruction);
1386    // return banged address
1387    return ds+(address)uc->uc_mcontext.regs->gpr[ra];
1388  } else if (is_stdux(instruction) && rs == 1) {
1389    int rb = inv_rb_field(instruction);
1390    address sp = (address)uc->uc_mcontext.regs->gpr[1];
1391    long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1392    return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
1393                                  : sp + rb_val; // banged address
1394  }
1395  return NULL; // not a stack bang
1396#else
1397  // workaround not needed on !LINUX :-)
1398  ShouldNotCallThis();
1399  return NULL;
1400#endif
1401}
1402
1403void MacroAssembler::reserved_stack_check(Register return_pc) {
1404  // Test if reserved zone needs to be enabled.
1405  Label no_reserved_zone_enabling;
1406
1407  ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
1408  cmpld(CCR0, R1_SP, R0);
1409  blt_predict_taken(CCR0, no_reserved_zone_enabling);
1410
1411  // Enable reserved zone again, throw stack overflow exception.
1412  push_frame_reg_args(0, R0);
1413  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
1414  pop_frame();
1415  mtlr(return_pc);
1416  load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
1417  mtctr(R0);
1418  bctr();
1419
1420  should_not_reach_here();
1421
1422  bind(no_reserved_zone_enabling);
1423}
1424
1425void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
1426                                bool cmpxchgx_hint) {
1427  Label retry;
1428  bind(retry);
1429  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1430  stdcx_(exchange_value, addr_base);
1431  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1432    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1433  } else {
1434    bne(                  CCR0, retry); // StXcx_ sets CCR0.
1435  }
1436}
1437
1438void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
1439                                Register tmp, bool cmpxchgx_hint) {
1440  Label retry;
1441  bind(retry);
1442  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1443  add(tmp, dest_current_value, inc_value);
1444  stdcx_(tmp, addr_base);
1445  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1446    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1447  } else {
1448    bne(                  CCR0, retry); // StXcx_ sets CCR0.
1449  }
1450}
1451
1452// Word/sub-word atomic helper functions
1453
1454// Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
1455// Only signed types are supported with size < 4.
1456// Atomic add always kills tmp1.
1457void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
1458                                                   Register addr_base, Register tmp1, Register tmp2, Register tmp3,
1459                                                   bool cmpxchgx_hint, bool is_add, int size) {
1460  // Sub-word instructions are available since Power 8.
1461  // For older processors, instruction_type != size holds, and we
1462  // emulate the sub-word instructions by constructing a 4-byte value
1463  // that leaves the other bytes unchanged.
1464  const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1465
1466  Label retry;
1467  Register shift_amount = noreg,
1468           val32 = dest_current_value,
1469           modval = is_add ? tmp1 : exchange_value;
1470
1471  if (instruction_type != size) {
1472    assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
1473    modval = tmp1;
1474    shift_amount = tmp2;
1475    val32 = tmp3;
1476    // Need some preparation: compute shift amount, align address. Note: shorts must be 2-byte aligned.
1477#ifdef VM_LITTLE_ENDIAN
1478    rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1479    clrrdi(addr_base, addr_base, 2);
1480#else
1481    xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1482    clrrdi(addr_base, addr_base, 2);
1483    rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1484#endif
1485  }
1486
1487  // atomic emulation loop
1488  bind(retry);
1489
1490  switch (instruction_type) {
1491    case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1492    case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1493    case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1494    default: ShouldNotReachHere();
1495  }
1496
1497  if (instruction_type != size) {
1498    srw(dest_current_value, val32, shift_amount);
1499  }
1500
1501  if (is_add) { add(modval, dest_current_value, exchange_value); }
1502
1503  if (instruction_type != size) {
1504    // Transform exchange value such that the replacement can be done by one xor instruction.
1505    xorr(modval, dest_current_value, is_add ? modval : exchange_value);
1506    clrldi(modval, modval, (size == 1) ? 56 : 48);
1507    slw(modval, modval, shift_amount);
1508    xorr(modval, val32, modval);
1509  }
1510
1511  switch (instruction_type) {
1512    case 4: stwcx_(modval, addr_base); break;
1513    case 2: sthcx_(modval, addr_base); break;
1514    case 1: stbcx_(modval, addr_base); break;
1515    default: ShouldNotReachHere();
1516  }
1517
1518  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1519    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1520  } else {
1521    bne(                  CCR0, retry); // StXcx_ sets CCR0.
1522  }
1523
1524  // l?arx zero-extends, but Java wants byte/short values sign-extended.
1525  if (size == 1) {
1526    extsb(dest_current_value, dest_current_value);
1527  } else if (size == 2) {
1528    extsh(dest_current_value, dest_current_value);
1529  }
1530}
1531
1532// Temps, addr_base and exchange_value are killed if size < 4 and the processor does not support the respective instructions.
1533// Only signed types are supported with size < 4.
1534void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
1535                                       Register compare_value, Register exchange_value,
1536                                       Register addr_base, Register tmp1, Register tmp2,
1537                                       Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
1538  // Sub-word instructions are available since Power 8.
1539  // For older processors, instruction_type != size holds, and we
1540  // emulate the sub-word instructions by constructing a 4-byte value
1541  // that leaves the other bytes unchanged.
1542  const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1543
1544  Register shift_amount = noreg,
1545           val32 = dest_current_value,
1546           modval = exchange_value;
1547
1548  if (instruction_type != size) {
1549    assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base);
1550    shift_amount = tmp1;
1551    val32 = tmp2;
1552    modval = tmp2;
1553    // Need some preparation: compute shift amount, align address. Note: shorts must be 2-byte aligned.
1554#ifdef VM_LITTLE_ENDIAN
1555    rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1556    clrrdi(addr_base, addr_base, 2);
1557#else
1558    xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1559    clrrdi(addr_base, addr_base, 2);
1560    rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1561#endif
1562    // Transform exchange value such that the replacement can be done by one xor instruction.
1563    xorr(exchange_value, compare_value, exchange_value);
1564    clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48);
1565    slw(exchange_value, exchange_value, shift_amount);
1566  }
1567
1568  // atomic emulation loop
1569  bind(retry);
1570
1571  switch (instruction_type) {
1572    case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1573    case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1574    case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1575    default: ShouldNotReachHere();
1576  }
1577
1578  if (instruction_type != size) {
1579    srw(dest_current_value, val32, shift_amount);
1580  }
1581  if (size == 1) {
1582    extsb(dest_current_value, dest_current_value);
1583  } else if (size == 2) {
1584    extsh(dest_current_value, dest_current_value);
1585  }
1586
1587  cmpw(flag, dest_current_value, compare_value);
1588  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1589    bne_predict_not_taken(flag, failed);
1590  } else {
1591    bne(                  flag, failed);
1592  }
1593  // branch to done  => (flag == ne), (dest_current_value != compare_value)
1594  // fall through    => (flag == eq), (dest_current_value == compare_value)
1595
1596  if (instruction_type != size) {
1597    xorr(modval, val32, exchange_value);
1598  }
1599
1600  switch (instruction_type) {
1601    case 4: stwcx_(modval, addr_base); break;
1602    case 2: sthcx_(modval, addr_base); break;
1603    case 1: stbcx_(modval, addr_base); break;
1604    default: ShouldNotReachHere();
1605  }
1606}
1607
1608// CmpxchgX sets condition register to cmpX(current, compare).
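// Generic compare-and-exchange for 1-, 2- and 4-byte values; sub-word sizes are emulated with a
// word-sized lwarx/stwcx_ loop on processors without lbarx/lharx. See the comment before cmpxchgd
// for the parameter conventions.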
1609void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
1610                                     Register compare_value, Register exchange_value,
1611                                     Register addr_base, Register tmp1, Register tmp2,
1612                                     int semantics, bool cmpxchgx_hint,
1613                                     Register int_flag_success, bool contention_hint, bool weak, int size) {
1614  Label retry;
1615  Label failed;
1616  Label done;
1617
1618  // Save one branch if result is returned via register and
1619  // result register is different from the other ones.
1620  bool use_result_reg    = (int_flag_success != noreg);
1621  bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
1622                            int_flag_success != exchange_value && int_flag_success != addr_base &&
1623                            int_flag_success != tmp1 && int_flag_success != tmp2);
1624  assert(!weak || flag == CCR0, "weak only supported with CCR0");
1625  assert(size == 1 || size == 2 || size == 4, "unsupported");
1626
1627  if (use_result_reg && preset_result_reg) {
1628    li(int_flag_success, 0); // preset (assume cas failed)
1629  }
1630
1631  // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1632  if (contention_hint) { // Don't try to reserve if cmp fails.
1633    switch (size) {
1634      case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
1635      case 2: lha(dest_current_value, 0, addr_base); break;
1636      case 4: lwz(dest_current_value, 0, addr_base); break;
1637      default: ShouldNotReachHere();
1638    }
1639    cmpw(flag, dest_current_value, compare_value);
1640    bne(flag, failed);
1641  }
1642
1643  // release/fence semantics
1644  if (semantics & MemBarRel) {
1645    release();
1646  }
1647
1648  cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2,
1649                    retry, failed, cmpxchgx_hint, size);
1650  if (!weak || use_result_reg) {
1651    if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1652      bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1653    } else {
1654      bne(                  CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1655    }
1656  }
1657  // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)
1658
1659  // Result in register (must do this at the end because int_flag_success can be the
1660  // same register as one above).
1661  if (use_result_reg) {
1662    li(int_flag_success, 1);
1663  }
1664
1665  if (semantics & MemBarFenceAfter) {
1666    fence();
1667  } else if (semantics & MemBarAcq) {
1668    isync();
1669  }
1670
1671  if (use_result_reg && !preset_result_reg) {
1672    b(done);
1673  }
1674
1675  bind(failed);
1676  if (use_result_reg && !preset_result_reg) {
1677    li(int_flag_success, 0);
1678  }
1679
1680  bind(done);
1681  // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1682  // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1683}
1684
1685// Performs an atomic compare exchange:
1686//   if (compare_value == *addr_base)
1687//     *addr_base = exchange_value
1688//     int_flag_success = 1;
1689//   else
1690//     int_flag_success = 0;
1691//
1692// ConditionRegister flag       = cmp(compare_value, *addr_base)
1693// Register dest_current_value  = *addr_base
1694// Register compare_value       Used to compare with value in memory
1695// Register exchange_value      Written to memory if compare_value == *addr_base
1696// Register addr_base           The memory location to compareXChange
1697// Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
1698//
1699// To avoid the costly compare exchange, the value is tested beforehand.
1700// Several special cases exist to avoid generating unnecessary code.
1701//
1702void MacroAssembler::cmpxchgd(ConditionRegister flag,
1703                              Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
1704                              Register addr_base, int semantics, bool cmpxchgx_hint,
1705                              Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) {
1706  Label retry;
1707  Label failed_int;
1708  Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
1709  Label done;
1710
1711  // Save one branch if result is returned via register and result register is different from the other ones.
1712  bool use_result_reg    = (int_flag_success!=noreg);
1713  bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() &&
1714                            int_flag_success!=exchange_value && int_flag_success!=addr_base);
1715  assert(!weak || flag == CCR0, "weak only supported with CCR0");
1716  assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");
1717
1718  if (use_result_reg && preset_result_reg) {
1719    li(int_flag_success, 0); // preset (assume cas failed)
1720  }
1721
1722  // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1723  if (contention_hint) { // Don't try to reserve if cmp fails.
1724    ld(dest_current_value, 0, addr_base);
1725    cmpd(flag, compare_value, dest_current_value);
1726    bne(flag, failed);
1727  }
1728
1729  // release/fence semantics
1730  if (semantics & MemBarRel) {
1731    release();
1732  }
1733
1734  // atomic emulation loop
1735  bind(retry);
1736
1737  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1738  cmpd(flag, compare_value, dest_current_value);
1739  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1740    bne_predict_not_taken(flag, failed);
1741  } else {
1742    bne(                  flag, failed);
1743  }
1744
1745  stdcx_(exchange_value, addr_base);
1746  if (!weak || use_result_reg || failed_ext) {
1747    if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1748      bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1749    } else {
1750      bne(                  CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1751    }
1752  }
1753
1754  // result in register (must do this at the end because int_flag_success can be the same register as one above)
1755  if (use_result_reg) {
1756    li(int_flag_success, 1);
1757  }
1758
1759  if (semantics & MemBarFenceAfter) {
1760    fence();
1761  } else if (semantics & MemBarAcq) {
1762    isync();
1763  }
1764
1765  if (use_result_reg && !preset_result_reg) {
1766    b(done);
1767  }
1768
1769  bind(failed_int);
1770  if (use_result_reg && !preset_result_reg) {
1771    li(int_flag_success, 0);
1772  }
1773
1774  bind(done);
1775  // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1776  // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1777}
1778
1779// Look up the method for a megamorphic invokeinterface call.
1780// The target method is determined by <intf_klass, itable_index>.
1781// The receiver klass is in recv_klass.
1782// On success, the result will be in method_result, and execution falls through.
1783// On failure, execution transfers to the given label.
1784void MacroAssembler::lookup_interface_method(Register recv_klass,
1785                                             Register intf_klass,
1786                                             RegisterOrConstant itable_index,
1787                                             Register method_result,
1788                                             Register scan_temp,
1789                                             Register sethi_temp,
1790                                             Label& L_no_such_interface) {
1791  assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1792  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
1793         "caller must use same register for non-constant itable index as for method");
1794
1795  // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1796  int vtable_base = in_bytes(Klass::vtable_start_offset());
1797  int itentry_off = itableMethodEntry::method_offset_in_bytes();
1798  int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
1799  int scan_step   = itableOffsetEntry::size() * wordSize;
1800  int log_vte_size= exact_log2(vtableEntry::size_in_bytes());
1801
1802  lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass);
1803  // %%% We should store the aligned, prescaled offset in the klassoop.
1804  // Then the next several instructions would fold away.
1805
1806  sldi(scan_temp, scan_temp, log_vte_size);
1807  addi(scan_temp, scan_temp, vtable_base);
1808  add(scan_temp, recv_klass, scan_temp);
1809
1810  // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1811  if (itable_index.is_register()) {
1812    Register itable_offset = itable_index.as_register();
1813    sldi(itable_offset, itable_offset, logMEsize);
1814    if (itentry_off) addi(itable_offset, itable_offset, itentry_off);
1815    add(recv_klass, itable_offset, recv_klass);
1816  } else {
1817    long itable_offset = (long)itable_index.as_constant();
1818    load_const_optimized(sethi_temp, (itable_offset<<logMEsize)+itentry_off); // static address, no relocation
1819    add(recv_klass, sethi_temp, recv_klass);
1820  }
1821
1822  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
1823  //   if (scan->interface() == intf) {
1824  //     result = (klass + scan->offset() + itable_index);
1825  //   }
1826  // }
1827  Label search, found_method;
1828
1829  for (int peel = 1; peel >= 0; peel--) {
1830    // %%%% Could load both offset and interface in one ldx, if they were
1831    // in the opposite order. This would save a load.
1832    ld(method_result, itableOffsetEntry::interface_offset_in_bytes(), scan_temp);
1833
1834    // Check that this entry is non-null. A null entry means that
1835    // the receiver class doesn't implement the interface, and wasn't the
1836    // same as when the caller was compiled.
1837    cmpd(CCR0, method_result, intf_klass);
1838
1839    if (peel) {
1840      beq(CCR0, found_method);
1841    } else {
1842      bne(CCR0, search);
1843      // (invert the test to fall through to found_method...)
1844    }
1845
1846    if (!peel) break;
1847
1848    bind(search);
1849
1850    cmpdi(CCR0, method_result, 0);
1851    beq(CCR0, L_no_such_interface);
1852    addi(scan_temp, scan_temp, scan_step);
1853  }
1854
1855  bind(found_method);
1856
1857  // Got a hit.
1858  int ito_offset = itableOffsetEntry::offset_offset_in_bytes();
1859  lwz(scan_temp, ito_offset, scan_temp);
1860  ldx(method_result, scan_temp, recv_klass);
1861}
1862
1863// virtual method calling
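// Computes the vtable entry address for vtable_index in recv_klass (clobbering recv_klass)
// and loads the Method* into R19_method.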
1864void MacroAssembler::lookup_virtual_method(Register recv_klass,
1865                                           RegisterOrConstant vtable_index,
1866                                           Register method_result) {
1867
1868  assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1869
1870  const int base = in_bytes(Klass::vtable_start_offset());
1871  assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1872
1873  if (vtable_index.is_register()) {
1874    sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1875    add(recv_klass, vtable_index.as_register(), recv_klass);
1876  } else {
1877    addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1878  }
1879  ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass);
1880}
1881
1882/////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1883void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1884                                                   Register super_klass,
1885                                                   Register temp1_reg,
1886                                                   Register temp2_reg,
1887                                                   Label* L_success,
1888                                                   Label* L_failure,
1889                                                   Label* L_slow_path,
1890                                                   RegisterOrConstant super_check_offset) {
1891
1892  const Register check_cache_offset = temp1_reg;
1893  const Register cached_super       = temp2_reg;
1894
1895  assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1896
1897  int sco_offset = in_bytes(Klass::super_check_offset_offset());
1898  int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());
1899
1900  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1901  bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);
1902
1903  Label L_fallthrough;
1904  int label_nulls = 0;
1905  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1906  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1907  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
1908  assert(label_nulls <= 1 ||
1909         (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
1910         "at most one NULL in the batch, usually");
1911
1912  // If the pointers are equal, we are done (e.g., String[] elements).
1913  // This self-check enables sharing of secondary supertype arrays among
1914  // non-primary types such as array-of-interface. Otherwise, each such
1915  // type would need its own customized secondary supertype array (SSA).
1916  // We move this check to the front of the fast path because many
1917  // type checks are in fact trivially successful in this manner,
1918  // so we get a nicely predicted branch right at the start of the check.
1919  cmpd(CCR0, sub_klass, super_klass);
1920  beq(CCR0, *L_success);
1921
1922  // Check the supertype display:
1923  if (must_load_sco) {
1924    // The super check offset is always positive...
1925    lwz(check_cache_offset, sco_offset, super_klass);
1926    super_check_offset = RegisterOrConstant(check_cache_offset);
1927    // super_check_offset is register.
1928    assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
1929  }
1930  // The loaded value is the offset from KlassOopDesc.
1931
1932  ld(cached_super, super_check_offset, sub_klass);
1933  cmpd(CCR0, cached_super, super_klass);
1934
1935  // This check has worked decisively for primary supers.
1936  // Secondary supers are sought in the super_cache ('super_cache_addr').
1937  // (Secondary supers are interfaces and very deeply nested subtypes.)
1938  // This works in the same check above because of a tricky aliasing
1939  // between the super_cache and the primary super display elements.
1940  // (The 'super_check_addr' can address either, as the case requires.)
1941  // Note that the cache is updated below if it does not help us find
1942  // what we need immediately.
1943  // So if it was a primary super, we can just fail immediately.
1944  // Otherwise, it's the slow path for us (no success at this point).
1945
1946#define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }
1947
1948  if (super_check_offset.is_register()) {
1949    beq(CCR0, *L_success);
1950    cmpwi(CCR0, super_check_offset.as_register(), sc_offset);
1951    if (L_failure == &L_fallthrough) {
1952      beq(CCR0, *L_slow_path);
1953    } else {
1954      bne(CCR0, *L_failure);
1955      FINAL_JUMP(*L_slow_path);
1956    }
1957  } else {
1958    if (super_check_offset.as_constant() == sc_offset) {
1959      // Need a slow path; fast failure is impossible.
1960      if (L_slow_path == &L_fallthrough) {
1961        beq(CCR0, *L_success);
1962      } else {
1963        bne(CCR0, *L_slow_path);
1964        FINAL_JUMP(*L_success);
1965      }
1966    } else {
1967      // No slow path; it's a fast decision.
1968      if (L_failure == &L_fallthrough) {
1969        beq(CCR0, *L_success);
1970      } else {
1971        bne(CCR0, *L_failure);
1972        FINAL_JUMP(*L_success);
1973      }
1974    }
1975  }
1976
1977  bind(L_fallthrough);
1978#undef FINAL_JUMP
1979}
1980
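// Scan the secondary supers array of sub_klass for super_klass. On a hit the secondary super
// cache is updated and, if provided, L_success is taken or result_reg is set to 0 (with neither,
// the function returns via blr). On a miss, result_reg (if provided) is set to 1 and execution
// falls through.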
1981void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1982                                                   Register super_klass,
1983                                                   Register temp1_reg,
1984                                                   Register temp2_reg,
1985                                                   Label* L_success,
1986                                                   Register result_reg) {
1987  const Register array_ptr = temp1_reg; // current value from cache array
1988  const Register temp      = temp2_reg;
1989
1990  assert_different_registers(sub_klass, super_klass, array_ptr, temp);
1991
1992  int source_offset = in_bytes(Klass::secondary_supers_offset());
1993  int target_offset = in_bytes(Klass::secondary_super_cache_offset());
1994
1995  int length_offset = Array<Klass*>::length_offset_in_bytes();
1996  int base_offset   = Array<Klass*>::base_offset_in_bytes();
1997
1998  Label hit, loop, failure, fallthru;
1999
2000  ld(array_ptr, source_offset, sub_klass);
2001
2002  // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
2003  lwz(temp, length_offset, array_ptr);
2004  cmpwi(CCR0, temp, 0);
2005  beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0
2006
2007  mtctr(temp); // load ctr
2008
2009  bind(loop);
2010  // Oops in the table are no longer compressed.
2011  ld(temp, base_offset, array_ptr);
2012  cmpd(CCR0, temp, super_klass);
2013  beq(CCR0, hit);
2014  addi(array_ptr, array_ptr, BytesPerWord);
2015  bdnz(loop);
2016
2017  bind(failure);
2018  if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss)
2019  b(fallthru);
2020
2021  bind(hit);
2022  std(super_klass, target_offset, sub_klass); // save result to cache
2023  if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)
2024  if (L_success != NULL) { b(*L_success); }
2025  else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided
2026
2027  bind(fallthru);
2028}
2029
2030// Try fast path, then go to slow one if not successful
2031void MacroAssembler::check_klass_subtype(Register sub_klass,
2032                         Register super_klass,
2033                         Register temp1_reg,
2034                         Register temp2_reg,
2035                         Label& L_success) {
2036  Label L_failure;
2037  check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
2038  check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
2039  bind(L_failure); // Fallthru if not successful.
2040}
2041
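// Load the type oop of the MethodHandle in mh_reg and compare it with mtype_reg;
// branches to wrong_method_type if they differ.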
2042void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg,
2043                                              Register temp_reg,
2044                                              Label& wrong_method_type) {
2045  assert_different_registers(mtype_reg, mh_reg, temp_reg);
2046  // Compare method type against that of the receiver.
2047  load_heap_oop_not_null(temp_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg), mh_reg);
2048  cmpd(CCR0, temp_reg, mtype_reg);
2049  bne(CCR0, wrong_method_type);
2050}
2051
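// Compute the byte offset of interpreter argument slot 'arg_slot' (plus extra_slot_offset slots):
// returned as a constant when arg_slot is constant, otherwise computed into temp_reg.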
2052RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
2053                                                   Register temp_reg,
2054                                                   int extra_slot_offset) {
2055  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
2056  int stackElementSize = Interpreter::stackElementSize;
2057  int offset = extra_slot_offset * stackElementSize;
2058  if (arg_slot.is_constant()) {
2059    offset += arg_slot.as_constant() * stackElementSize;
2060    return offset;
2061  } else {
2062    assert(temp_reg != noreg, "must specify");
2063    sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
2064    if (offset != 0)
2065      addi(temp_reg, temp_reg, offset);
2066    return temp_reg;
2067  }
2068}
2069
2070// Supports temp2_reg = R0.
2071void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg,
2072                                          Register mark_reg, Register temp_reg,
2073                                          Register temp2_reg, Label& done, Label* slow_case) {
2074  assert(UseBiasedLocking, "why call this otherwise?");
2075
2076#ifdef ASSERT
2077  assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg);
2078#endif
2079
2080  Label cas_label;
2081
2082  // Branch to done if fast path fails and no slow_case provided.
2083  Label *slow_case_int = (slow_case != NULL) ? slow_case : &done;
2084
2085  // Biased locking
2086  // See whether the lock is currently biased toward our thread and
2087  // whether the epoch is still valid
2088  // Note that the runtime guarantees sufficient alignment of JavaThread
2089  // pointers to allow age to be placed into low bits
2090  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits,
2091         "biased locking makes assumptions about bit layout");
2092
2093  if (PrintBiasedLockingStatistics) {
2094    load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg);
2095    lwzx(temp_reg, temp2_reg);
2096    addi(temp_reg, temp_reg, 1);
2097    stwx(temp_reg, temp2_reg);
2098  }
2099
2100  andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place);
2101  cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
2102  bne(cr_reg, cas_label);
2103
2104  load_klass(temp_reg, obj_reg);
2105
2106  load_const_optimized(temp2_reg, ~((int) markOopDesc::age_mask_in_place));
2107  ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2108  orr(temp_reg, R16_thread, temp_reg);
2109  xorr(temp_reg, mark_reg, temp_reg);
2110  andr(temp_reg, temp_reg, temp2_reg);
2111  cmpdi(cr_reg, temp_reg, 0);
2112  if (PrintBiasedLockingStatistics) {
2113    Label l;
2114    bne(cr_reg, l);
2115    load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr());
2116    lwzx(mark_reg, temp2_reg);
2117    addi(mark_reg, mark_reg, 1);
2118    stwx(mark_reg, temp2_reg);
2119    // restore mark_reg
2120    ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
2121    bind(l);
2122  }
2123  beq(cr_reg, done);
2124
2125  Label try_revoke_bias;
2126  Label try_rebias;
2127
2128  // At this point we know that the header has the bias pattern and
2129  // that we are not the bias owner in the current epoch. We need to
2130  // figure out more details about the state of the header in order to
2131  // know what operations can be legally performed on the object's
2132  // header.
2133
2134  // If the low three bits in the xor result aren't clear, that means
2135  // the prototype header is no longer biased and we have to revoke
2136  // the bias on this object.
2137  andi(temp2_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
2138  cmpwi(cr_reg, temp2_reg, 0);
2139  bne(cr_reg, try_revoke_bias);
2140
2141  // Biasing is still enabled for this data type. See whether the
2142  // epoch of the current bias is still valid, meaning that the epoch
2143  // bits of the mark word are equal to the epoch bits of the
2144  // prototype header. (Note that the prototype header's epoch bits
2145  // only change at a safepoint.) If not, attempt to rebias the object
2146  // toward the current thread. Note that we must be absolutely sure
2147  // that the current epoch is invalid in order to do this because
2148  // otherwise the manipulations it performs on the mark word are
2149  // illegal.
2150
2151  int shift_amount = 64 - markOopDesc::epoch_shift;
2152  // rotate epoch bits to right (little) end and set other bits to 0
2153  // [ big part | epoch | little part ] -> [ 0..0 | epoch ]
2154  rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markOopDesc::epoch_bits);
2155  // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented
2156  bne(CCR0, try_rebias);
2157
2158  // The epoch of the current bias is still valid but we know nothing
2159  // about the owner; it might be set or it might be clear. Try to
2160  // acquire the bias of the object using an atomic operation. If this
2161  // fails we will go in to the runtime to revoke the object's bias.
2162  // Note that we first construct the presumed unbiased header so we
2163  // don't accidentally blow away another thread's valid bias.
2164  andi(mark_reg, mark_reg, (markOopDesc::biased_lock_mask_in_place |
2165                                markOopDesc::age_mask_in_place |
2166                                markOopDesc::epoch_mask_in_place));
2167  orr(temp_reg, R16_thread, mark_reg);
2168
2169  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2170
2171  // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
2172  cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2173           /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2174           /*where=*/obj_reg,
2175           MacroAssembler::MemBarAcq,
2176           MacroAssembler::cmpxchgx_hint_acquire_lock(),
2177           noreg, slow_case_int); // bail out if failed
2178
2179  // If the biasing toward our thread failed, this means that
2180  // another thread succeeded in biasing it toward itself and we
2181  // need to revoke that bias. The revocation will occur in the
2182  // interpreter runtime in the slow case.
2183  if (PrintBiasedLockingStatistics) {
2184    load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg);
2185    lwzx(temp_reg, temp2_reg);
2186    addi(temp_reg, temp_reg, 1);
2187    stwx(temp_reg, temp2_reg);
2188  }
2189  b(done);
2190
2191  bind(try_rebias);
2192  // At this point we know the epoch has expired, meaning that the
2193  // current "bias owner", if any, is actually invalid. Under these
2194  // circumstances _only_, we are allowed to use the current header's
2195  // value as the comparison value when doing the cas to acquire the
2196  // bias in the current epoch. In other words, we allow transfer of
2197  // the bias from one thread to another directly in this situation.
2198  load_klass(temp_reg, obj_reg);
2199  andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
2200  orr(temp2_reg, R16_thread, temp2_reg);
2201  ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2202  orr(temp_reg, temp2_reg, temp_reg);
2203
2204  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2205
2206  cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2207                 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2208                 /*where=*/obj_reg,
2209                 MacroAssembler::MemBarAcq,
2210                 MacroAssembler::cmpxchgx_hint_acquire_lock(),
2211                 noreg, slow_case_int); // bail out if failed
2212
2213  // If the biasing toward our thread failed, this means that
2214  // another thread succeeded in biasing it toward itself and we
2215  // need to revoke that bias. The revocation will occur in the
2216  // interpreter runtime in the slow case.
2217  if (PrintBiasedLockingStatistics) {
2218    load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg);
2219    lwzx(temp_reg, temp2_reg);
2220    addi(temp_reg, temp_reg, 1);
2221    stwx(temp_reg, temp2_reg);
2222  }
2223  b(done);
2224
2225  bind(try_revoke_bias);
2226  // The prototype mark in the klass doesn't have the bias bit set any
2227  // more, indicating that objects of this data type are not supposed
2228  // to be biased any more. We are going to try to reset the mark of
2229  // this object to the prototype value and fall through to the
2230  // CAS-based locking scheme. Note that if our CAS fails, it means
2231  // that another thread raced us for the privilege of revoking the
2232  // bias of this particular object, so it's okay to continue in the
2233  // normal locking code.
2234  load_klass(temp_reg, obj_reg);
2235  ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2236  andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
2237  orr(temp_reg, temp_reg, temp2_reg);
2238
2239  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2240
2241  // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
2242  cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2243                 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2244                 /*where=*/obj_reg,
2245                 MacroAssembler::MemBarAcq,
2246                 MacroAssembler::cmpxchgx_hint_acquire_lock());
2247
2248  // reload markOop in mark_reg before continuing with lightweight locking
2249  ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
2250
2251  // Fall through to the normal CAS-based lock, because no matter what
2252  // the result of the above CAS, some thread must have succeeded in
2253  // removing the bias bit from the object's header.
2254  if (PrintBiasedLockingStatistics) {
2255    Label l;
2256    bne(cr_reg, l);
2257    load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg);
2258    lwzx(temp_reg, temp2_reg);
2259    addi(temp_reg, temp_reg, 1);
2260    stwx(temp_reg, temp2_reg);
2261    bind(l);
2262  }
2263
2264  bind(cas_label);
2265}
2266
2267void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) {
2268  // Check for biased locking unlock case, which is a no-op
2269  // Note: we do not have to check the thread ID for two reasons.
2270  // First, the interpreter checks for IllegalMonitorStateException at
2271  // a higher level. Second, if the bias was revoked while we held the
2272  // lock, the object could not be rebiased toward another thread, so
2273  // the bias bit would be clear.
2274
2275  ld(temp_reg, 0, mark_addr);
2276  andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
2277
2278  cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
2279  beq(cr_reg, done);
2280}
2281
2282// allocation (for C1)
2283void MacroAssembler::eden_allocate(
2284  Register obj,                      // result: pointer to object after successful allocation
2285  Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2286  int      con_size_in_bytes,        // object size in bytes if   known at compile time
2287  Register t1,                       // temp register
2288  Register t2,                       // temp register
2289  Label&   slow_case                 // continuation point if fast allocation fails
2290) {
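  // No inline eden allocation on PPC; always take the slow path.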
2291  b(slow_case);
2292}
2293
2294void MacroAssembler::tlab_allocate(
2295  Register obj,                      // result: pointer to object after successful allocation
2296  Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2297  int      con_size_in_bytes,        // object size in bytes if   known at compile time
2298  Register t1,                       // temp register
2299  Label&   slow_case                 // continuation point if fast allocation fails
2300) {
2301  // make sure arguments make sense
2302  assert_different_registers(obj, var_size_in_bytes, t1);
2303  assert(0 <= con_size_in_bytes && is_simm13(con_size_in_bytes), "illegal object size");
2304  assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
2305
2306  const Register new_top = t1;
2307  //verify_tlab(); not implemented
2308
2309  ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2310  ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
2311  if (var_size_in_bytes == noreg) {
2312    addi(new_top, obj, con_size_in_bytes);
2313  } else {
2314    add(new_top, obj, var_size_in_bytes);
2315  }
2316  cmpld(CCR0, new_top, R0);
2317  bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);
2318
2319#ifdef ASSERT
2320  // make sure new free pointer is properly aligned
2321  {
2322    Label L;
2323    andi_(R0, new_top, MinObjAlignmentInBytesMask);
2324    beq(CCR0, L);
2325    stop("updated TLAB free is not properly aligned", 0x934);
2326    bind(L);
2327  }
2328#endif // ASSERT
2329
2330  // update the tlab top pointer
2331  std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2332  //verify_tlab(); not implemented
2333}
2334void MacroAssembler::tlab_refill(Label& retry_tlab, Label& try_eden, Label& slow_case) {
2335  unimplemented("tlab_refill");
2336}
2337void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) {
2338  unimplemented("incr_allocated_bytes");
2339}
2340
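// Emit a trampoline stub that loads the call target from the constant pool (TOC offset
// 'destination_toc_offset') and branches to it via CTR. Returns the stub address, or NULL
// if the code cache is full.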
2341address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,
2342                                             int insts_call_instruction_offset, Register Rtoc) {
2343  // Start the stub.
2344  address stub = start_a_stub(64);
2345  if (stub == NULL) { return NULL; } // CodeCache full: bail out
2346
2347  // Create a trampoline stub relocation which relates this trampoline stub
2348  // with the call instruction at insts_call_instruction_offset in the
2349  // instructions code-section.
2350  relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));
2351  const int stub_start_offset = offset();
2352
2353  // For java_to_interp stubs we use R11_scratch1 as scratch register
2354  // and in call trampoline stubs we use R12_scratch2. This way we
2355  // can distinguish them (see is_NativeCallTrampolineStub_at()).
2356  Register reg_scratch = R12_scratch2;
2357
2358  // Now, create the trampoline stub's code:
2359  // - load the TOC
2360  // - load the call target from the constant pool
2361  // - call
2362  if (Rtoc == noreg) {
2363    calculate_address_from_global_toc(reg_scratch, method_toc());
2364    Rtoc = reg_scratch;
2365  }
2366
2367  ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
2368  mtctr(reg_scratch);
2369  bctr();
2370
2371  const address stub_start_addr = addr_at(stub_start_offset);
2372
2373  // Assert that the encoded destination_toc_offset can be identified and that it is correct.
2374  assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
2375         "encoded offset into the constant pool must match");
2376  // Trampoline_stub_size should be good.
2377  assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
2378  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
2379
2380  // End the stub.
2381  end_a_stub();
2382  return stub;
2383}
2384
2385// TM on PPC64.
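// Atomically add simm16 to the 64-bit value at addr; the updated value is left in result.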
2386void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) {
2387  Label retry;
2388  bind(retry);
2389  ldarx(result, addr, /*hint*/ false);
2390  addi(result, result, simm16);
2391  stdcx_(result, addr);
2392  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2393    bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2394  } else {
2395    bne(                  CCR0, retry); // stXcx_ sets CCR0
2396  }
2397}
2398
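// Atomically OR uimm16 into the 32-bit value at addr; the updated value is left in result.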
2399void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) {
2400  Label retry;
2401  bind(retry);
2402  lwarx(result, addr, /*hint*/ false);
2403  ori(result, result, uimm16);
2404  stwcx_(result, addr);
2405  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2406    bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2407  } else {
2408    bne(                  CCR0, retry); // stXcx_ sets CCR0
2409  }
2410}
2411
2412#if INCLUDE_RTM_OPT
2413
2414// Update rtm_counters based on abort status
2415// input: abort_status
2416//        rtm_counters (RTMLockingCounters*)
2417void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) {
2418  // Mapping to keep PreciseRTMLockingStatistics similar to x86.
2419  // x86 ppc (! means inverted, ? means not the same)
2420  //  0   31  Set if abort caused by XABORT instruction.
2421  //  1  ! 7  If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set.
2422  //  2   13  Set if another logical processor conflicted with a memory address that was part of the transaction that aborted.
2423  //  3   10  Set if an internal buffer overflowed.
2424  //  4  ?12  Set if a debug breakpoint was hit.
2425  //  5  ?32  Set if an abort occurred during execution of a nested transaction.
2426  const  int tm_failure_bit[] = {Assembler::tm_tabort, // Note: Seems like signal handler sets this, too.
2427                                 Assembler::tm_failure_persistent, // inverted: transient
2428                                 Assembler::tm_trans_cf,
2429                                 Assembler::tm_footprint_of,
2430                                 Assembler::tm_non_trans_cf,
2431                                 Assembler::tm_suspended};
2432  const bool tm_failure_inv[] = {false, true, false, false, false, false};
2433  assert(sizeof(tm_failure_bit)/sizeof(int) == RTMLockingCounters::ABORT_STATUS_LIMIT, "adapt mapping!");
2434
2435  const Register addr_Reg = R0;
2436  // Keep track of offset to where rtm_counters_Reg had pointed to.
2437  int counters_offs = RTMLockingCounters::abort_count_offset();
2438  addi(addr_Reg, rtm_counters_Reg, counters_offs);
2439  const Register temp_Reg = rtm_counters_Reg;
2440
2441  //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
2442  ldx(temp_Reg, addr_Reg);
2443  addi(temp_Reg, temp_Reg, 1);
2444  stdx(temp_Reg, addr_Reg);
2445
2446  if (PrintPreciseRTMLockingStatistics) {
2447    int counters_offs_delta = RTMLockingCounters::abortX_count_offset() - counters_offs;
2448
2449    //mftexasr(abort_status); done by caller
2450    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
2451      counters_offs += counters_offs_delta;
2452      li(temp_Reg, counters_offs_delta); // can't use addi with R0
2453      add(addr_Reg, addr_Reg, temp_Reg); // point to next counter
2454      counters_offs_delta = sizeof(uintx);
2455
2456      Label check_abort;
2457      rldicr_(temp_Reg, abort_status, tm_failure_bit[i], 0);
2458      if (tm_failure_inv[i]) {
2459        bne(CCR0, check_abort);
2460      } else {
2461        beq(CCR0, check_abort);
2462      }
2463      //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
2464      ldx(temp_Reg, addr_Reg);
2465      addi(temp_Reg, temp_Reg, 1);
2466      stdx(temp_Reg, addr_Reg);
2467      bind(check_abort);
2468    }
2469  }
2470  li(temp_Reg, -counters_offs); // can't use addi with R0
2471  add(rtm_counters_Reg, addr_Reg, temp_Reg); // restore
2472}
2473
2474// Branch if ((random & (count-1)) != 0); count must be a power of 2.
2475// tmp and CR0 are killed
2476void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) {
2477  mftb(tmp);
2478  andi_(tmp, tmp, count-1);
2479  bne(CCR0, brLabel);
2480}
2481
2482// Perform abort ratio calculation, set no_rtm bit if high ratio.
2483// input:  rtm_counters_Reg (RTMLockingCounters* address) - KILLED
2484void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg,
2485                                                 RTMLockingCounters* rtm_counters,
2486                                                 Metadata* method_data) {
2487  Label L_done, L_check_always_rtm1, L_check_always_rtm2;
2488
2489  if (RTMLockingCalculationDelay > 0) {
2490    // Delay calculation.
2491    ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr());
2492    cmpdi(CCR0, rtm_counters_Reg, 0);
2493    beq(CCR0, L_done);
2494    load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2495  }
2496  // Abort ratio calculation only if abort_count > RTMAbortThreshold.
2497  //   Aborted transactions = abort_count * 100
2498  //   All transactions = total_count *  RTMTotalCountIncrRate
2499  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
2500  ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg);
2501  cmpdi(CCR0, R0, RTMAbortThreshold);
2502  blt(CCR0, L_check_always_rtm2);
2503  mulli(R0, R0, 100);
2504
2505  const Register tmpReg = rtm_counters_Reg;
2506  ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2507  mulli(tmpReg, tmpReg, RTMTotalCountIncrRate);
2508  mulli(tmpReg, tmpReg, RTMAbortRatio);
2509  cmpd(CCR0, R0, tmpReg);
2510  blt(CCR0, L_check_always_rtm1); // jump to reload
2511  if (method_data != NULL) {
2512    // Set rtm_state to "no rtm" in MDO.
2513    // Not using a metadata relocation. Method and Class Loader are kept alive anyway.
2514    // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.)
2515    load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2516    atomic_ori_int(R0, tmpReg, NoRTM);
2517  }
2518  b(L_done);
2519
2520  bind(L_check_always_rtm1);
2521  load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2522  bind(L_check_always_rtm2);
2523  ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2524  cmpdi(CCR0, tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
2525  blt(CCR0, L_done);
2526  if (method_data != NULL) {
2527    // Set rtm_state to "always rtm" in MDO.
2528    // Not using a metadata relocation. See above.
2529    load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2530    atomic_ori_int(R0, tmpReg, UseRTM);
2531  }
2532  bind(L_done);
2533}
2534
2535// Update counters and perform abort ratio calculation.
2536// input: abort_status_Reg
2537void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg,
2538                                   RTMLockingCounters* rtm_counters,
2539                                   Metadata* method_data,
2540                                   bool profile_rtm) {
2541
2542  assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2543  // Update rtm counters based on state at abort.
2544  // Reads abort_status_Reg, updates flags.
2545  assert_different_registers(abort_status_Reg, temp_Reg);
2546  load_const_optimized(temp_Reg, (address)rtm_counters, R0);
2547  rtm_counters_update(abort_status_Reg, temp_Reg);
2548  if (profile_rtm) {
2549    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2550    rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data);
2551  }
2552}
2553
2554// Retry on abort if abort's status indicates non-persistent failure.
2555// inputs: retry_count_Reg
2556//       : abort_status_Reg
2557// output: retry_count_Reg decremented by 1
2558void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg,
2559                                             Label& retryLabel, Label* checkRetry) {
2560  Label doneRetry;
2561  rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0);
2562  bne(CCR0, doneRetry);
2563  if (checkRetry) { bind(*checkRetry); }
2564  addic_(retry_count_Reg, retry_count_Reg, -1);
2565  blt(CCR0, doneRetry);
2566  smt_yield(); // Can't use wait(). No permission (SIGILL).
2567  b(retryLabel);
2568  bind(doneRetry);
2569}
2570
2571// Spin and retry if lock is busy.
2572// inputs: owner_addr_Reg (monitor address)
2573//       : retry_count_Reg
2574// output: retry_count_Reg decremented by 1
2575// CTR is killed
2576void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) {
2577  Label SpinLoop, doneRetry;
2578  addic_(retry_count_Reg, retry_count_Reg, -1);
2579  blt(CCR0, doneRetry);
2580
2581  if (RTMSpinLoopCount > 1) {
2582    li(R0, RTMSpinLoopCount);
2583    mtctr(R0);
2584  }
2585
2586  bind(SpinLoop);
2587  smt_yield(); // Can't use waitrsv(). No permission (SIGILL).
2588
2589  if (RTMSpinLoopCount > 1) {
2590    bdz(retryLabel);
2591    ld(R0, 0, owner_addr_Reg);
2592    cmpdi(CCR0, R0, 0);
2593    bne(CCR0, SpinLoop);
2594  }
2595
2596  b(retryLabel);
2597
2598  bind(doneRetry);
2599}
2600
2601// Use RTM for normal stack locks.
2602// Input: obj (object to lock)
2603void MacroAssembler::rtm_stack_locking(ConditionRegister flag,
2604                                       Register obj, Register mark_word, Register tmp,
2605                                       Register retry_on_abort_count_Reg,
2606                                       RTMLockingCounters* stack_rtm_counters,
2607                                       Metadata* method_data, bool profile_rtm,
2608                                       Label& DONE_LABEL, Label& IsInflated) {
2609  assert(UseRTMForStackLocks, "why call this otherwise?");
2610  assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2611  Label L_rtm_retry, L_decrement_retry, L_on_abort;
2612
2613  if (RTMRetryCount > 0) {
2614    load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
2615    bind(L_rtm_retry);
2616  }
2617  andi_(R0, mark_word, markOopDesc::monitor_value);  // inflated vs stack-locked|neutral|biased
2618  bne(CCR0, IsInflated);
2619
2620  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2621    Label L_noincrement;
2622    if (RTMTotalCountIncrRate > 1) {
2623      branch_on_random_using_tb(tmp, (int)RTMTotalCountIncrRate, L_noincrement);
2624    }
2625    assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
2626    load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0);
2627    //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically
2628    ldx(mark_word, tmp);
2629    addi(mark_word, mark_word, 1);
2630    stdx(mark_word, tmp);
2631    bind(L_noincrement);
2632  }
2633  tbegin_();
2634  beq(CCR0, L_on_abort);
2635  ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);         // Reload in transaction, conflicts need to be tracked.
2636  andi(R0, mark_word, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2637  cmpwi(flag, R0, markOopDesc::unlocked_value);                // bits = 001 unlocked
2638  beq(flag, DONE_LABEL);                                       // all done if unlocked
2639
2640  if (UseRTMXendForLockBusy) {
2641    tend_();
2642    b(L_decrement_retry);
2643  } else {
2644    tabort_();
2645  }
2646  bind(L_on_abort);
2647  const Register abort_status_Reg = tmp;
2648  mftexasr(abort_status_Reg);
2649  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2650    rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm);
2651  }
2652  ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload
2653  if (RTMRetryCount > 0) {
2654    // Retry on lock abort if abort status is not permanent.
2655    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry);
2656  } else {
2657    bind(L_decrement_retry);
2658  }
2659}
2660
2661// Use RTM for inflating locks
2662// inputs: obj       (object to lock)
2663//         mark_word (current header - KILLED)
2664//         boxReg    (on-stack box address (displaced header location) - KILLED)
2665void MacroAssembler::rtm_inflated_locking(ConditionRegister flag,
2666                                          Register obj, Register mark_word, Register boxReg,
2667                                          Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg,
2668                                          RTMLockingCounters* rtm_counters,
2669                                          Metadata* method_data, bool profile_rtm,
2670                                          Label& DONE_LABEL) {
2671  assert(UseRTMLocking, "why call this otherwise?");
2672  Label L_rtm_retry, L_decrement_retry, L_on_abort;
2673  // Clean monitor_value bit to get valid pointer.
2674  int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value;
2675
2676  // Store non-null, using boxReg instead of (intptr_t)markOopDesc::unused_mark().
2677  std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg);
2678  const Register tmpReg = boxReg;
2679  const Register owner_addr_Reg = mark_word;
2680  addi(owner_addr_Reg, mark_word, owner_offset);
2681
2682  if (RTMRetryCount > 0) {
2683    load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy.
2684    load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort.
2685    bind(L_rtm_retry);
2686  }
2687  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2688    Label L_noincrement;
2689    if (RTMTotalCountIncrRate > 1) {
2690      branch_on_random_using_tb(R0, (int)RTMTotalCountIncrRate, L_noincrement);
2691    }
2692    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2693    load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg);
2694    //atomic_inc_ptr(R0, tmpReg); We don't increment atomically
2695    ldx(tmpReg, R0);
2696    addi(tmpReg, tmpReg, 1);
2697    stdx(tmpReg, R0);
2698    bind(L_noincrement);
2699  }
2700  tbegin_();
2701  beq(CCR0, L_on_abort);
2702  // We don't reload mark word. Will only be reset at safepoint.
2703  ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked.
2704  cmpdi(flag, R0, 0);
2705  beq(flag, DONE_LABEL);
2706
2707  if (UseRTMXendForLockBusy) {
2708    tend_();
2709    b(L_decrement_retry);
2710  } else {
2711    tabort_();
2712  }
2713  bind(L_on_abort);
2714  const Register abort_status_Reg = tmpReg;
2715  mftexasr(abort_status_Reg);
2716  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2717    rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm);
2718    // Restore owner_addr_Reg
2719    ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);
2720#ifdef ASSERT
2721    andi_(R0, mark_word, markOopDesc::monitor_value);
2722    asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint.
2723#endif
2724    addi(owner_addr_Reg, mark_word, owner_offset);
2725  }
2726  if (RTMRetryCount > 0) {
2727    // Retry on lock abort if abort status is not permanent.
2728    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
2729  }
2730
2731  // Appears unlocked - try to swing _owner from null to non-null.
2732  cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg,
2733           MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2734           MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true);
2735
2736  if (RTMRetryCount > 0) {
2737    // On success we are done; cmpxchgd above branched to L_decrement_retry on failure.
2738    b(DONE_LABEL);
2739    bind(L_decrement_retry);
2740    // Spin and retry if lock is busy.
2741    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry);
2742  } else {
2743    bind(L_decrement_retry);
2744  }
2745}
2746
2747#endif //  INCLUDE_RTM_OPT
2748
2749// "The box" is the space on the stack where we copy the object mark.
2750void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
2751                                               Register temp, Register displaced_header, Register current_header,
2752                                               bool try_bias,
2753                                               RTMLockingCounters* rtm_counters,
2754                                               RTMLockingCounters* stack_rtm_counters,
2755                                               Metadata* method_data,
2756                                               bool use_rtm, bool profile_rtm) {
2757  assert_different_registers(oop, box, temp, displaced_header, current_header);
2758  assert(flag != CCR0, "bad condition register");
2759  Label cont;
2760  Label object_has_monitor;
2761  Label cas_failed;
2762
2763  // Load markOop from object into displaced_header.
2764  ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
2765
2767  // Always do locking in runtime.
2768  if (EmitSync & 0x01) {
2769    cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2770    return;
2771  }
2772
2773  if (try_bias) {
2774    biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont);
2775  }
2776
2777#if INCLUDE_RTM_OPT
2778  if (UseRTMForStackLocks && use_rtm) {
2779    rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,
2780                      stack_rtm_counters, method_data, profile_rtm,
2781                      cont, object_has_monitor);
2782  }
2783#endif // INCLUDE_RTM_OPT
2784
2785  // Handle existing monitor.
2786  if ((EmitSync & 0x02) == 0) {
2787    // The object has an existing monitor iff (mark & monitor_value) != 0.
2788    andi_(temp, displaced_header, markOopDesc::monitor_value);
2789    bne(CCR0, object_has_monitor);
2790  }
2791
2792  // Set displaced_header to be (markOop of object | UNLOCK_VALUE).
2793  ori(displaced_header, displaced_header, markOopDesc::unlocked_value);
2794
2795  // Load Compare Value application register.
2796
2797  // Initialize the box. (Must happen before we update the object mark!)
2798  std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2799
2800  // Must fence, otherwise, preceding store(s) may float below cmpxchg.
2801  // Compare object markOop with mark and if equal exchange scratch1 with object markOop.
2802  cmpxchgd(/*flag=*/flag,
2803           /*current_value=*/current_header,
2804           /*compare_value=*/displaced_header,
2805           /*exchange_value=*/box,
2806           /*where=*/oop,
2807           MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2808           MacroAssembler::cmpxchgx_hint_acquire_lock(),
2809           noreg,
2810           &cas_failed,
2811           /*check without membar and ldarx first*/true);
2812  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2813
2814  // If the compare-and-exchange succeeded, then we found an unlocked
2815  // object and we have now locked it.
2816  b(cont);
2817
2818  bind(cas_failed);
2819  // We did not see an unlocked object so try the fast recursive case.
2820
2821  // Check if the owner is self by comparing the value in the markOop of object
2822  // (current_header) with the stack pointer.
2823  sub(current_header, current_header, R1_SP);
2824  load_const_optimized(temp, ~(os::vm_page_size()-1) | markOopDesc::lock_mask_in_place);
2825
2826  and_(R0/*==0?*/, current_header, temp);
2827  // If the condition is true the object is stack-locked by the current thread, so we
2828  // can store 0 as the displaced header in the box, which indicates a recursive lock.
2829  mcrf(flag, CCR0);
2830  std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2831
2832  // Handle existing monitor.
2833  if ((EmitSync & 0x02) == 0) {
2834    b(cont);
2835
2836    bind(object_has_monitor);
2837    // The object's monitor m is unlocked iff m->owner == NULL,
2838    // otherwise m->owner may contain a thread or a stack address.
2839
2840#if INCLUDE_RTM_OPT
2841    // Use the same RTM locking code in 32- and 64-bit VM.
2842    if (use_rtm) {
2843      rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
2844                           rtm_counters, method_data, profile_rtm, cont);
2845    } else {
2846#endif // INCLUDE_RTM_OPT
2847
2848    // Try to CAS m->owner from NULL to current thread.
2849    addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value);
2850    cmpxchgd(/*flag=*/flag,
2851             /*current_value=*/current_header,
2852             /*compare_value=*/(intptr_t)0,
2853             /*exchange_value=*/R16_thread,
2854             /*where=*/temp,
2855             MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2856             MacroAssembler::cmpxchgx_hint_acquire_lock());
2857
2858    // Store a non-null value into the box.
2859    std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2860
2861#   ifdef ASSERT
2862    bne(flag, cont);
2863    // We have acquired the monitor, check some invariants.
2864    addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes());
2865    // Invariant 1: _recursions should be 0.
2866    //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size");
2867    asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp,
2868                            "monitor->_recursions should be 0", -1);
2869    // Invariant 2: OwnerIsThread shouldn't be 0.
2870    //assert(ObjectMonitor::OwnerIsThread_size_in_bytes() == 4, "unexpected size");
2871    //asm_assert_mem4_isnot_zero(ObjectMonitor::OwnerIsThread_offset_in_bytes(), temp,
2872    //                           "monitor->OwnerIsThread shouldn't be 0", -1);
2873#   endif
2874
2875#if INCLUDE_RTM_OPT
2876    } // use_rtm()
2877#endif
2878  }
2879
2880  bind(cont);
2881  // flag == EQ indicates success
2882  // flag == NE indicates failure
2883}
2884
2885void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2886                                                 Register temp, Register displaced_header, Register current_header,
2887                                                 bool try_bias, bool use_rtm) {
2888  assert_different_registers(oop, box, temp, displaced_header, current_header);
2889  assert(flag != CCR0, "bad condition register");
2890  Label cont;
2891  Label object_has_monitor;
2892
2893  // Always do locking in runtime.
2894  if (EmitSync & 0x01) {
2895    cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2896    return;
2897  }
2898
2899  if (try_bias) {
2900    biased_locking_exit(flag, oop, current_header, cont);
2901  }
2902
2903#if INCLUDE_RTM_OPT
2904  if (UseRTMForStackLocks && use_rtm) {
2905    assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2906    Label L_regular_unlock;
2907    ld(current_header, oopDesc::mark_offset_in_bytes(), oop);         // fetch markword
2908    andi(R0, current_header, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2909    cmpwi(flag, R0, markOopDesc::unlocked_value);                     // bits = 001 unlocked
2910    bne(flag, L_regular_unlock);                                      // else RegularLock
2911    tend_();                                                          // otherwise end...
2912    b(cont);                                                          // ... and we're done
2913    bind(L_regular_unlock);
2914  }
2915#endif
2916
2917  // Find the lock address and load the displaced header from the stack.
2918  ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2919
2920  // If the displaced header is 0, we have a recursive unlock.
2921  cmpdi(flag, displaced_header, 0);
2922  beq(flag, cont);
2923
2924  // Handle existing monitor.
2925  if ((EmitSync & 0x02) == 0) {
2926    // The object has an existing monitor iff (mark & monitor_value) != 0.
2927    RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
2928    ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2929    andi_(R0, current_header, markOopDesc::monitor_value);
2930    bne(CCR0, object_has_monitor);
2931  }
2932
2933  // Check if it is still a lightweight lock. This is true if we see
2934  // the stack address of the basicLock in the markOop of the object.
2935  // Cmpxchg sets flag to cmpd(current_header, box).
2936  cmpxchgd(/*flag=*/flag,
2937           /*current_value=*/current_header,
2938           /*compare_value=*/box,
2939           /*exchange_value=*/displaced_header,
2940           /*where=*/oop,
2941           MacroAssembler::MemBarRel,
2942           MacroAssembler::cmpxchgx_hint_release_lock(),
2943           noreg,
2944           &cont);
2945
2946  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2947
2948  // Handle existing monitor.
2949  if ((EmitSync & 0x02) == 0) {
2950    b(cont);
2951
2952    bind(object_has_monitor);
2953    addi(current_header, current_header, -markOopDesc::monitor_value); // monitor
2954    ld(temp,             ObjectMonitor::owner_offset_in_bytes(), current_header);
2955
2956    // It's inflated.
2957#if INCLUDE_RTM_OPT
2958    if (use_rtm) {
2959      Label L_regular_inflated_unlock;
2960      // Clean monitor_value bit to get valid pointer
2961      cmpdi(flag, temp, 0);
2962      bne(flag, L_regular_inflated_unlock);
2963      tend_();
2964      b(cont);
2965      bind(L_regular_inflated_unlock);
2966    }
2967#endif
2968
2969    ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
2970    xorr(temp, R16_thread, temp);      // Will be 0 if we are the owner.
2971    orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions.
2972    cmpdi(flag, temp, 0);
2973    bne(flag, cont);
2974
2975    ld(temp,             ObjectMonitor::EntryList_offset_in_bytes(), current_header);
2976    ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);
2977    orr(temp, temp, displaced_header); // Will be 0 if both are 0.
2978    cmpdi(flag, temp, 0);
2979    bne(flag, cont);
2980    release();
2981    std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);
2982  }
2983
2984  bind(cont);
2985  // flag == EQ indicates success
2986  // flag == NE indicates failure
2987}
2988
2989// Write serialization page so VM thread can do a pseudo remote membar.
2990// We use the current thread pointer to calculate a thread-specific
2991// offset to write to within the page. This minimizes bus traffic
2992// due to cache line collisions.
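// Illustrative C-level sketch of the store emitted below (using the os:: helpers
// referenced in the code):
//   uintptr_t off = ((uintptr_t)thread >> os::get_serialize_page_shift_count())
//                   & (os::vm_page_size() - sizeof(int));
//   release();
//   *(volatile int*)(os::get_memory_serialize_page() + off) = <R0>;  // stored value is irrelevant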
2993void MacroAssembler::serialize_memory(Register thread, Register tmp1, Register tmp2) {
2994  srdi(tmp2, thread, os::get_serialize_page_shift_count());
2995
2996  int mask = os::vm_page_size() - sizeof(int);
2997  if (Assembler::is_simm(mask, 16)) {
2998    andi(tmp2, tmp2, mask);
2999  } else {
3000    lis(tmp1, (int)((signed short) (mask >> 16)));
3001    ori(tmp1, tmp1, mask & 0x0000ffff);
3002    andr(tmp2, tmp2, tmp1);
3003  }
3004
3005  load_const(tmp1, (long) os::get_memory_serialize_page());
3006  release();
3007  stwx(R0, tmp1, tmp2);
3008}
3009
3010
3011// GC barrier helper macros
3012
3013// Write the card table byte if needed.
3014void MacroAssembler::card_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp) {
3015  CardTableModRefBS* bs =
3016    barrier_set_cast<CardTableModRefBS>(Universe::heap()->barrier_set());
3017  assert(bs->kind() == BarrierSet::CardTableForRS ||
3018         bs->kind() == BarrierSet::CardTableExtension, "wrong barrier");
3019#ifdef ASSERT
3020  cmpdi(CCR0, Rnew_val, 0);
3021  asm_assert_ne("null oop not allowed", 0x321);
3022#endif
3023  card_table_write(bs->byte_map_base, Rtmp, Rstore_addr);
3024}
3025
3026// Write the card table byte.
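// Illustrative C sketch of what is emitted below:
//   if (UseConcMarkSweepGC) StoreStore_barrier();
//   byte_map_base[(uintptr_t)obj >> CardTableModRefBS::card_shift] = 0;   // 0 == dirty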
3027void MacroAssembler::card_table_write(jbyte* byte_map_base, Register Rtmp, Register Robj) {
3028  assert_different_registers(Robj, Rtmp, R0);
3029  load_const_optimized(Rtmp, (address)byte_map_base, R0);
3030  srdi(Robj, Robj, CardTableModRefBS::card_shift);
3031  li(R0, 0); // dirty
3032  if (UseConcMarkSweepGC) membar(Assembler::StoreStore);
3033  stbx(R0, Rtmp, Robj);
3034}
3035
3036// Kills R31 if value is a volatile register.
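// Illustrative sketch of the emitted logic:
//   if (handle == NULL) return NULL;
//   obj = *(oop*)(handle & ~weak_tag_mask);        // untag and resolve
//   if (UseG1GC && (handle & weak_tag_mask) != 0)
//     g1_write_barrier_pre(pre_val = obj);         // SATB pre-barrier on the resolved value
//   return obj;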
3037void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, bool needs_frame) {
3038  Label done;
3039  cmpdi(CCR0, value, 0);
3040  beq(CCR0, done);         // Use NULL as-is.
3041
3042  clrrdi(tmp1, value, JNIHandles::weak_tag_size);
3043#if INCLUDE_ALL_GCS
3044  if (UseG1GC) { andi_(tmp2, value, JNIHandles::weak_tag_mask); }
3045#endif
3046  ld(value, 0, tmp1);      // Resolve (untagged) jobject.
3047
3048#if INCLUDE_ALL_GCS
3049  if (UseG1GC) {
3050    Label not_weak;
3051    beq(CCR0, not_weak);   // Test for jweak tag.
3052    verify_oop(value);
3053    g1_write_barrier_pre(noreg, // obj
3054                         noreg, // offset
3055                         value, // pre_val
3056                         tmp1, tmp2, needs_frame);
3057    bind(not_weak);
3058  }
3059#endif // INCLUDE_ALL_GCS
3060  verify_oop(value);
3061  bind(done);
3062}
3063
3064#if INCLUDE_ALL_GCS
3065// General G1 pre-barrier generator.
3066// Goal: record the previous value if it is not null.
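// Pseudocode of the emitted barrier (illustrative sketch):
//   if (!thread->satb_mark_queue().is_active()) goto filtered;
//   if (Robj != noreg) pre_val = *(Robj + offset);            // load previous value
//   if (pre_val == NULL) goto filtered;
//   if (queue.index != 0) {
//     queue.index -= wordSize; queue.buf[queue.index] = pre_val;   // enqueue locally
//   } else {
//     SharedRuntime::g1_wb_pre(pre_val, thread);                   // runtime slow path
//   }
//   filtered: ;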
3067void MacroAssembler::g1_write_barrier_pre(Register Robj, RegisterOrConstant offset, Register Rpre_val,
3068                                          Register Rtmp1, Register Rtmp2, bool needs_frame) {
3069  Label runtime, filtered;
3070
3071  // Is marking active?
3072  if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) {
3073    lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread);
3074  } else {
3075    guarantee(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption");
3076    lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread);
3077  }
3078  cmpdi(CCR0, Rtmp1, 0);
3079  beq(CCR0, filtered);
3080
3081  // Do we need to load the previous value?
3082  if (Robj != noreg) {
3083    // Load the previous value...
3084    if (UseCompressedOops) {
3085      lwz(Rpre_val, offset, Robj);
3086    } else {
3087      ld(Rpre_val, offset, Robj);
3088    }
3089    // Previous value has been loaded into Rpre_val.
3090  }
3091  assert(Rpre_val != noreg, "must have a real register");
3092
3093  // Is the previous value null?
3094  cmpdi(CCR0, Rpre_val, 0);
3095  beq(CCR0, filtered);
3096
3097  if (Robj != noreg && UseCompressedOops) {
3098    decode_heap_oop_not_null(Rpre_val);
3099  }
3100
3101  // OK, it's not filtered, so we'll need to record the previous value. Try the
3102  // thread-local SATB buffer first and call into the runtime (g1_wb_pre) only if
3103  // the buffer is full. If pre_val was preloaded into a volatile register, it is
3104  // preserved in R31 across the runtime call.
3105
3106  // Can we store original value in the thread's buffer?
3107  // Is index == 0?
3108  // (The index field is typed as size_t.)
3109  const Register Rbuffer = Rtmp1, Rindex = Rtmp2;
3110
3111  ld(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread);
3112  cmpdi(CCR0, Rindex, 0);
3113  beq(CCR0, runtime); // If index == 0, goto runtime.
3114  ld(Rbuffer, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_buf()), R16_thread);
3115
3116  addi(Rindex, Rindex, -wordSize); // Decrement index.
3117  std(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread);
3118
3119  // Record the previous value.
3120  stdx(Rpre_val, Rbuffer, Rindex);
3121  b(filtered);
3122
3123  bind(runtime);
3124
3125  // May need to preserve LR. Also needed if current frame is not compatible with C calling convention.
3126  if (needs_frame) {
3127    save_LR_CR(Rtmp1);
3128    push_frame_reg_args(0, Rtmp2);
3129  }
3130
3131  if (Rpre_val->is_volatile() && Robj == noreg) mr(R31, Rpre_val); // Save pre_val across C call if it was preloaded.
3132  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), Rpre_val, R16_thread);
3133  if (Rpre_val->is_volatile() && Robj == noreg) mr(Rpre_val, R31); // restore
3134
3135  if (needs_frame) {
3136    pop_frame();
3137    restore_LR_CR(Rtmp1);
3138  }
3139
3140  bind(filtered);
3141}
3142
3143// General G1 post-barrier generator
3144// Store cross-region card.
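// Pseudocode of the emitted barrier (illustrative sketch):
//   if (((store_addr ^ new_val) >> LogOfHRGrainBytes) == 0) goto filtered;  // same region
//   card = byte_map_base + (store_addr >> card_shift);
//   if (*card == g1_young_card_val) goto filtered;
//   StoreLoad; if (*card == dirty_card_val) goto filtered;
//   *card = dirty_card_val;
//   enqueue the card address in the thread's dirty card queue,
//   or call SharedRuntime::g1_wb_post(card, thread) if the queue is full.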
3145void MacroAssembler::g1_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp1, Register Rtmp2, Register Rtmp3, Label *filtered_ext) {
3146  Label runtime, filtered_int;
3147  Label& filtered = (filtered_ext != NULL) ? *filtered_ext : filtered_int;
3148  assert_different_registers(Rstore_addr, Rnew_val, Rtmp1, Rtmp2);
3149
3150  G1SATBCardTableLoggingModRefBS* bs =
3151    barrier_set_cast<G1SATBCardTableLoggingModRefBS>(Universe::heap()->barrier_set());
3152
3153  // Does store cross heap regions?
3154  if (G1RSBarrierRegionFilter) {
3155    xorr(Rtmp1, Rstore_addr, Rnew_val);
3156    srdi_(Rtmp1, Rtmp1, HeapRegion::LogOfHRGrainBytes);
3157    beq(CCR0, filtered);
3158  }
3159
3160  // Crosses regions, storing NULL?
3161#ifdef ASSERT
3162  cmpdi(CCR0, Rnew_val, 0);
3163  asm_assert_ne("null oop not allowed (G1)", 0x322); // Checked by caller on PPC64, so following branch is obsolete:
3164  //beq(CCR0, filtered);
3165#endif
3166
3167  // Storing region crossing non-NULL, is card already dirty?
3168  assert(sizeof(*bs->byte_map_base) == sizeof(jbyte), "adjust this code");
3169  const Register Rcard_addr = Rtmp1;
3170  Register Rbase = Rtmp2;
3171  load_const_optimized(Rbase, (address)bs->byte_map_base, /*temp*/ Rtmp3);
3172
3173  srdi(Rcard_addr, Rstore_addr, CardTableModRefBS::card_shift);
3174
3175  // Get the address of the card.
3176  lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);
3177  cmpwi(CCR0, Rtmp3, (int)G1SATBCardTableModRefBS::g1_young_card_val());
3178  beq(CCR0, filtered);
3179
3180  membar(Assembler::StoreLoad);
3181  lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);  // Reload after membar.
3182  cmpwi(CCR0, Rtmp3 /* card value */, CardTableModRefBS::dirty_card_val());
3183  beq(CCR0, filtered);
3184
3185  // Storing a region crossing, non-NULL oop, card is clean.
3186  // Dirty card and log.
3187  li(Rtmp3, CardTableModRefBS::dirty_card_val());
3188  //release(); // G1: oops are allowed to get visible after dirty marking.
3189  stbx(Rtmp3, Rbase, Rcard_addr);
3190
3191  add(Rcard_addr, Rbase, Rcard_addr); // This is the address which needs to get enqueued.
3192  Rbase = noreg; // end of lifetime
3193
3194  const Register Rqueue_index = Rtmp2,
3195                 Rqueue_buf   = Rtmp3;
3196  ld(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread);
3197  cmpdi(CCR0, Rqueue_index, 0);
3198  beq(CCR0, runtime); // index == 0 then jump to runtime
3199  ld(Rqueue_buf, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_buf()), R16_thread);
3200
3201  addi(Rqueue_index, Rqueue_index, -wordSize); // decrement index
3202  std(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread);
3203
3204  stdx(Rcard_addr, Rqueue_buf, Rqueue_index); // store card
3205  b(filtered);
3206
3207  bind(runtime);
3208
3209  // Save the live input values.
3210  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), Rcard_addr, R16_thread);
3211
3212  bind(filtered_int);
3213}
3214#endif // INCLUDE_ALL_GCS
3215
3216// Values for last_Java_pc and last_Java_sp must comply with the rules
3217// in frame_ppc.hpp.
3218void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
3219  // Always set last_Java_pc and flags first because once last_Java_sp
3220  // is visible, has_last_Java_frame is true and users will look at the
3221  // rest of the fields. (Note: flags should always be zero before we
3222  // get here, so it doesn't need to be set.)
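  // Publication order (illustrative sketch):
  //   thread->last_Java_pc = pc;   // optional, may stay NULL
  //   thread->last_Java_sp = sp;   // must be stored last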
3223
3224  // Verify that last_Java_pc was zeroed on return to Java
3225  asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
3226                          "last_Java_pc not zeroed before leaving Java", 0x200);
3227
3228  // When returning from calling out from Java mode the frame anchor's
3229  // last_Java_pc will always be set to NULL. It is set here so that
3230  // if we are doing a call to native (not VM) we capture the
3231  // known pc and don't have to rely on the native call having a
3232  // standard frame linkage where we can find the pc.
3233  if (last_Java_pc != noreg)
3234    std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3235
3236  // Set last_Java_sp last.
3237  std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3238}
3239
3240void MacroAssembler::reset_last_Java_frame(void) {
3241  asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
3242                             R16_thread, "SP was not set, still zero", 0x202);
3243
3244  BLOCK_COMMENT("reset_last_Java_frame {");
3245  li(R0, 0);
3246
3247  // _last_Java_sp = 0
3248  std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3249
3250  // _last_Java_pc = 0
3251  std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3252  BLOCK_COMMENT("} reset_last_Java_frame");
3253}
3254
3255void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
3256  assert_different_registers(sp, tmp1);
3257
3258  // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
3259  // TOP_IJAVA_FRAME_ABI.
3260  // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
3261  address entry = pc();
3262  load_const_optimized(tmp1, entry);
3263
3264  set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
3265}
3266
3267void MacroAssembler::get_vm_result(Register oop_result) {
3268  // Read:
3269  //   R16_thread
3270  //   R16_thread->in_bytes(JavaThread::vm_result_offset())
3271  //
3272  // Updated:
3273  //   oop_result
3274  //   R16_thread->in_bytes(JavaThread::vm_result_offset())
3275
3276  verify_thread();
3277
3278  ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3279  li(R0, 0);
3280  std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3281
3282  verify_oop(oop_result);
3283}
3284
3285void MacroAssembler::get_vm_result_2(Register metadata_result) {
3286  // Read:
3287  //   R16_thread
3288  //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3289  //
3290  // Updated:
3291  //   metadata_result
3292  //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3293
3294  ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3295  li(R0, 0);
3296  std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3297}
3298
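// Compressed class pointer encoding (illustrative sketch; decode_klass_not_null below is the inverse):
//   narrow_klass = (klass - Universe::narrow_klass_base()) >> Universe::narrow_klass_shift();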
3299Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3300  Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
3301  if (Universe::narrow_klass_base() != 0) {
3302    // Use dst as temp if it is free.
3303    sub_const_optimized(dst, current, Universe::narrow_klass_base(), R0);
3304    current = dst;
3305  }
3306  if (Universe::narrow_klass_shift() != 0) {
3307    srdi(dst, current, Universe::narrow_klass_shift());
3308    current = dst;
3309  }
3310  return current;
3311}
3312
3313void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
3314  if (UseCompressedClassPointers) {
3315    Register compressedKlass = encode_klass_not_null(ck, klass);
3316    stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
3317  } else {
3318    std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
3319  }
3320}
3321
3322void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
3323  if (UseCompressedClassPointers) {
3324    if (val == noreg) {
3325      val = R0;
3326      li(val, 0);
3327    }
3328    stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
3329  }
3330}
3331
3332int MacroAssembler::instr_size_for_decode_klass_not_null() {
3333  if (!UseCompressedClassPointers) return 0;
3334  int num_instrs = 1;  // shift or move
3335  if (Universe::narrow_klass_base() != 0) num_instrs = 7;  // shift + load const + add
3336  return num_instrs * BytesPerInstWord;
3337}
3338
3339void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3340  assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
3341  if (src == noreg) src = dst;
3342  Register shifted_src = src;
3343  if (Universe::narrow_klass_shift() != 0 ||
3344      Universe::narrow_klass_base() == 0 && src != dst) {  // Move required.
3345    shifted_src = dst;
3346    sldi(shifted_src, src, Universe::narrow_klass_shift());
3347  }
3348  if (Universe::narrow_klass_base() != 0) {
3349    add_const_optimized(dst, shifted_src, Universe::narrow_klass_base(), R0);
3350  }
3351}
3352
3353void MacroAssembler::load_klass(Register dst, Register src) {
3354  if (UseCompressedClassPointers) {
3355    lwz(dst, oopDesc::klass_offset_in_bytes(), src);
3356    // Attention: no null check here!
3357    decode_klass_not_null(dst, dst);
3358  } else {
3359    ld(dst, oopDesc::klass_offset_in_bytes(), src);
3360  }
3361}
3362
3363void MacroAssembler::load_mirror_from_const_method(Register mirror, Register const_method) {
3364  ld(mirror, in_bytes(ConstMethod::constants_offset()), const_method);
3365  ld(mirror, ConstantPool::pool_holder_offset_in_bytes(), mirror);
3366  ld(mirror, in_bytes(Klass::java_mirror_offset()), mirror);
3367}
3368
3369// Clear Array
3370// For very short arrays. tmp == R0 is allowed.
3371void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) {
3372  if (cnt_dwords > 0) { li(tmp, 0); }
3373  for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); }
3374}
3375
3376// Version for constant short array length. Kills base_ptr. tmp == R0 is allowed.
3377void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) {
3378  if (cnt_dwords < 8) {
3379    clear_memory_unrolled(base_ptr, cnt_dwords, tmp);
3380    return;
3381  }
3382
3383  Label loop;
3384  const long loopcnt   = cnt_dwords >> 1,
3385             remainder = cnt_dwords & 1;
3386
3387  li(tmp, loopcnt);
3388  mtctr(tmp);
3389  li(tmp, 0);
3390  bind(loop);
3391    std(tmp, 0, base_ptr);
3392    std(tmp, 8, base_ptr);
3393    addi(base_ptr, base_ptr, 16);
3394    bdnz(loop);
3395  if (remainder) { std(tmp, 0, base_ptr); }
3396}
3397
3398// Kills both input registers. tmp == R0 is allowed.
3399void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) {
3400  // Procedure for large arrays (uses data cache block zero instruction).
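  // Rough structure (illustrative sketch):
  //   while (base_ptr not cache-line aligned) { *base_ptr++ = 0; }                   // startloop
  //   for (n = cnt_dwords / cl_dwords; n > 0; n--) { dcbz(base_ptr); base_ptr += cl_size; }  // fastloop
  //   while (remaining dwords) { *base_ptr++ = 0; }                                  // small_rest/restloop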
3401    Label startloop, fast, fastloop, small_rest, restloop, done;
3402    const int cl_size         = VM_Version::L1_data_cache_line_size(),
3403              cl_dwords       = cl_size >> 3,
3404              cl_dw_addr_bits = exact_log2(cl_dwords),
3405              dcbz_min        = 1,  // Min count of dcbz executions, needs to be >0.
3406              min_cnt         = ((dcbz_min + 1) << cl_dw_addr_bits) - 1;
3407
3408  if (const_cnt >= 0) {
3409    // Constant case.
3410    if (const_cnt < min_cnt) {
3411      clear_memory_constlen(base_ptr, const_cnt, tmp);
3412      return;
3413    }
3414    load_const_optimized(cnt_dwords, const_cnt, tmp);
3415  } else {
3416    // cnt_dwords already loaded in register. Need to check size.
3417    cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included).
3418    blt(CCR1, small_rest);
3419  }
3420    rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
3421    beq(CCR0, fast);                                  // Already 128byte aligned.
3422
3423    subfic(tmp, tmp, cl_dwords);
3424    mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
3425    subf(cnt_dwords, tmp, cnt_dwords); // rest.
3426    li(tmp, 0);
3427
3428  bind(startloop);                     // Clear at the beginning to reach 128byte boundary.
3429    std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3430    addi(base_ptr, base_ptr, 8);
3431    bdnz(startloop);
3432
3433  bind(fast);                                  // Clear 128byte blocks.
3434    srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128byte loop (>0).
3435    andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
3436    mtctr(tmp);                                // Load counter.
3437
3438  bind(fastloop);
3439    dcbz(base_ptr);                    // Clear 128byte aligned block.
3440    addi(base_ptr, base_ptr, cl_size);
3441    bdnz(fastloop);
3442
3443  bind(small_rest);
3444    cmpdi(CCR0, cnt_dwords, 0);        // size 0?
3445    beq(CCR0, done);                   // rest == 0
3446    li(tmp, 0);
3447    mtctr(cnt_dwords);                 // Load counter.
3448
3449  bind(restloop);                      // Clear rest.
3450    std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3451    addi(base_ptr, base_ptr, 8);
3452    bdnz(restloop);
3453
3454  bind(done);
3455}
3456
3457/////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3458
3459#ifdef COMPILER2
3460// Intrinsics for CompactStrings
3461
3462// Compress char[] to byte[] by compressing 16 bytes at once.
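// Illustrative sketch of the fast loop (register layout commented as big-endian below):
//   for (i = 0; i < cnt / 8; i++) {                 // 8 chars == 16 bytes per iteration
//     if (any of src[0..7] > 0xFF) goto Lfailure;   // not latin1
//     for (j = 0; j < 8; j++) dst[j] = (jbyte)src[j];
//     src += 8; dst += 8;
//   }
//   // the remaining cnt % 8 chars are left for the slow path (see Lslow below)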
3463void MacroAssembler::string_compress_16(Register src, Register dst, Register cnt,
3464                                        Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5,
3465                                        Label& Lfailure) {
3466
3467  const Register tmp0 = R0;
3468  assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5);
3469  Label Lloop, Lslow;
3470
3471  // Check if cnt >= 8 (= 16 bytes)
3472  lis(tmp1, 0xFF);                // tmp1 = 0x00FF00FF00FF00FF
3473  srwi_(tmp2, cnt, 3);
3474  beq(CCR0, Lslow);
3475  ori(tmp1, tmp1, 0xFF);
3476  rldimi(tmp1, tmp1, 32, 0);
3477  mtctr(tmp2);
3478
3479  // 2x unrolled loop
3480  bind(Lloop);
3481  ld(tmp2, 0, src);               // _0_1_2_3 (Big Endian)
3482  ld(tmp4, 8, src);               // _4_5_6_7
3483
3484  orr(tmp0, tmp2, tmp4);
3485  rldicl(tmp3, tmp2, 6*8, 64-24); // _____1_2
3486  rldimi(tmp2, tmp2, 2*8, 2*8);   // _0_2_3_3
3487  rldicl(tmp5, tmp4, 6*8, 64-24); // _____5_6
3488  rldimi(tmp4, tmp4, 2*8, 2*8);   // _4_6_7_7
3489
3490  andc_(tmp0, tmp0, tmp1);
3491  bne(CCR0, Lfailure);            // Not latin1.
3492  addi(src, src, 16);
3493
3494  rlwimi(tmp3, tmp2, 0*8, 24, 31);// _____1_3
3495  srdi(tmp2, tmp2, 3*8);          // ____0_2_
3496  rlwimi(tmp5, tmp4, 0*8, 24, 31);// _____5_7
3497  srdi(tmp4, tmp4, 3*8);          // ____4_6_
3498
3499  orr(tmp2, tmp2, tmp3);          // ____0123
3500  orr(tmp4, tmp4, tmp5);          // ____4567
3501
3502  stw(tmp2, 0, dst);
3503  stw(tmp4, 4, dst);
3504  addi(dst, dst, 8);
3505  bdnz(Lloop);
3506
3507  bind(Lslow);                    // Fallback to slow version
3508}
3509
3510// Compress char[] to byte[]. cnt must be positive int.
3511void MacroAssembler::string_compress(Register src, Register dst, Register cnt, Register tmp, Label& Lfailure) {
3512  Label Lloop;
3513  mtctr(cnt);
3514
3515  bind(Lloop);
3516  lhz(tmp, 0, src);
3517  cmplwi(CCR0, tmp, 0xff);
3518  bgt(CCR0, Lfailure);            // Not latin1.
3519  addi(src, src, 2);
3520  stb(tmp, 0, dst);
3521  addi(dst, dst, 1);
3522  bdnz(Lloop);
3523}
3524
3525// Inflate byte[] to char[] by inflating 16 bytes at once.
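// Illustrative sketch of the fast loop (inverse of string_compress_16 above):
//   for (i = 0; i < cnt / 8; i++) {                 // 8 bytes -> 8 chars per iteration
//     for (j = 0; j < 8; j++) dst[j] = (jchar)(src[j] & 0xFF);
//     src += 8; dst += 8;                           // dst advances 16 bytes
//   }
//   // the remaining bytes are left for the slow path (see Lslow below)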
3526void MacroAssembler::string_inflate_16(Register src, Register dst, Register cnt,
3527                                       Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
3528  const Register tmp0 = R0;
3529  assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5);
3530  Label Lloop, Lslow;
3531
3532  // Check if cnt >= 8
3533  srwi_(tmp2, cnt, 3);
3534  beq(CCR0, Lslow);
3535  lis(tmp1, 0xFF);                // tmp1 = 0x00FF00FF
3536  ori(tmp1, tmp1, 0xFF);
3537  mtctr(tmp2);
3538
3539  // 2x unrolled loop
3540  bind(Lloop);
3541  lwz(tmp2, 0, src);              // ____0123 (Big Endian)
3542  lwz(tmp4, 4, src);              // ____4567
3543  addi(src, src, 8);
3544
3545  rldicl(tmp3, tmp2, 7*8, 64-8);  // _______2
3546  rlwimi(tmp2, tmp2, 3*8, 16, 23);// ____0113
3547  rldicl(tmp5, tmp4, 7*8, 64-8);  // _______6
3548  rlwimi(tmp4, tmp4, 3*8, 16, 23);// ____4557
3549
3550  andc(tmp0, tmp2, tmp1);         // ____0_1_
3551  rlwimi(tmp2, tmp3, 2*8, 0, 23); // _____2_3
3552  andc(tmp3, tmp4, tmp1);         // ____4_5_
3553  rlwimi(tmp4, tmp5, 2*8, 0, 23); // _____6_7
3554
3555  rldimi(tmp2, tmp0, 3*8, 0*8);   // _0_1_2_3
3556  rldimi(tmp4, tmp3, 3*8, 0*8);   // _4_5_6_7
3557
3558  std(tmp2, 0, dst);
3559  std(tmp4, 8, dst);
3560  addi(dst, dst, 16);
3561  bdnz(Lloop);
3562
3563  bind(Lslow);                    // Fallback to slow version
3564}
3565
3566// Inflate byte[] to char[]. cnt must be positive int.
3567void MacroAssembler::string_inflate(Register src, Register dst, Register cnt, Register tmp) {
3568  Label Lloop;
3569  mtctr(cnt);
3570
3571  bind(Lloop);
3572  lbz(tmp, 0, src);
3573  addi(src, src, 1);
3574  sth(tmp, 0, dst);
3575  addi(dst, dst, 2);
3576  bdnz(Lloop);
3577}
3578
3579void MacroAssembler::string_compare(Register str1, Register str2,
3580                                    Register cnt1, Register cnt2,
3581                                    Register tmp1, Register result, int ae) {
3582  const Register tmp0 = R0,
3583                 diff = tmp1;
3584
3585  assert_different_registers(str1, str2, cnt1, cnt2, tmp0, tmp1, result);
3586  Label Ldone, Lslow, Lloop, Lreturn_diff;
3587
3588  // Note: Making use of the fact that compareTo(a, b) == -compareTo(b, a)
3589  // we interchange str1 and str2 in the UL case and negate the result.
3590  // Like this, str1 is always latin1 encoded, except for the UU case.
3591  // In addition, the counts need to be zero-extended (sign extension gives the same result, as the sign is 0).
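  // The overall result follows String.compareTo semantics (illustrative sketch):
  //   minlen = min(cnt1, cnt2);
  //   for (i = 0; i < minlen; i++)
  //     if (str1[i] != str2[i]) return str1[i] - str2[i];
  //   return cnt1 - cnt2;        // for UL the result is negated at the end (see note above)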
3592
3593  if (ae == StrIntrinsicNode::UU) {
3594    srwi(cnt1, cnt1, 1);
3595  } else {
3596    clrldi(cnt1, cnt1, 32);
3597  }
3598
3599  if (ae != StrIntrinsicNode::LL) {
3600    srwi(cnt2, cnt2, 1);
3601  } else {
3602    clrldi(cnt2, cnt2, 32);
3603  }
3604
3605  // See if the lengths are different, and calculate min in cnt1.
3606  // Save diff in case we need it for a tie-breaker.
3607  subf_(diff, cnt2, cnt1); // diff = cnt1 - cnt2
3608  // if (diff > 0) { cnt1 = cnt2; }
3609  if (VM_Version::has_isel()) {
3610    isel(cnt1, CCR0, Assembler::greater, /*invert*/ false, cnt2);
3611  } else {
3612    Label Lskip;
3613    blt(CCR0, Lskip);
3614    mr(cnt1, cnt2);
3615    bind(Lskip);
3616  }
3617
3618  // Rename registers
3619  Register chr1 = result;
3620  Register chr2 = tmp0;
3621
3622  // Compare multiple characters in fast loop (only implemented for same encoding).
3623  int stride1 = 8, stride2 = 8;
3624  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3625    int log2_chars_per_iter = (ae == StrIntrinsicNode::LL) ? 3 : 2;
3626    Label Lfastloop, Lskipfast;
3627
3628    srwi_(tmp0, cnt1, log2_chars_per_iter);
3629    beq(CCR0, Lskipfast);
3630    rldicl(cnt2, cnt1, 0, 64 - log2_chars_per_iter); // Remaining characters.
3631    li(cnt1, 1 << log2_chars_per_iter); // Initialize for failure case: Rescan characters from current iteration.
3632    mtctr(tmp0);
3633
3634    bind(Lfastloop);
3635    ld(chr1, 0, str1);
3636    ld(chr2, 0, str2);
3637    cmpd(CCR0, chr1, chr2);
3638    bne(CCR0, Lslow);
3639    addi(str1, str1, stride1);
3640    addi(str2, str2, stride2);
3641    bdnz(Lfastloop);
3642    mr(cnt1, cnt2); // Remaining characters.
3643    bind(Lskipfast);
3644  }
3645
3646  // Loop which searches the first difference character by character.
3647  cmpwi(CCR0, cnt1, 0);
3648  beq(CCR0, Lreturn_diff);
3649  bind(Lslow);
3650  mtctr(cnt1);
3651
3652  switch (ae) {
3653    case StrIntrinsicNode::LL: stride1 = 1; stride2 = 1; break;
3654    case StrIntrinsicNode::UL: // fallthru (see comment above)
3655    case StrIntrinsicNode::LU: stride1 = 1; stride2 = 2; break;
3656    case StrIntrinsicNode::UU: stride1 = 2; stride2 = 2; break;
3657    default: ShouldNotReachHere(); break;
3658  }
3659
3660  bind(Lloop);
3661  if (stride1 == 1) { lbz(chr1, 0, str1); } else { lhz(chr1, 0, str1); }
3662  if (stride2 == 1) { lbz(chr2, 0, str2); } else { lhz(chr2, 0, str2); }
3663  subf_(result, chr2, chr1); // result = chr1 - chr2
3664  bne(CCR0, Ldone);
3665  addi(str1, str1, stride1);
3666  addi(str2, str2, stride2);
3667  bdnz(Lloop);
3668
3669  // If strings are equal up to min length, return the length difference.
3670  bind(Lreturn_diff);
3671  mr(result, diff);
3672
3673  // Otherwise, return the difference between the first mismatched chars.
3674  bind(Ldone);
3675  if (ae == StrIntrinsicNode::UL) {
3676    neg(result, result); // Negate result (see note above).
3677  }
3678}
3679
3680void MacroAssembler::array_equals(bool is_array_equ, Register ary1, Register ary2,
3681                                  Register limit, Register tmp1, Register result, bool is_byte) {
3682  const Register tmp0 = R0;
3683  assert_different_registers(ary1, ary2, limit, tmp0, tmp1, result);
3684  Label Ldone, Lskiploop, Lloop, Lfastloop, Lskipfast;
3685  bool limit_needs_shift = false;
3686
3687  if (is_array_equ) {
3688    const int length_offset = arrayOopDesc::length_offset_in_bytes();
3689    const int base_offset   = arrayOopDesc::base_offset_in_bytes(is_byte ? T_BYTE : T_CHAR);
3690
3691    // Return true if the same array.
3692    cmpd(CCR0, ary1, ary2);
3693    beq(CCR0, Lskiploop);
3694
3695    // Return false if one of them is NULL.
3696    cmpdi(CCR0, ary1, 0);
3697    cmpdi(CCR1, ary2, 0);
3698    li(result, 0);
3699    cror(CCR0, Assembler::equal, CCR1, Assembler::equal);
3700    beq(CCR0, Ldone);
3701
3702    // Load the lengths of arrays.
3703    lwz(limit, length_offset, ary1);
3704    lwz(tmp0, length_offset, ary2);
3705
3706    // Return false if the two arrays are not equal length.
3707    cmpw(CCR0, limit, tmp0);
3708    bne(CCR0, Ldone);
3709
3710    // Load array addresses.
3711    addi(ary1, ary1, base_offset);
3712    addi(ary2, ary2, base_offset);
3713  } else {
3714    limit_needs_shift = !is_byte;
3715    li(result, 0); // Assume not equal.
3716  }
3717
3718  // Rename registers
3719  Register chr1 = tmp0;
3720  Register chr2 = tmp1;
3721
3722  // Compare 8 bytes per iteration in fast loop.
3723  const int log2_chars_per_iter = is_byte ? 3 : 2;
3724
3725  srwi_(tmp0, limit, log2_chars_per_iter + (limit_needs_shift ? 1 : 0));
3726  beq(CCR0, Lskipfast);
3727  mtctr(tmp0);
3728
3729  bind(Lfastloop);
3730  ld(chr1, 0, ary1);
3731  ld(chr2, 0, ary2);
3732  addi(ary1, ary1, 8);
3733  addi(ary2, ary2, 8);
3734  cmpd(CCR0, chr1, chr2);
3735  bne(CCR0, Ldone);
3736  bdnz(Lfastloop);
3737
3738  bind(Lskipfast);
3739  rldicl_(limit, limit, limit_needs_shift ? 64 - 1 : 0, 64 - log2_chars_per_iter); // Remaining characters.
3740  beq(CCR0, Lskiploop);
3741  mtctr(limit);
3742
3743  // Character by character.
3744  bind(Lloop);
3745  if (is_byte) {
3746    lbz(chr1, 0, ary1);
3747    lbz(chr2, 0, ary2);
3748    addi(ary1, ary1, 1);
3749    addi(ary2, ary2, 1);
3750  } else {
3751    lhz(chr1, 0, ary1);
3752    lhz(chr2, 0, ary2);
3753    addi(ary1, ary1, 2);
3754    addi(ary2, ary2, 2);
3755  }
3756  cmpw(CCR0, chr1, chr2);
3757  bne(CCR0, Ldone);
3758  bdnz(Lloop);
3759
3760  bind(Lskiploop);
3761  li(result, 1); // All characters are equal.
3762  bind(Ldone);
3763}
3764
3765void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt,
3766                                    Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval,
3767                                    Register tmp1, Register tmp2, Register tmp3, Register tmp4, int ae) {
3768
3769  // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite!
3770  Label L_TooShort, L_Found, L_NotFound, L_End;
3771  Register last_addr = haycnt, // Kill haycnt at the beginning.
3772  addr      = tmp1,
3773  n_start   = tmp2,
3774  ch1       = tmp3,
3775  ch2       = R0;
3776
3777  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
3778  const int h_csize = (ae == StrIntrinsicNode::LL) ? 1 : 2;
3779  const int n_csize = (ae == StrIntrinsicNode::UU) ? 2 : 1;
3780
3781  // **************************************************************************************************
3782  // Prepare for main loop: optimized for needle count >=2, bail out otherwise.
3783  // **************************************************************************************************
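  // Illustrative outer structure (assumes needlecnt >= 2, see banner above; ignores encoding details):
  //   for (i = 0; i <= haycnt - needlecnt; i++) {
  //     if (haystack[i..i+1] == needle[0..1]              // L_OuterLoop / L_InnerLoop
  //         && haystack[i+2..] starts with needle[2..])   // L_CompLoop
  //       return i;
  //   }
  //   return -1;                                          // L_NotFound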
3784
3785  // Compute last haystack addr to use if no match gets found.
3786  clrldi(haycnt, haycnt, 32);         // Ensure positive int is valid as 64 bit value.
3787  addi(addr, haystack, -h_csize);     // Accesses use pre-increment.
3788  if (needlecntval == 0) { // variable needlecnt
3789   cmpwi(CCR6, needlecnt, 2);
3790   clrldi(needlecnt, needlecnt, 32);  // Ensure positive int is valid as 64 bit value.
3791   blt(CCR6, L_TooShort);             // Variable needlecnt: handle short needle separately.
3792  }
3793
3794  if (n_csize == 2) { lwz(n_start, 0, needle); } else { lhz(n_start, 0, needle); } // Load first 2 characters of needle.
3795
3796  if (needlecntval == 0) { // variable needlecnt
3797   subf(ch1, needlecnt, haycnt);      // Last character index to compare is haycnt-needlecnt.
3798   addi(needlecnt, needlecnt, -2);    // Rest of needle.
3799  } else { // constant needlecnt
3800   guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately");
3801   assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate");
3802   addi(ch1, haycnt, -needlecntval);  // Last character index to compare is haycnt-needlecnt.
3803   if (needlecntval > 3) { li(needlecnt, needlecntval - 2); } // Rest of needle.
3804  }
3805
3806  if (h_csize == 2) { slwi(ch1, ch1, 1); } // Scale to number of bytes.
3807
3808  if (ae == StrIntrinsicNode::UL) {
3809   srwi(tmp4, n_start, 1*8);          // ___0
3810   rlwimi(n_start, tmp4, 2*8, 0, 23); // _0_1
3811  }
3812
3813  add(last_addr, haystack, ch1);      // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
3814
3815  // Main Loop (now we have at least 2 characters).
3816  Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2;
3817  bind(L_OuterLoop); // Search for 1st 2 characters.
3818  Register addr_diff = tmp4;
3819   subf(addr_diff, addr, last_addr);  // Difference between already checked address and last address to check.
3820   addi(addr, addr, h_csize);         // This is the new address we want to use for comparing.
3821   srdi_(ch2, addr_diff, h_csize);
3822   beq(CCR0, L_FinalCheck);           // 2 characters left?
3823   mtctr(ch2);                        // num of characters / 2
3824  bind(L_InnerLoop);                  // Main work horse (2x unrolled search loop)
3825   if (h_csize == 2) {                // Load 2 characters of haystack (ignore alignment).
3826    lwz(ch1, 0, addr);
3827    lwz(ch2, 2, addr);
3828   } else {
3829    lhz(ch1, 0, addr);
3830    lhz(ch2, 1, addr);
3831   }
3832   cmpw(CCR0, ch1, n_start);          // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop).
3833   cmpw(CCR1, ch2, n_start);
3834   beq(CCR0, L_Comp1);                // Did we find the needle start?
3835   beq(CCR1, L_Comp2);
3836   addi(addr, addr, 2 * h_csize);
3837   bdnz(L_InnerLoop);
3838  bind(L_FinalCheck);
3839   andi_(addr_diff, addr_diff, h_csize); // Remaining characters not covered by InnerLoop: (num of characters) & 1.
3840   beq(CCR0, L_NotFound);
3841   if (h_csize == 2) { lwz(ch1, 0, addr); } else { lhz(ch1, 0, addr); } // One position left at which we have to compare.
3842   cmpw(CCR1, ch1, n_start);
3843   beq(CCR1, L_Comp1);
3844  bind(L_NotFound);
3845   li(result, -1);                    // not found
3846   b(L_End);
3847
3848   // **************************************************************************************************
3849   // Special Case: unfortunately, the variable needle case can be called with needlecnt<2
3850   // **************************************************************************************************
3851  if (needlecntval == 0) {           // We have to handle these cases separately.
3852  Label L_OneCharLoop;
3853  bind(L_TooShort);
3854   mtctr(haycnt);
3855   if (n_csize == 2) { lhz(n_start, 0, needle); } else { lbz(n_start, 0, needle); } // First character of needle
3856  bind(L_OneCharLoop);
3857   if (h_csize == 2) { lhzu(ch1, 2, addr); } else { lbzu(ch1, 1, addr); }
3858   cmpw(CCR1, ch1, n_start);
3859   beq(CCR1, L_Found);               // Did we find the one character needle?
3860   bdnz(L_OneCharLoop);
3861   li(result, -1);                   // Not found.
3862   b(L_End);
3863  }
3864
3865  // **************************************************************************************************
3866  // Regular Case Part II: compare rest of needle (first 2 characters have been compared already)
3867  // **************************************************************************************************
3868
3869  // Compare the rest
3870  bind(L_Comp2);
3871   addi(addr, addr, h_csize);        // First comparison has failed, 2nd one hit.
3872  bind(L_Comp1);                     // Addr points to possible needle start.
3873  if (needlecntval != 2) {           // Const needlecnt==2?
3874   if (needlecntval != 3) {
3875    if (needlecntval == 0) { beq(CCR6, L_Found); } // Variable needlecnt==2?
3876    Register n_ind = tmp4,
3877             h_ind = n_ind;
3878    li(n_ind, 2 * n_csize);          // First 2 characters are already compared, use index 2.
3879    mtctr(needlecnt);                // Decremented by 2, still > 0.
3880   Label L_CompLoop;
3881   bind(L_CompLoop);
3882    if (ae == StrIntrinsicNode::UL) {
3883      h_ind = ch1;
3884      sldi(h_ind, n_ind, 1);
3885    }
3886    if (n_csize == 2) { lhzx(ch2, needle, n_ind); } else { lbzx(ch2, needle, n_ind); }
3887    if (h_csize == 2) { lhzx(ch1, addr, h_ind); } else { lbzx(ch1, addr, h_ind); }
3888    cmpw(CCR1, ch1, ch2);
3889    bne(CCR1, L_OuterLoop);
3890    addi(n_ind, n_ind, n_csize);
3891    bdnz(L_CompLoop);
3892   } else { // No loop required if there's only one needle character left.
3893    if (n_csize == 2) { lhz(ch2, 2 * 2, needle); } else { lbz(ch2, 2 * 1, needle); }
3894    if (h_csize == 2) { lhz(ch1, 2 * 2, addr); } else { lbz(ch1, 2 * 1, addr); }
3895    cmpw(CCR1, ch1, ch2);
3896    bne(CCR1, L_OuterLoop);
3897   }
3898  }
3899  // Return index ...
3900  bind(L_Found);
3901   subf(result, haystack, addr);     // relative to haystack, ...
3902   if (h_csize == 2) { srdi(result, result, 1); } // in characters.
3903  bind(L_End);
3904} // string_indexof
3905
3906void MacroAssembler::string_indexof_char(Register result, Register haystack, Register haycnt,
3907                                         Register needle, jchar needleChar, Register tmp1, Register tmp2, bool is_byte) {
3908  assert_different_registers(haystack, haycnt, needle, tmp1, tmp2);
3909
3910  Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_NotFound, L_End;
3911  Register addr = tmp1,
3912           ch1 = tmp2,
3913           ch2 = R0;
3914
3915  const int h_csize = is_byte ? 1 : 2;
3916
3917//4:
3918   srwi_(tmp2, haycnt, 1);   // Shift right by exact_log2(UNROLL_FACTOR).
3919   mr(addr, haystack);
3920   beq(CCR0, L_FinalCheck);
3921   mtctr(tmp2);              // Move to count register.
3922//8:
3923  bind(L_InnerLoop);         // Main work horse (2x unrolled search loop).
3924   if (!is_byte) {
3925    lhz(ch1, 0, addr);
3926    lhz(ch2, 2, addr);
3927   } else {
3928    lbz(ch1, 0, addr);
3929    lbz(ch2, 1, addr);
3930   }
3931   (needle != R0) ? cmpw(CCR0, ch1, needle) : cmplwi(CCR0, ch1, (unsigned int)needleChar);
3932   (needle != R0) ? cmpw(CCR1, ch2, needle) : cmplwi(CCR1, ch2, (unsigned int)needleChar);
3933   beq(CCR0, L_Found1);      // Did we find the needle?
3934   beq(CCR1, L_Found2);
3935   addi(addr, addr, 2 * h_csize);
3936   bdnz(L_InnerLoop);
3937//16:
3938  bind(L_FinalCheck);
3939   andi_(R0, haycnt, 1);
3940   beq(CCR0, L_NotFound);
3941   if (!is_byte) { lhz(ch1, 0, addr); } else { lbz(ch1, 0, addr); } // One position left at which we have to compare.
3942   (needle != R0) ? cmpw(CCR1, ch1, needle) : cmplwi(CCR1, ch1, (unsigned int)needleChar);
3943   beq(CCR1, L_Found1);
3944//21:
3945  bind(L_NotFound);
3946   li(result, -1);           // Not found.
3947   b(L_End);
3948
3949  bind(L_Found2);
3950   addi(addr, addr, h_csize);
3951//24:
3952  bind(L_Found1);            // Return index ...
3953   subf(result, haystack, addr); // relative to haystack, ...
3954   if (!is_byte) { srdi(result, result, 1); } // in characters.
3955  bind(L_End);
3956} // string_indexof_char
3957
3958
3959void MacroAssembler::has_negatives(Register src, Register cnt, Register result,
3960                                   Register tmp1, Register tmp2) {
3961  const Register tmp0 = R0;
3962  assert_different_registers(src, result, cnt, tmp0, tmp1, tmp2);
3963  Label Lfastloop, Lslow, Lloop, Lnoneg, Ldone;
3964
3965  // Check if cnt >= 16 (bytes)
3966  lis(tmp1, (int)(short)0x8080);  // tmp1 = 0x8080808080808080
3967  srwi_(tmp2, cnt, 4);
3968  li(result, 1);                  // Assume there's a negative byte.
3969  beq(CCR0, Lslow);
3970  ori(tmp1, tmp1, 0x8080);
3971  rldimi(tmp1, tmp1, 32, 0);
3972  mtctr(tmp2);
3973
3974  // 2x unrolled loop
3975  bind(Lfastloop);
3976  ld(tmp2, 0, src);
3977  ld(tmp0, 8, src);
3978
3979  orr(tmp0, tmp2, tmp0);
3980
3981  and_(tmp0, tmp0, tmp1);
3982  bne(CCR0, Ldone);               // Found negative byte.
3983  addi(src, src, 16);
3984
3985  bdnz(Lfastloop);
3986
3987  bind(Lslow);                    // Fallback to slow version
3988  rldicl_(tmp0, cnt, 0, 64-4);
3989  beq(CCR0, Lnoneg);
3990  mtctr(tmp0);
3991  bind(Lloop);
3992  lbz(tmp0, 0, src);
3993  addi(src, src, 1);
3994  andi_(tmp0, tmp0, 0x80);
3995  bne(CCR0, Ldone);               // Found negative byte.
3996  bdnz(Lloop);
3997  bind(Lnoneg);
3998  li(result, 0);
3999
4000  bind(Ldone);
4001}
4002
4003#endif // COMPILER2
4004
4005// Helpers for Intrinsic Emitters
4006//
4007// Revert the byte order of a 32bit value in a register
4008//   src: 0x44556677
4009//   dst: 0x77665544
4010// Three steps to obtain the result:
4011//  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
4012//     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
4013//     This value initializes dst.
4014//  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
4015//     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
4016//     This value is mask inserted into dst with a [0..23] mask of 1s.
4017//  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
4018//     This value is mask inserted into dst with a [8..15] mask of 1s.
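// Worked example (illustrative), src = 0x44556677:
//   after step 1: dst = 0x00000044
//   after step 2: dst = 0x77445544   (byte 5 not yet correct)
//   after step 3: dst = 0x77665544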
4019void MacroAssembler::load_reverse_32(Register dst, Register src) {
4020  assert_different_registers(dst, src);
4021
4022  rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
4023  rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
4024  rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
4025}
4026
4027// Calculate the column addresses of the crc32 lookup table into distinct registers.
4028// This loop-invariant calculation is moved out of the loop body, reducing the loop
4029// body size from 20 to 16 instructions.
4030// Returns the offset that was used to calculate the address of column tc3.
4031// Due to register shortage, setting tc3 may overwrite table. With the return offset
4032// at hand, the original table address can be easily reconstructed.
4033int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
4034
4035#ifdef VM_LITTLE_ENDIAN
4036  // This is what we implement (the DOLIT4 part):
4037  // ========================================================================= */
4038  // #define DOLIT4 c ^= *buf4++; \
4039  //         c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
4040  //             crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
4041  // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
4042  // ========================================================================= */
4043  const int ix0 = 3*(4*CRC32_COLUMN_SIZE);
4044  const int ix1 = 2*(4*CRC32_COLUMN_SIZE);
4045  const int ix2 = 1*(4*CRC32_COLUMN_SIZE);
4046  const int ix3 = 0*(4*CRC32_COLUMN_SIZE);
4047#else
4048  // This is what we implement (the DOBIG4 part):
4049  // =========================================================================
4050  // #define DOBIG4 c ^= *++buf4; \
4051  //         c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
4052  //             crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
4053  // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
4054  // =========================================================================
4055  const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
4056  const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
4057  const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
4058  const int ix3 = 7*(4*CRC32_COLUMN_SIZE);
4059#endif
4060  assert_different_registers(table, tc0, tc1, tc2);
4061  assert(table == tc3, "must be!");
4062
4063  addi(tc0, table, ix0);
4064  addi(tc1, table, ix1);
4065  addi(tc2, table, ix2);
4066  if (ix3 != 0) addi(tc3, table, ix3);
4067
4068  return ix3;
4069}
4070
4071/**
4072 * uint32_t crc;
4073 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
4074 */
4075void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
4076  assert_different_registers(crc, table, tmp);
4077  assert_different_registers(val, table);
4078
4079  if (crc == val) {                   // Must rotate first to use the unmodified value.
4080    rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
4081                                      // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
4082    srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
4083  } else {
4084    srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
4085    rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
4086  }
4087  lwzx(tmp, table, tmp);
4088  xorr(crc, crc, tmp);
4089}
4090
4091/**
4092 * uint32_t crc;
4093 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
4094 */
4095void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
4096  fold_byte_crc32(crc, crc, table, tmp);
4097}
4098
4099/**
4100 * Emits code to update CRC-32 with a byte value according to constants in table.
4101 *
4102 * @param [in,out]crc   Register containing the crc.
4103 * @param [in]val       Register containing the byte to fold into the CRC.
4104 * @param [in]table     Register containing the table of crc constants.
4105 *
4106 * uint32_t crc;
4107 * val = crc_table[(val ^ crc) & 0xFF];
4108 * crc = val ^ (crc >> 8);
4109 */
4110void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
4111  BLOCK_COMMENT("update_byte_crc32:");
4112  xorr(val, val, crc);
4113  fold_byte_crc32(crc, val, table, val);
4114}
4115
4116/**
4117 * @param crc   register containing existing CRC (32-bit)
4118 * @param buf   register pointing to input byte buffer (byte*)
4119 * @param len   register containing number of bytes
4120 * @param table register pointing to CRC table
4121 */
4122void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
4123                                           Register data, bool loopAlignment, bool invertCRC) {
4124  assert_different_registers(crc, buf, len, table, data);
4125
4126  Label L_mainLoop, L_done;
4127  const int mainLoop_stepping  = 1;
4128  const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
4129
4130  // Process all bytes in a single-byte loop.
4131  clrldi_(len, len, 32);                         // Enforce 32 bit. Anything to do?
4132  beq(CCR0, L_done);
4133
4134  if (invertCRC) {
4135    nand(crc, crc, crc);                         // ~c
4136  }
4137
4138  mtctr(len);
4139  align(mainLoop_alignment);
4140  BIND(L_mainLoop);
4141    lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
4142    addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
4143    update_byte_crc32(crc, data, table);
4144    bdnz(L_mainLoop);                            // Iterate.
4145
4146  if (invertCRC) {
4147    nand(crc, crc, crc);                         // ~c
4148  }
4149
4150  bind(L_done);
4151}
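
// For reference, the loop emitted above corresponds roughly to the following
// scalar C sketch (illustrative only; assumes the zlib-style single-column
// table layout referenced below and a made-up helper name):
//
//   uint32_t crc32_byte_loop(uint32_t crc, const uint8_t* buf, uint32_t len,
//                            const uint32_t* table, bool invert) {
//     if (invert) crc = ~crc;
//     while (len-- > 0) {
//       crc = table[(crc ^ *buf++) & 0xff] ^ (crc >> 8);
//     }
//     if (invert) crc = ~crc;
//     return crc;
//   }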
4152
4153/**
4154 * Emits code to update CRC-32 with a 4-byte value according to constants in table
4155 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
4156 */
// A note on the lookup table address(es):
4158// The lookup table consists of two sets of four columns each.
4159// The columns {0..3} are used for little-endian machines.
4160// The columns {4..7} are used for big-endian machines.
4161// To save the effort of adding the column offset to the table address each time
4162// a table element is looked up, it is possible to pass the pre-calculated
4163// column addresses.
// Uses R9..R12 as work registers. They must be saved/restored by the caller, if necessary.
4165void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
4166                                        Register t0,  Register t1,  Register t2,  Register t3,
4167                                        Register tc0, Register tc1, Register tc2, Register tc3) {
4168  assert_different_registers(crc, t3);
4169
4170  // XOR crc with next four bytes of buffer.
4171  lwz(t3, bufDisp, buf);
4172  if (bufInc != 0) {
4173    addi(buf, buf, bufInc);
4174  }
4175  xorr(t3, t3, crc);
4176
  // Chop the xor result (in t3) into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
  rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t3 >>  0) & 0xff) << 2
  rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t3 >>  8) & 0xff) << 2
  rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t3 >> 16) & 0xff) << 2
  rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t3 >> 24) & 0xff) << 2
4182
4183  // Use the pre-calculated column addresses.
4184  // Load pre-calculated table values.
4185  lwzx(t0, tc0, t0);
4186  lwzx(t1, tc1, t1);
4187  lwzx(t2, tc2, t2);
4188  lwzx(t3, tc3, t3);
4189
4190  // Calculate new crc from table values.
4191  xorr(t0,  t0, t1);
4192  xorr(t2,  t2, t3);
4193  xorr(crc, t0, t2);  // Now crc contains the final checksum value.
4194}
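
// Illustrative C view of the single word step emitted above, using the
// pre-computed column base addresses tc0..tc3 (each column holds 256 uint32_t
// entries; the zlib-1.2.8 table layout mentioned above is assumed):
//
//   c ^= next_4_bytes;
//   c =   ((const uint32_t*)tc0)[ c        & 0xff]
//       ^ ((const uint32_t*)tc1)[(c >>  8) & 0xff]
//       ^ ((const uint32_t*)tc2)[(c >> 16) & 0xff]
//       ^ ((const uint32_t*)tc3)[ c >> 24        ];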
4195
4196/**
4197 * @param crc   register containing existing CRC (32-bit)
4198 * @param buf   register pointing to input byte buffer (byte*)
4199 * @param len   register containing number of bytes
4200 * @param table register pointing to CRC table
4201 *
 * Uses R9..R12 as work registers. Must be saved/restored by caller!
4203 */
4204void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
4205                                        Register t0,  Register t1,  Register t2,  Register t3,
4206                                        Register tc0, Register tc1, Register tc2, Register tc3) {
4207  assert_different_registers(crc, buf, len, table);
4208
4209  Label L_mainLoop, L_tail;
4210  Register  tmp  = t0;
4211  Register  data = t0;
4212  Register  tmp2 = t1;
4213  const int mainLoop_stepping  = 8;
4214  const int tailLoop_stepping  = 1;
4215  const int log_stepping       = exact_log2(mainLoop_stepping);
4216  const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4217  const int complexThreshold   = 2*mainLoop_stepping;
4218
4219  // Don't test for len <= 0 here. This pathological case should not occur anyway.
4220  // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
  // The situation itself is detected and handled correctly by the conditional branches
  // following the length adjustments (len -= stepping and len += stepping).
4223  assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4224
4225  BLOCK_COMMENT("kernel_crc32_2word {");
4226
4227  nand(crc, crc, crc);                           // ~c
4228
4229  // Check for short (<mainLoop_stepping) buffer.
4230  cmpdi(CCR0, len, complexThreshold);
4231  blt(CCR0, L_tail);
4232
4233  // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4234  // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4235  {
4236    // Align buf addr to mainLoop_stepping boundary.
4237    neg(tmp2, buf);                           // Calculate # preLoop iterations for alignment.
    rldicl(tmp2, tmp2, 0, 64-log_stepping);   // Keep only the low log_stepping bits: # bytes to the next mainLoop_stepping boundary.
4239
4240    if (complexThreshold > mainLoop_stepping) {
4241      sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4242    } else {
4243      sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
4244      cmpdi(CCR0, tmp, mainLoop_stepping);
      blt(CCR0, L_tail);                         // For less than one mainLoop_stepping left, do only tail processing.
      mr(len, tmp);                              // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4247    }
4248    update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
4249  }
4250
4251  srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
4252  andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
4253  mtctr(tmp2);
4254
4255#ifdef VM_LITTLE_ENDIAN
4256  Register crc_rv = crc;
4257#else
  Register crc_rv = tmp;                         // load_reverse_32 needs separate registers to work on.
                                                 // Occupies tmp, but frees up crc.
  load_reverse_32(crc_rv, crc);                  // Reverse byte order because we are dealing with big-endian data.
  tmp = crc;
4262#endif
4263
4264  int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4265
4266  align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
4267  BIND(L_mainLoop);
4268    update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4269    update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4270    bdnz(L_mainLoop);
4271
4272#ifndef VM_LITTLE_ENDIAN
  load_reverse_32(crc, crc_rv);                  // Reverse byte order because we are dealing with big-endian data.
  tmp = crc_rv;                                  // tmp uses its original register again.
4275#endif
4276
4277  // Restore original table address for tailLoop.
4278  if (reconstructTableOffset != 0) {
4279    addi(table, table, -reconstructTableOffset);
4280  }
4281
4282  // Process last few (<complexThreshold) bytes of buffer.
4283  BIND(L_tail);
4284  update_byteLoop_crc32(crc, buf, len, table, data, false, false);
4285
4286  nand(crc, crc, crc);                           // ~c
4287  BLOCK_COMMENT("} kernel_crc32_2word");
4288}
4289
4290/**
4291 * @param crc   register containing existing CRC (32-bit)
4292 * @param buf   register pointing to input byte buffer (byte*)
4293 * @param len   register containing number of bytes
4294 * @param table register pointing to CRC table
4295 *
 * Uses R9..R12 as work registers. Must be saved/restored by caller!
4297 */
4298void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
4299                                        Register t0,  Register t1,  Register t2,  Register t3,
4300                                        Register tc0, Register tc1, Register tc2, Register tc3) {
4301  assert_different_registers(crc, buf, len, table);
4302
4303  Label L_mainLoop, L_tail;
4304  Register  tmp          = t0;
4305  Register  data         = t0;
4306  Register  tmp2         = t1;
4307  const int mainLoop_stepping  = 4;
4308  const int tailLoop_stepping  = 1;
4309  const int log_stepping       = exact_log2(mainLoop_stepping);
4310  const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4311  const int complexThreshold   = 2*mainLoop_stepping;
4312
4313  // Don't test for len <= 0 here. This pathological case should not occur anyway.
4314  // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
  // The situation itself is detected and handled correctly by the conditional branches
  // following the length adjustments (len -= stepping and len += stepping).
4317  assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4318
4319  BLOCK_COMMENT("kernel_crc32_1word {");
4320
4321  nand(crc, crc, crc);                           // ~c
4322
4323  // Check for short (<mainLoop_stepping) buffer.
4324  cmpdi(CCR0, len, complexThreshold);
4325  blt(CCR0, L_tail);
4326
4327  // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4328  // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4329  {
4330    // Align buf addr to mainLoop_stepping boundary.
4331    neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
    rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Keep only the low log_stepping bits: # bytes to the next mainLoop_stepping boundary.
4333
4334    if (complexThreshold > mainLoop_stepping) {
4335      sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4336    } else {
4337      sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
4338      cmpdi(CCR0, tmp, mainLoop_stepping);
      blt(CCR0, L_tail);                         // For less than one mainLoop_stepping left, do only tail processing.
      mr(len, tmp);                              // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4341    }
4342    update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
4343  }
4344
4345  srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
4346  andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
4347  mtctr(tmp2);
4348
4349#ifdef VM_LITTLE_ENDIAN
4350  Register crc_rv = crc;
4351#else
  Register crc_rv = tmp;                         // load_reverse_32 needs separate registers to work on.
                                                 // Occupies tmp, but frees up crc.
  load_reverse_32(crc_rv, crc);                  // Reverse byte order because we are dealing with big-endian data.
  tmp = crc;
4356#endif
4357
4358  int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4359
4360  align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
4361  BIND(L_mainLoop);
4362    update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4363    bdnz(L_mainLoop);
4364
4365#ifndef VM_LITTLE_ENDIAN
  load_reverse_32(crc, crc_rv);                  // Reverse byte order because we are dealing with big-endian data.
  tmp = crc_rv;                                  // tmp uses its original register again.
4368#endif
4369
4370  // Restore original table address for tailLoop.
4371  if (reconstructTableOffset != 0) {
4372    addi(table, table, -reconstructTableOffset);
4373  }
4374
4375  // Process last few (<complexThreshold) bytes of buffer.
4376  BIND(L_tail);
4377  update_byteLoop_crc32(crc, buf, len, table, data, false, false);
4378
4379  nand(crc, crc, crc);                           // ~c
4380  BLOCK_COMMENT("} kernel_crc32_1word");
4381}
4382
4383/**
4384 * @param crc   register containing existing CRC (32-bit)
4385 * @param buf   register pointing to input byte buffer (byte*)
4386 * @param len   register containing number of bytes
4387 * @param table register pointing to CRC table
4388 *
4389 * Uses R7_ARG5, R8_ARG6 as work registers.
4390 */
4391void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
4392                                        Register t0,  Register t1,  Register t2,  Register t3) {
4393  assert_different_registers(crc, buf, len, table);
4394
4395  Register  data = t0;                   // Holds the current byte to be folded into crc.
4396
4397  BLOCK_COMMENT("kernel_crc32_1byte {");
4398
4399  // Process all bytes in a single-byte loop.
4400  update_byteLoop_crc32(crc, buf, len, table, data, true, true);
4401
4402  BLOCK_COMMENT("} kernel_crc32_1byte");
4403}
4404
4405/**
4406 * @param crc             register containing existing CRC (32-bit)
4407 * @param buf             register pointing to input byte buffer (byte*)
4408 * @param len             register containing number of bytes
4409 * @param table           register pointing to CRC table
4410 * @param constants       register pointing to CRC table for 128-bit aligned memory
 * @param barretConstants register pointing to table for Barrett reduction
4412 * @param t0              volatile register
4413 * @param t1              volatile register
4414 * @param t2              volatile register
4415 * @param t3              volatile register
4416 */
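// Rough outline of the code this function emits (illustrative sketch only; the
// helper names below are hypothetical stand-ins for the scalar paths above, buf
// advances as bytes are consumed, and the byte counts mirror the emitted code):
//
//   if (len < 384) return crc32_1word(crc, buf, len);            // scalar path
//   crc = ~crc;
//   prealign  = (-(uintptr_t)buf) & 127;                         // up to the next 128-byte boundary
//   postalign = (len - prealign) & 127;                          // tail length
//   crc = byte_loop(crc, buf, prealign);                         // head
//   crc = vpmsumd_aligned(crc, buf, len - prealign - postalign); // aligned middle, >= 256 bytes
//   crc = byte_loop(crc, buf, postalign);                        // tail
//   crc = ~crc;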
4417void MacroAssembler::kernel_crc32_1word_vpmsumd(Register crc, Register buf, Register len, Register table,
4418                                                Register constants,  Register barretConstants,
4419                                                Register t0,  Register t1, Register t2, Register t3, Register t4) {
4420  assert_different_registers(crc, buf, len, table);
4421
4422  Label L_alignedHead, L_tail, L_alignTail, L_start, L_end;
4423
4424  Register  prealign     = t0;
4425  Register  postalign    = t0;
4426
  BLOCK_COMMENT("kernel_crc32_1word_vpmsumd {");
4428
  // 1. Use kernel_crc32_1word for inputs shorter than 384 bytes.
4430  clrldi(len, len, 32);
4431  cmpdi(CCR0, len, 384);
4432  bge(CCR0, L_start);
4433
4434    Register tc0 = t4;
4435    Register tc1 = constants;
4436    Register tc2 = barretConstants;
    kernel_crc32_1word(crc, buf, len, table, t0, t1, t2, t3, tc0, tc1, tc2, table);
4438    b(L_end);
4439
4440  BIND(L_start);
4441
4442    // 2. ~c
4443    nand(crc, crc, crc);
4444
    // 3. Calculate from 0 to the first 128-byte-aligned address.
4446    clrldi_(prealign, buf, 57);
4447    beq(CCR0, L_alignedHead);
4448
4449    subfic(prealign, prealign, 128);
4450
4451    subf(len, prealign, len);
4452    update_byteLoop_crc32(crc, buf, prealign, table, t2, false, false);
4453
    // 4. Calculate from the first 128-byte-aligned address to the last one.
4455    BIND(L_alignedHead);
4456
4457    clrldi(postalign, len, 57);
4458    subf(len, postalign, len);
4459
    // len is now a multiple of 128 and at least 256 bytes (guaranteed by the 384-byte check above).
4461    kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t1, t2, t3);
4462
    // 5. Process the remaining bytes.
4464    cmpdi(CCR0, postalign, 0);
4465    beq(CCR0, L_tail);
4466
4467    update_byteLoop_crc32(crc, buf, postalign, table, t2, false, false);
4468
4469    BIND(L_tail);
4470
4471    // 6. ~c
4472    nand(crc, crc, crc);
4473
4474  BIND(L_end);
4475
  BLOCK_COMMENT("} kernel_crc32_1word_vpmsumd");
4477}
4478
4479/**
4480 * @param crc             register containing existing CRC (32-bit)
4481 * @param buf             register pointing to input byte buffer (byte*)
4482 * @param len             register containing number of bytes
4483 * @param constants       register pointing to CRC table for 128-bit aligned memory
 * @param barretConstants register pointing to table for Barrett reduction
4485 * @param t0              volatile register
4486 * @param t1              volatile register
4487 * @param t2              volatile register
4488 */
4489void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Register len,
4490    Register constants, Register barretConstants, Register t0, Register t1, Register t2) {
4491  Label L_mainLoop, L_tail, L_alignTail, L_barrett_reduction, L_end, L_first_warm_up_done, L_first_cool_down, L_second_cool_down, L_XOR, L_test;
4492  Label L_lv0, L_lv1, L_lv2, L_lv3, L_lv4, L_lv5, L_lv6, L_lv7, L_lv8, L_lv9, L_lv10, L_lv11, L_lv12, L_lv13, L_lv14, L_lv15;
4493  Label L_1, L_2, L_3, L_4;
4494
4495  Register  rLoaded      = t0;
4496  Register  rTmp1        = t1;
4497  Register  rTmp2        = t2;
4498  Register  off16        = R22;
4499  Register  off32        = R23;
4500  Register  off48        = R24;
4501  Register  off64        = R25;
4502  Register  off80        = R26;
4503  Register  off96        = R27;
4504  Register  off112       = R28;
4505  Register  rIdx         = R29;
4506  Register  rMax         = R30;
4507  Register  constantsPos = R31;
4508
4509  VectorRegister mask_32bit = VR24;
4510  VectorRegister mask_64bit = VR25;
4511  VectorRegister zeroes     = VR26;
4512  VectorRegister const1     = VR27;
4513  VectorRegister const2     = VR28;
4514
4515  // Save non-volatile vector registers (frameless).
4516  Register offset = t1;   int offsetInt = 0;
4517  offsetInt -= 16; li(offset, -16);           stvx(VR20, offset, R1_SP);
4518  offsetInt -= 16; addi(offset, offset, -16); stvx(VR21, offset, R1_SP);
4519  offsetInt -= 16; addi(offset, offset, -16); stvx(VR22, offset, R1_SP);
4520  offsetInt -= 16; addi(offset, offset, -16); stvx(VR23, offset, R1_SP);
4521  offsetInt -= 16; addi(offset, offset, -16); stvx(VR24, offset, R1_SP);
4522  offsetInt -= 16; addi(offset, offset, -16); stvx(VR25, offset, R1_SP);
4523  offsetInt -= 16; addi(offset, offset, -16); stvx(VR26, offset, R1_SP);
4524  offsetInt -= 16; addi(offset, offset, -16); stvx(VR27, offset, R1_SP);
4525  offsetInt -= 16; addi(offset, offset, -16); stvx(VR28, offset, R1_SP);
4526  offsetInt -= 8; std(R22, offsetInt, R1_SP);
4527  offsetInt -= 8; std(R23, offsetInt, R1_SP);
4528  offsetInt -= 8; std(R24, offsetInt, R1_SP);
4529  offsetInt -= 8; std(R25, offsetInt, R1_SP);
4530  offsetInt -= 8; std(R26, offsetInt, R1_SP);
4531  offsetInt -= 8; std(R27, offsetInt, R1_SP);
4532  offsetInt -= 8; std(R28, offsetInt, R1_SP);
4533  offsetInt -= 8; std(R29, offsetInt, R1_SP);
4534  offsetInt -= 8; std(R30, offsetInt, R1_SP);
4535  offsetInt -= 8; std(R31, offsetInt, R1_SP);
4536
4537  // Set constants
4538  li(off16, 16);
4539  li(off32, 32);
4540  li(off48, 48);
4541  li(off64, 64);
4542  li(off80, 80);
4543  li(off96, 96);
4544  li(off112, 112);
4545
4546  clrldi(crc, crc, 32);
4547
4548  vxor(zeroes, zeroes, zeroes);
4549  vspltisw(VR0, -1);
4550
4551  vsldoi(mask_32bit, zeroes, VR0, 4);
4552  vsldoi(mask_64bit, zeroes, VR0, -8);
4553
  // Get the initial value into VR8.
4555  vxor(VR8, VR8, VR8);
4556  mtvrd(VR8, crc);
4557  vsldoi(VR8, zeroes, VR8, -8); // shift into bottom 32 bits
4558
4559  li (rLoaded, 0);
4560
4561  rldicr(rIdx, len, 0, 56);
4562
4563  {
4564    BIND(L_1);
4565    // Checksum in blocks of MAX_SIZE (32768)
4566    lis(rMax, 0);
4567    ori(rMax, rMax, 32768);
4568    mr(rTmp2, rMax);
4569    cmpd(CCR0, rIdx, rMax);
4570    bgt(CCR0, L_2);
4571    mr(rMax, rIdx);
4572
4573    BIND(L_2);
4574    subf(rIdx, rMax, rIdx);
4575
4576    // our main loop does 128 bytes at a time
4577    srdi(rMax, rMax, 7);
4578
4579    /*
4580     * Work out the offset into the constants table to start at. Each
4581     * constant is 16 bytes, and it is used against 128 bytes of input
4582     * data - 128 / 16 = 8
4583     */
4584    sldi(rTmp1, rMax, 4);
4585    srdi(rTmp2, rTmp2, 3);
4586    subf(rTmp1, rTmp1, rTmp2);
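    // In other words (illustrative restatement, with MAX_SIZE = 32768 as loaded above):
    //   rTmp1 = MAX_SIZE/8 - num_128byte_chunks*16
    //         = 16 * (MAX_SIZE/128 - num_128byte_chunks),
    // i.e. the byte offset of the first constant to use when fewer than
    // MAX_SIZE/128 chunks remain in this block.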
4587
4588    // We reduce our final 128 bytes in a separate step
4589    addi(rMax, rMax, -1);
4590    mtctr(rMax);
4591
4592    // Find the start of our constants
4593    add(constantsPos, constants, rTmp1);
4594
    // Zero VR0-VR7, which will contain our checksums.
4596    vxor(VR0, VR0, VR0);
4597    vxor(VR1, VR1, VR1);
4598    vxor(VR2, VR2, VR2);
4599    vxor(VR3, VR3, VR3);
4600    vxor(VR4, VR4, VR4);
4601    vxor(VR5, VR5, VR5);
4602    vxor(VR6, VR6, VR6);
4603    vxor(VR7, VR7, VR7);
4604
4605    lvx(const1, constantsPos);
4606
4607    /*
4608     * If we are looping back to consume more data we use the values
     * already in VR16-VR23.
4610     */
4611    cmpdi(CCR0, rLoaded, 1);
4612    beq(CCR0, L_3);
4613    {
4614
4615      // First warm up pass
4616      lvx(VR16, buf);
4617      lvx(VR17, off16, buf);
4618      lvx(VR18, off32, buf);
4619      lvx(VR19, off48, buf);
4620      lvx(VR20, off64, buf);
4621      lvx(VR21, off80, buf);
4622      lvx(VR22, off96, buf);
4623      lvx(VR23, off112, buf);
4624      addi(buf, buf, 8*16);
4625
4626      // xor in initial value
4627      vxor(VR16, VR16, VR8);
4628    }
4629
4630    BIND(L_3);
4631    bdz(L_first_warm_up_done);
4632
4633    addi(constantsPos, constantsPos, 16);
4634    lvx(const2, constantsPos);
4635
4636    // Second warm up pass
4637    vpmsumd(VR8, VR16, const1);
4638    lvx(VR16, buf);
4639
4640    vpmsumd(VR9, VR17, const1);
4641    lvx(VR17, off16, buf);
4642
4643    vpmsumd(VR10, VR18, const1);
4644    lvx(VR18, off32, buf);
4645
4646    vpmsumd(VR11, VR19, const1);
4647    lvx(VR19, off48, buf);
4648
4649    vpmsumd(VR12, VR20, const1);
4650    lvx(VR20, off64, buf);
4651
4652    vpmsumd(VR13, VR21, const1);
4653    lvx(VR21, off80, buf);
4654
4655    vpmsumd(VR14, VR22, const1);
4656    lvx(VR22, off96, buf);
4657
4658    vpmsumd(VR15, VR23, const1);
4659    lvx(VR23, off112, buf);
4660
4661    addi(buf, buf, 8 * 16);
4662
4663    bdz(L_first_cool_down);
4664
4665    /*
4666     * main loop. We modulo schedule it such that it takes three iterations
4667     * to complete - first iteration load, second iteration vpmsum, third
4668     * iteration xor.
4669     */
4670    {
4671      BIND(L_4);
4672      lvx(const1, constantsPos); addi(constantsPos, constantsPos, 16);
4673
4674      vxor(VR0, VR0, VR8);
4675      vpmsumd(VR8, VR16, const2);
4676      lvx(VR16, buf);
4677
4678      vxor(VR1, VR1, VR9);
4679      vpmsumd(VR9, VR17, const2);
4680      lvx(VR17, off16, buf);
4681
4682      vxor(VR2, VR2, VR10);
4683      vpmsumd(VR10, VR18, const2);
4684      lvx(VR18, off32, buf);
4685
4686      vxor(VR3, VR3, VR11);
4687      vpmsumd(VR11, VR19, const2);
4688      lvx(VR19, off48, buf);
4689      lvx(const2, constantsPos);
4690
4691      vxor(VR4, VR4, VR12);
4692      vpmsumd(VR12, VR20, const1);
4693      lvx(VR20, off64, buf);
4694
4695      vxor(VR5, VR5, VR13);
4696      vpmsumd(VR13, VR21, const1);
4697      lvx(VR21, off80, buf);
4698
4699      vxor(VR6, VR6, VR14);
4700      vpmsumd(VR14, VR22, const1);
4701      lvx(VR22, off96, buf);
4702
4703      vxor(VR7, VR7, VR15);
4704      vpmsumd(VR15, VR23, const1);
4705      lvx(VR23, off112, buf);
4706
4707      addi(buf, buf, 8 * 16);
4708
4709      bdnz(L_4);
4710    }
4711
4712    BIND(L_first_cool_down);
4713
4714    // First cool down pass
4715    lvx(const1, constantsPos);
4716    addi(constantsPos, constantsPos, 16);
4717
4718    vxor(VR0, VR0, VR8);
4719    vpmsumd(VR8, VR16, const1);
4720
4721    vxor(VR1, VR1, VR9);
4722    vpmsumd(VR9, VR17, const1);
4723
4724    vxor(VR2, VR2, VR10);
4725    vpmsumd(VR10, VR18, const1);
4726
4727    vxor(VR3, VR3, VR11);
4728    vpmsumd(VR11, VR19, const1);
4729
4730    vxor(VR4, VR4, VR12);
4731    vpmsumd(VR12, VR20, const1);
4732
4733    vxor(VR5, VR5, VR13);
4734    vpmsumd(VR13, VR21, const1);
4735
4736    vxor(VR6, VR6, VR14);
4737    vpmsumd(VR14, VR22, const1);
4738
4739    vxor(VR7, VR7, VR15);
4740    vpmsumd(VR15, VR23, const1);
4741
4742    BIND(L_second_cool_down);
4743    // Second cool down pass
4744    vxor(VR0, VR0, VR8);
4745    vxor(VR1, VR1, VR9);
4746    vxor(VR2, VR2, VR10);
4747    vxor(VR3, VR3, VR11);
4748    vxor(VR4, VR4, VR12);
4749    vxor(VR5, VR5, VR13);
4750    vxor(VR6, VR6, VR14);
4751    vxor(VR7, VR7, VR15);
4752
4753    /*
4754     * vpmsumd produces a 96 bit result in the least significant bits
4755     * of the register. Since we are bit reflected we have to shift it
4756     * left 32 bits so it occupies the least significant bits in the
4757     * bit reflected domain.
4758     */
4759    vsldoi(VR0, VR0, zeroes, 4);
4760    vsldoi(VR1, VR1, zeroes, 4);
4761    vsldoi(VR2, VR2, zeroes, 4);
4762    vsldoi(VR3, VR3, zeroes, 4);
4763    vsldoi(VR4, VR4, zeroes, 4);
4764    vsldoi(VR5, VR5, zeroes, 4);
4765    vsldoi(VR6, VR6, zeroes, 4);
4766    vsldoi(VR7, VR7, zeroes, 4);
4767
4768    // xor with last 1024 bits
4769    lvx(VR8, buf);
4770    lvx(VR9, off16, buf);
4771    lvx(VR10, off32, buf);
4772    lvx(VR11, off48, buf);
4773    lvx(VR12, off64, buf);
4774    lvx(VR13, off80, buf);
4775    lvx(VR14, off96, buf);
4776    lvx(VR15, off112, buf);
4777    addi(buf, buf, 8 * 16);
4778
4779    vxor(VR16, VR0, VR8);
4780    vxor(VR17, VR1, VR9);
4781    vxor(VR18, VR2, VR10);
4782    vxor(VR19, VR3, VR11);
4783    vxor(VR20, VR4, VR12);
4784    vxor(VR21, VR5, VR13);
4785    vxor(VR22, VR6, VR14);
4786    vxor(VR23, VR7, VR15);
4787
4788    li(rLoaded, 1);
4789    cmpdi(CCR0, rIdx, 0);
4790    addi(rIdx, rIdx, 128);
4791    bne(CCR0, L_1);
4792  }
4793
4794  // Work out how many bytes we have left
4795  andi_(len, len, 127);
4796
4797  // Calculate where in the constant table we need to start
4798  subfic(rTmp1, len, 128);
4799  add(constantsPos, constantsPos, rTmp1);
4800
4801  // How many 16 byte chunks are in the tail
4802  srdi(rIdx, len, 4);
4803  mtctr(rIdx);
4804
4805  /*
4806   * Reduce the previously calculated 1024 bits to 64 bits, shifting
4807   * 32 bits to include the trailing 32 bits of zeros
4808   */
4809  lvx(VR0, constantsPos);
4810  lvx(VR1, off16, constantsPos);
4811  lvx(VR2, off32, constantsPos);
4812  lvx(VR3, off48, constantsPos);
4813  lvx(VR4, off64, constantsPos);
4814  lvx(VR5, off80, constantsPos);
4815  lvx(VR6, off96, constantsPos);
4816  lvx(VR7, off112, constantsPos);
4817  addi(constantsPos, constantsPos, 8 * 16);
4818
4819  vpmsumw(VR0, VR16, VR0);
4820  vpmsumw(VR1, VR17, VR1);
4821  vpmsumw(VR2, VR18, VR2);
4822  vpmsumw(VR3, VR19, VR3);
4823  vpmsumw(VR4, VR20, VR4);
4824  vpmsumw(VR5, VR21, VR5);
4825  vpmsumw(VR6, VR22, VR6);
4826  vpmsumw(VR7, VR23, VR7);
4827
4828  // Now reduce the tail (0 - 112 bytes)
4829  cmpdi(CCR0, rIdx, 0);
4830  beq(CCR0, L_XOR);
4831
4832  lvx(VR16, buf); addi(buf, buf, 16);
4833  lvx(VR17, constantsPos);
4834  vpmsumw(VR16, VR16, VR17);
4835  vxor(VR0, VR0, VR16);
4836  beq(CCR0, L_XOR);
4837
4838  lvx(VR16, buf); addi(buf, buf, 16);
4839  lvx(VR17, off16, constantsPos);
4840  vpmsumw(VR16, VR16, VR17);
4841  vxor(VR0, VR0, VR16);
4842  beq(CCR0, L_XOR);
4843
4844  lvx(VR16, buf); addi(buf, buf, 16);
4845  lvx(VR17, off32, constantsPos);
4846  vpmsumw(VR16, VR16, VR17);
4847  vxor(VR0, VR0, VR16);
4848  beq(CCR0, L_XOR);
4849
4850  lvx(VR16, buf); addi(buf, buf, 16);
  lvx(VR17, off48, constantsPos);
4852  vpmsumw(VR16, VR16, VR17);
4853  vxor(VR0, VR0, VR16);
4854  beq(CCR0, L_XOR);
4855
4856  lvx(VR16, buf); addi(buf, buf, 16);
4857  lvx(VR17, off64, constantsPos);
4858  vpmsumw(VR16, VR16, VR17);
4859  vxor(VR0, VR0, VR16);
4860  beq(CCR0, L_XOR);
4861
4862  lvx(VR16, buf); addi(buf, buf, 16);
4863  lvx(VR17, off80, constantsPos);
4864  vpmsumw(VR16, VR16, VR17);
4865  vxor(VR0, VR0, VR16);
4866  beq(CCR0, L_XOR);
4867
4868  lvx(VR16, buf); addi(buf, buf, 16);
4869  lvx(VR17, off96, constantsPos);
4870  vpmsumw(VR16, VR16, VR17);
4871  vxor(VR0, VR0, VR16);
4872
4873  // Now xor all the parallel chunks together
4874  BIND(L_XOR);
4875  vxor(VR0, VR0, VR1);
4876  vxor(VR2, VR2, VR3);
4877  vxor(VR4, VR4, VR5);
4878  vxor(VR6, VR6, VR7);
4879
4880  vxor(VR0, VR0, VR2);
4881  vxor(VR4, VR4, VR6);
4882
4883  vxor(VR0, VR0, VR4);
4884
4885  b(L_barrett_reduction);
4886
4887  BIND(L_first_warm_up_done);
4888  lvx(const1, constantsPos);
4889  addi(constantsPos, constantsPos, 16);
4890  vpmsumd(VR8,  VR16, const1);
4891  vpmsumd(VR9,  VR17, const1);
4892  vpmsumd(VR10, VR18, const1);
4893  vpmsumd(VR11, VR19, const1);
4894  vpmsumd(VR12, VR20, const1);
4895  vpmsumd(VR13, VR21, const1);
4896  vpmsumd(VR14, VR22, const1);
4897  vpmsumd(VR15, VR23, const1);
4898  b(L_second_cool_down);
4899
4900  BIND(L_barrett_reduction);
4901
4902  lvx(const1, barretConstants);
4903  addi(barretConstants, barretConstants, 16);
4904  lvx(const2, barretConstants);
4905
4906  vsldoi(VR1, VR0, VR0, -8);
4907  vxor(VR0, VR0, VR1);    // xor two 64 bit results together
4908
4909  // shift left one bit
4910  vspltisb(VR1, 1);
4911  vsl(VR0, VR0, VR1);
4912
4913  vand(VR0, VR0, mask_64bit);
4914
4915  /*
4916   * The reflected version of Barrett reduction. Instead of bit
4917   * reflecting our data (which is expensive to do), we bit reflect our
4918   * constants and our algorithm, which means the intermediate data in
4919   * our vector registers goes from 0-63 instead of 63-0. We can reflect
4920   * the algorithm because we don't carry in mod 2 arithmetic.
4921   */
4922  vand(VR1, VR0, mask_32bit);  // bottom 32 bits of a
4923  vpmsumd(VR1, VR1, const1);   // ma
4924  vand(VR1, VR1, mask_32bit);  // bottom 32bits of ma
  vpmsumd(VR1, VR1, const2);   // qn
4926  vxor(VR0, VR0, VR1);         // a - qn, subtraction is xor in GF(2)
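
  /*
   * In scalar terms, the five instructions above compute roughly (sketch only;
   * clmul stands for a 64x64->128 bit carry-less multiply, m' and p' for the
   * bit-reflected Barrett constant and polynomial held in const1/const2):
   *
   *   t = clmul(a & 0xffffffff, m');   // ma
   *   t = clmul(t & 0xffffffff, p');   // qn
   *   a = a ^ t;                       // a - qn
   */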
4927
4928  /*
4929   * Since we are bit reflected, the result (ie the low 32 bits) is in
4930   * the high 32 bits. We just need to shift it left 4 bytes
4931   * V0 [ 0 1 X 3 ]
4932   * V0 [ 0 X 2 3 ]
4933   */
  vsldoi(VR0, VR0, zeroes, 4);    // Shift the result into the top 64 bits.
4935
  // Move the result into crc.
4937  mfvrd(crc, VR0);
4938
4939  BIND(L_end);
4940
4941  offsetInt = 0;
4942  // Restore non-volatile Vector registers (frameless).
4943  offsetInt -= 16; li(offset, -16);           lvx(VR20, offset, R1_SP);
4944  offsetInt -= 16; addi(offset, offset, -16); lvx(VR21, offset, R1_SP);
4945  offsetInt -= 16; addi(offset, offset, -16); lvx(VR22, offset, R1_SP);
4946  offsetInt -= 16; addi(offset, offset, -16); lvx(VR23, offset, R1_SP);
4947  offsetInt -= 16; addi(offset, offset, -16); lvx(VR24, offset, R1_SP);
4948  offsetInt -= 16; addi(offset, offset, -16); lvx(VR25, offset, R1_SP);
4949  offsetInt -= 16; addi(offset, offset, -16); lvx(VR26, offset, R1_SP);
4950  offsetInt -= 16; addi(offset, offset, -16); lvx(VR27, offset, R1_SP);
4951  offsetInt -= 16; addi(offset, offset, -16); lvx(VR28, offset, R1_SP);
4952  offsetInt -= 8;  ld(R22, offsetInt, R1_SP);
4953  offsetInt -= 8;  ld(R23, offsetInt, R1_SP);
4954  offsetInt -= 8;  ld(R24, offsetInt, R1_SP);
4955  offsetInt -= 8;  ld(R25, offsetInt, R1_SP);
4956  offsetInt -= 8;  ld(R26, offsetInt, R1_SP);
4957  offsetInt -= 8;  ld(R27, offsetInt, R1_SP);
4958  offsetInt -= 8;  ld(R28, offsetInt, R1_SP);
4959  offsetInt -= 8;  ld(R29, offsetInt, R1_SP);
4960  offsetInt -= 8;  ld(R30, offsetInt, R1_SP);
4961  offsetInt -= 8;  ld(R31, offsetInt, R1_SP);
4962}
4963
4964void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp) {
4965  assert_different_registers(crc, buf, /* len,  not used!! */ table, tmp);
4966
4967  BLOCK_COMMENT("kernel_crc32_singleByte:");
4968  nand(crc, crc, crc);       // ~c
4969
4970  lbz(tmp, 0, buf);          // Byte from buffer, zero-extended.
4971  update_byte_crc32(crc, tmp, table);
4972
4973  nand(crc, crc, crc);       // ~c
4974}
4975
4976// dest_lo += src1 + src2
// dest_hi += the carries of the two additions
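//
// Conceptually (using a 128-bit accumulator purely for illustration):
//   unsigned __int128 acc = ((unsigned __int128)dest_hi << 64) | dest_lo;
//   acc += src1;
//   acc += src2;
//   dest_lo = (uint64_t)acc;
//   dest_hi = (uint64_t)(acc >> 64);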
4978void MacroAssembler::add2_with_carry(Register dest_hi,
4979                                     Register dest_lo,
4980                                     Register src1, Register src2) {
4981  li(R0, 0);
4982  addc(dest_lo, dest_lo, src1);
4983  adde(dest_hi, dest_hi, R0);
4984  addc(dest_lo, dest_lo, src2);
4985  adde(dest_hi, dest_hi, R0);
4986}
4987
4988// Multiply 64 bit by 64 bit first loop.
4989void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
4990                                           Register x_xstart,
4991                                           Register y, Register y_idx,
4992                                           Register z,
4993                                           Register carry,
4994                                           Register product_high, Register product,
4995                                           Register idx, Register kdx,
4996                                           Register tmp) {
4997  //  jlong carry, x[], y[], z[];
4998  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4999  //    huge_128 product = y[idx] * x[xstart] + carry;
5000  //    z[kdx] = (jlong)product;
5001  //    carry  = (jlong)(product >>> 64);
5002  //  }
5003  //  z[xstart] = carry;
5004
5005  Label L_first_loop, L_first_loop_exit;
5006  Label L_one_x, L_one_y, L_multiply;
5007
5008  addic_(xstart, xstart, -1);
5009  blt(CCR0, L_one_x);   // Special case: length of x is 1.
5010
5011  // Load next two integers of x.
5012  sldi(tmp, xstart, LogBytesPerInt);
5013  ldx(x_xstart, x, tmp);
5014#ifdef VM_LITTLE_ENDIAN
5015  rldicl(x_xstart, x_xstart, 32, 0);
5016#endif
5017
5018  align(32, 16);
5019  bind(L_first_loop);
5020
5021  cmpdi(CCR0, idx, 1);
5022  blt(CCR0, L_first_loop_exit);
5023  addi(idx, idx, -2);
5024  beq(CCR0, L_one_y);
5025
5026  // Load next two integers of y.
5027  sldi(tmp, idx, LogBytesPerInt);
5028  ldx(y_idx, y, tmp);
5029#ifdef VM_LITTLE_ENDIAN
5030  rldicl(y_idx, y_idx, 32, 0);
5031#endif
5032
5033
5034  bind(L_multiply);
5035  multiply64(product_high, product, x_xstart, y_idx);
5036
5037  li(tmp, 0);
5038  addc(product, product, carry);         // Add carry to result.
5039  adde(product_high, product_high, tmp); // Add carry of the last addition.
5040  addi(kdx, kdx, -2);
5041
5042  // Store result.
5043#ifdef VM_LITTLE_ENDIAN
5044  rldicl(product, product, 32, 0);
5045#endif
5046  sldi(tmp, kdx, LogBytesPerInt);
5047  stdx(product, z, tmp);
5048  mr_if_needed(carry, product_high);
5049  b(L_first_loop);
5050
5051
5052  bind(L_one_y); // Load one 32 bit portion of y as (0,value).
5053
5054  lwz(y_idx, 0, y);
5055  b(L_multiply);
5056
5057
5058  bind(L_one_x); // Load one 32 bit portion of x as (0,value).
5059
5060  lwz(x_xstart, 0, x);
5061  b(L_first_loop);
5062
5063  bind(L_first_loop_exit);
5064}
5065
5066// Multiply 64 bit by 64 bit and add 128 bit.
5067void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
5068                                            Register z, Register yz_idx,
5069                                            Register idx, Register carry,
5070                                            Register product_high, Register product,
5071                                            Register tmp, int offset) {
5072
5073  //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
5074  //  z[kdx] = (jlong)product;
5075
5076  sldi(tmp, idx, LogBytesPerInt);
5077  if (offset) {
5078    addi(tmp, tmp, offset);
5079  }
5080  ldx(yz_idx, y, tmp);
5081#ifdef VM_LITTLE_ENDIAN
5082  rldicl(yz_idx, yz_idx, 32, 0);
5083#endif
5084
5085  multiply64(product_high, product, x_xstart, yz_idx);
5086  ldx(yz_idx, z, tmp);
5087#ifdef VM_LITTLE_ENDIAN
5088  rldicl(yz_idx, yz_idx, 32, 0);
5089#endif
5090
5091  add2_with_carry(product_high, product, carry, yz_idx);
5092
5093  sldi(tmp, idx, LogBytesPerInt);
5094  if (offset) {
5095    addi(tmp, tmp, offset);
5096  }
5097#ifdef VM_LITTLE_ENDIAN
5098  rldicl(product, product, 32, 0);
5099#endif
5100  stdx(product, z, tmp);
5101}
5102
5103// Multiply 128 bit by 128 bit. Unrolled inner loop.
5104void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
5105                                             Register y, Register z,
5106                                             Register yz_idx, Register idx, Register carry,
5107                                             Register product_high, Register product,
5108                                             Register carry2, Register tmp) {
5109
5110  //  jlong carry, x[], y[], z[];
5111  //  int kdx = ystart+1;
5112  //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
5113  //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
5114  //    z[kdx+idx+1] = (jlong)product;
5115  //    jlong carry2 = (jlong)(product >>> 64);
5116  //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
5117  //    z[kdx+idx] = (jlong)product;
5118  //    carry = (jlong)(product >>> 64);
5119  //  }
5120  //  idx += 2;
5121  //  if (idx > 0) {
5122  //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
5123  //    z[kdx+idx] = (jlong)product;
5124  //    carry = (jlong)(product >>> 64);
5125  //  }
5126
5127  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
5128  const Register jdx = R0;
5129
5130  // Scale the index.
5131  srdi_(jdx, idx, 2);
5132  beq(CCR0, L_third_loop_exit);
5133  mtctr(jdx);
5134
5135  align(32, 16);
5136  bind(L_third_loop);
5137
5138  addi(idx, idx, -4);
5139
5140  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
5141  mr_if_needed(carry2, product_high);
5142
5143  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
5144  mr_if_needed(carry, product_high);
5145  bdnz(L_third_loop);
5146
5147  bind(L_third_loop_exit);  // Handle any left-over operand parts.
5148
5149  andi_(idx, idx, 0x3);
5150  beq(CCR0, L_post_third_loop_done);
5151
5152  Label L_check_1;
5153
5154  addic_(idx, idx, -2);
5155  blt(CCR0, L_check_1);
5156
5157  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
5158  mr_if_needed(carry, product_high);
5159
5160  bind(L_check_1);
5161
5162  addi(idx, idx, 0x2);
5163  andi_(idx, idx, 0x1);
5164  addic_(idx, idx, -1);
5165  blt(CCR0, L_post_third_loop_done);
5166
5167  sldi(tmp, idx, LogBytesPerInt);
5168  lwzx(yz_idx, y, tmp);
5169  multiply64(product_high, product, x_xstart, yz_idx);
5170  lwzx(yz_idx, z, tmp);
5171
5172  add2_with_carry(product_high, product, yz_idx, carry);
5173
5174  sldi(tmp, idx, LogBytesPerInt);
5175  stwx(product, z, tmp);
5176  srdi(product, product, 32);
5177
5178  sldi(product_high, product_high, 32);
5179  orr(product, product, product_high);
5180  mr_if_needed(carry, product);
5181
5182  bind(L_post_third_loop_done);
5183}   // multiply_128_x_128_loop
5184
5185void MacroAssembler::multiply_to_len(Register x, Register xlen,
5186                                     Register y, Register ylen,
5187                                     Register z, Register zlen,
5188                                     Register tmp1, Register tmp2,
5189                                     Register tmp3, Register tmp4,
5190                                     Register tmp5, Register tmp6,
5191                                     Register tmp7, Register tmp8,
5192                                     Register tmp9, Register tmp10,
5193                                     Register tmp11, Register tmp12,
5194                                     Register tmp13) {
5195
5196  ShortBranchVerifier sbv(this);
5197
5198  assert_different_registers(x, xlen, y, ylen, z, zlen,
5199                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
5200  assert_different_registers(x, xlen, y, ylen, z, zlen,
5201                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
5202  assert_different_registers(x, xlen, y, ylen, z, zlen,
5203                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
5204
5205  const Register idx = tmp1;
5206  const Register kdx = tmp2;
5207  const Register xstart = tmp3;
5208
5209  const Register y_idx = tmp4;
5210  const Register carry = tmp5;
5211  const Register product = tmp6;
5212  const Register product_high = tmp7;
5213  const Register x_xstart = tmp8;
5214  const Register tmp = tmp9;
5215
5216  // First Loop.
5217  //
5218  //  final static long LONG_MASK = 0xffffffffL;
5219  //  int xstart = xlen - 1;
5220  //  int ystart = ylen - 1;
5221  //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
5223  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
5224  //    z[kdx] = (int)product;
5225  //    carry = product >>> 32;
5226  //  }
5227  //  z[xstart] = (int)carry;
5228
5229  mr_if_needed(idx, ylen);        // idx = ylen
5230  mr_if_needed(kdx, zlen);        // kdx = xlen + ylen
5231  li(carry, 0);                   // carry = 0
5232
5233  Label L_done;
5234
5235  addic_(xstart, xlen, -1);
5236  blt(CCR0, L_done);
5237
5238  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
5239                        carry, product_high, product, idx, kdx, tmp);
5240
5241  Label L_second_loop;
5242
5243  cmpdi(CCR0, kdx, 0);
5244  beq(CCR0, L_second_loop);
5245
5246  Label L_carry;
5247
5248  addic_(kdx, kdx, -1);
5249  beq(CCR0, L_carry);
5250
5251  // Store lower 32 bits of carry.
5252  sldi(tmp, kdx, LogBytesPerInt);
5253  stwx(carry, z, tmp);
5254  srdi(carry, carry, 32);
5255  addi(kdx, kdx, -1);
5256
5257
5258  bind(L_carry);
5259
5260  // Store upper 32 bits of carry.
5261  sldi(tmp, kdx, LogBytesPerInt);
5262  stwx(carry, z, tmp);
5263
5264  // Second and third (nested) loops.
5265  //
5266  //  for (int i = xstart-1; i >= 0; i--) { // Second loop
5267  //    carry = 0;
5268  //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
5269  //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
5270  //                     (z[k] & LONG_MASK) + carry;
5271  //      z[k] = (int)product;
5272  //      carry = product >>> 32;
5273  //    }
5274  //    z[i] = (int)carry;
5275  //  }
5276  //
  //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = x_xstart
5278
5279  bind(L_second_loop);
5280
5281  li(carry, 0);                   // carry = 0;
5282
5283  addic_(xstart, xstart, -1);     // i = xstart-1;
5284  blt(CCR0, L_done);
5285
5286  Register zsave = tmp10;
5287
5288  mr(zsave, z);
5289
5290
5291  Label L_last_x;
5292
5293  sldi(tmp, xstart, LogBytesPerInt);
5294  add(z, z, tmp);                 // z = z + k - j
5295  addi(z, z, 4);
5296  addic_(xstart, xstart, -1);     // i = xstart-1;
5297  blt(CCR0, L_last_x);
5298
5299  sldi(tmp, xstart, LogBytesPerInt);
5300  ldx(x_xstart, x, tmp);
5301#ifdef VM_LITTLE_ENDIAN
5302  rldicl(x_xstart, x_xstart, 32, 0);
5303#endif
5304
5305
5306  Label L_third_loop_prologue;
5307
5308  bind(L_third_loop_prologue);
5309
5310  Register xsave = tmp11;
5311  Register xlensave = tmp12;
5312  Register ylensave = tmp13;
5313
5314  mr(xsave, x);
5315  mr(xlensave, xstart);
5316  mr(ylensave, ylen);
5317
5318
5319  multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
5320                          carry, product_high, product, x, tmp);
5321
5322  mr(z, zsave);
5323  mr(x, xsave);
5324  mr(xlen, xlensave);   // This is the decrement of the loop counter!
5325  mr(ylen, ylensave);
5326
5327  addi(tmp3, xlen, 1);
5328  sldi(tmp, tmp3, LogBytesPerInt);
5329  stwx(carry, z, tmp);
5330  addic_(tmp3, tmp3, -1);
5331  blt(CCR0, L_done);
5332
5333  srdi(carry, carry, 32);
5334  sldi(tmp, tmp3, LogBytesPerInt);
5335  stwx(carry, z, tmp);
5336  b(L_second_loop);
5337
  // The following infrequent code has been moved out of the loops.
5339  bind(L_last_x);
5340
5341  lwz(x_xstart, 0, x);
5342  b(L_third_loop_prologue);
5343
5344  bind(L_done);
5345}   // multiply_to_len
5346
5347void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {
5348#ifdef ASSERT
5349  Label ok;
5350  if (check_equal) {
5351    beq(CCR0, ok);
5352  } else {
5353    bne(CCR0, ok);
5354  }
5355  stop(msg, id);
5356  bind(ok);
5357#endif
5358}
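
// Typical use (illustrative; the register, message and id are made up):
//   cmpdi(CCR0, Rlen, 0);
//   asm_assert(false /*check_equal*/, "len must not be zero", 0x123);
// With check_equal == false this stops when the compare found equality,
// i.e. it asserts Rlen != 0.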
5359
5360void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
5361                                          Register mem_base, const char* msg, int id) {
5362#ifdef ASSERT
5363  switch (size) {
5364    case 4:
5365      lwz(R0, mem_offset, mem_base);
5366      cmpwi(CCR0, R0, 0);
5367      break;
5368    case 8:
5369      ld(R0, mem_offset, mem_base);
5370      cmpdi(CCR0, R0, 0);
5371      break;
5372    default:
5373      ShouldNotReachHere();
5374  }
5375  asm_assert(check_equal, msg, id);
5376#endif // ASSERT
5377}
5378
5379void MacroAssembler::verify_thread() {
5380  if (VerifyThread) {
5381    unimplemented("'VerifyThread' currently not implemented on PPC");
5382  }
5383}
5384
// Reads: oop. Kills: R0 and possibly volatile floating point registers.
5386void MacroAssembler::verify_oop(Register oop, const char* msg) {
5387  if (!VerifyOops) {
5388    return;
5389  }
5390
5391  address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
5392  const Register tmp = R11; // Will be preserved.
5393  const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
5394  save_volatile_gprs(R1_SP, -nbytes_save); // except R0
5395
5396  mr_if_needed(R4_ARG2, oop);
5397  save_LR_CR(tmp); // save in old frame
5398  push_frame_reg_args(nbytes_save, tmp);
5399  // load FunctionDescriptor** / entry_address *
5400  load_const_optimized(tmp, fd, R0);
5401  // load FunctionDescriptor* / entry_address
5402  ld(tmp, 0, tmp);
5403  load_const_optimized(R3_ARG1, (address)msg, R0);
5404  // Call destination for its side effect.
5405  call_c(tmp);
5406
5407  pop_frame();
5408  restore_LR_CR(tmp);
5409  restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
5410}
5411
5412void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
5413  if (!VerifyOops) {
5414    return;
5415  }
5416
5417  address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
5418  const Register tmp = R11; // Will be preserved.
5419  const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
5420  save_volatile_gprs(R1_SP, -nbytes_save); // except R0
5421
5422  ld(R4_ARG2, offs, base);
5423  save_LR_CR(tmp); // save in old frame
5424  push_frame_reg_args(nbytes_save, tmp);
5425  // load FunctionDescriptor** / entry_address *
5426  load_const_optimized(tmp, fd, R0);
5427  // load FunctionDescriptor* / entry_address
5428  ld(tmp, 0, tmp);
5429  load_const_optimized(R3_ARG1, (address)msg, R0);
5430  // Call destination for its side effect.
5431  call_c(tmp);
5432
5433  pop_frame();
5434  restore_LR_CR(tmp);
5435  restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
5436}
5437
5438const char* stop_types[] = {
5439  "stop",
5440  "untested",
5441  "unimplemented",
5442  "shouldnotreachhere"
5443};
5444
5445static void stop_on_request(int tp, const char* msg) {
5446  tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg);
5447  guarantee(false, "PPC assembly code requires stop: %s", msg);
5448}
5449
5450// Call a C-function that prints output.
5451void MacroAssembler::stop(int type, const char* msg, int id) {
5452#ifndef PRODUCT
5453  block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg));
5454#else
5455  block_comment("stop {");
5456#endif
5457
5458  // setup arguments
5459  load_const_optimized(R3_ARG1, type);
5460  load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0);
5461  call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2);
5462  illtrap();
5463  emit_int32(id);
5464  block_comment("} stop;");
5465}
5466
5467#ifndef PRODUCT
5468// Write pattern 0x0101010101010101 in memory region [low-before, high+after].
5469// Val, addr are temp registers.
5470// If low == addr, addr is killed.
5471// High is preserved.
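// In the general case the emitted loop is roughly equivalent to the following
// sketch, where low and high stand for the addresses held in those registers:
//   jlong* p   = (jlong*)low  - before;
//   jlong* end = (jlong*)high + after;
//   do { *p = 0x0101010101010101LL; } while (++p <= end);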
5472void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
5473  if (!ZapMemory) return;
5474
5475  assert_different_registers(low, val);
5476
5477  BLOCK_COMMENT("zap memory region {");
5478  load_const_optimized(val, 0x0101010101010101);
5479  int size = before + after;
5480  if (low == high && size < 5 && size > 0) {
5481    int offset = -before*BytesPerWord;
5482    for (int i = 0; i < size; ++i) {
5483      std(val, offset, low);
5484      offset += (1*BytesPerWord);
5485    }
5486  } else {
5487    addi(addr, low, -before*BytesPerWord);
5488    assert_different_registers(high, val);
5489    if (after) addi(high, high, after * BytesPerWord);
5490    Label loop;
5491    bind(loop);
5492    std(val, 0, addr);
5493    addi(addr, addr, 8);
5494    cmpd(CCR6, addr, high);
5495    ble(CCR6, loop);
5496    if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
5497  }
5498  BLOCK_COMMENT("} zap memory region");
5499}
5500
5501#endif // !PRODUCT
5502
5503SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
5504  int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
5505  assert(sizeof(bool) == 1, "PowerPC ABI");
5506  masm->lbz(temp, simm16_offset, temp);
5507  masm->cmpwi(CCR0, temp, 0);
5508  masm->beq(CCR0, _label);
5509}
5510
5511SkipIfEqualZero::~SkipIfEqualZero() {
5512  _masm->bind(_label);
5513}
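
// Typical use (illustrative; the flag name is made up):
//   {
//     SkipIfEqualZero skip_if(this, Rtmp, &SomeBoolFlag);
//     // Code emitted here is executed only if SomeBoolFlag is true at run time.
//   } // The destructor binds the skip label here.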
5514