macroAssembler_ppc.cpp revision 9149:a8a8604f890f
1/*
2 * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
3 * Copyright 2012, 2015 SAP AG. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26#include "precompiled.hpp"
27#include "asm/macroAssembler.inline.hpp"
28#include "compiler/disassembler.hpp"
29#include "gc/shared/cardTableModRefBS.hpp"
30#include "gc/shared/collectedHeap.inline.hpp"
31#include "interpreter/interpreter.hpp"
32#include "memory/resourceArea.hpp"
33#include "prims/methodHandles.hpp"
34#include "runtime/biasedLocking.hpp"
35#include "runtime/icache.hpp"
36#include "runtime/interfaceSupport.hpp"
37#include "runtime/objectMonitor.hpp"
38#include "runtime/os.hpp"
39#include "runtime/sharedRuntime.hpp"
40#include "runtime/stubRoutines.hpp"
41#include "utilities/macros.hpp"
42#if INCLUDE_ALL_GCS
43#include "gc/g1/g1CollectedHeap.inline.hpp"
44#include "gc/g1/g1SATBCardTableModRefBS.hpp"
45#include "gc/g1/heapRegion.hpp"
46#endif // INCLUDE_ALL_GCS
47
48#ifdef PRODUCT
49#define BLOCK_COMMENT(str) // nothing
50#else
51#define BLOCK_COMMENT(str) block_comment(str)
52#endif
53#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
54
55#ifdef ASSERT
56// On RISC, there's no benefit to verifying instruction boundaries.
57bool AbstractAssembler::pd_check_instruction_mark() { return false; }
58#endif
59
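// Load doubleword from a base register plus a non-negative 31-bit offset.
// If the offset does not fit into 16 bits it is split via
// largeoffset_si16_si16_hi/_lo. Illustrative expansion (not taken from this
// file): an offset of 0x12340 becomes roughly
//   addis d, a, 0x1
//   ld    d, 0x2340(d)
// while a small offset collapses to a single ld plus an optional filler nop.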
60void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
61  assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
62  if (Assembler::is_simm(si31, 16)) {
63    ld(d, si31, a);
64    if (emit_filler_nop) nop();
65  } else {
66    const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
67    const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
68    addis(d, a, hi);
69    ld(d, lo, d);
70  }
71}
72
73void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
74  assert_different_registers(d, a);
75  ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
76}
77
78void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
79                                      size_t size_in_bytes, bool is_signed) {
80  switch (size_in_bytes) {
81  case  8:              ld(dst, offs, base);                         break;
82  case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
83  case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
84  case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
85  default:  ShouldNotReachHere();
86  }
87}
88
89void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
90                                       size_t size_in_bytes) {
91  switch (size_in_bytes) {
92  case  8:  std(dst, offs, base); break;
93  case  4:  stw(dst, offs, base); break;
94  case  2:  sth(dst, offs, base); break;
95  case  1:  stb(dst, offs, base); break;
96  default:  ShouldNotReachHere();
97  }
98}
99
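// Pad with nops until (offset() % modulus) == rem; emits nothing if the
// required padding would exceed `max' bytes.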
100void MacroAssembler::align(int modulus, int max, int rem) {
101  int padding = (rem + modulus - (offset() % modulus)) % modulus;
102  if (padding > max) return;
103  for (int c = (padding >> 2); c > 0; --c) { nop(); }
104}
105
106// Issue instructions that calculate the given address from the global TOC.
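// Sketch of the code emitted when both halves are requested and no
// relocation is added (R29 is the global TOC register):
//   addis dst, R29, offset_hi16
//   addi  dst, dst, offset_lo16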
107void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
108                                                       bool add_relocation, bool emit_dummy_addr) {
109  int offset = -1;
110  if (emit_dummy_addr) {
111    offset = -128; // dummy address
112  } else if (addr != (address)(intptr_t)-1) {
113    offset = MacroAssembler::offset_to_global_toc(addr);
114  }
115
116  if (hi16) {
117    addis(dst, R29, MacroAssembler::largeoffset_si16_si16_hi(offset));
118  }
119  if (lo16) {
120    if (add_relocation) {
121      // Relocate at the addi to avoid confusion with a load from the method's TOC.
122      relocate(internal_word_Relocation::spec(addr));
123    }
124    addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
125  }
126}
127
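// Patch both 16-bit immediates of such an addis/addi pair so that it
// computes `addr'. `a' points at the addi; the matching addis that writes
// the same register is searched backwards, but not beyond `bound'.
// Returns the byte distance from the patched addis to `addr'.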
128int MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
129  const int offset = MacroAssembler::offset_to_global_toc(addr);
130
131  const address inst2_addr = a;
132  const int inst2 = *(int *)inst2_addr;
133
134  // The relocation points to the second instruction, the addi,
135  // and the addi reads and writes the same register dst.
136  const int dst = inv_rt_field(inst2);
137  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
138
139  // Now, find the preceding addis which writes to dst.
140  int inst1 = 0;
141  address inst1_addr = inst2_addr - BytesPerInstWord;
142  while (inst1_addr >= bound) {
143    inst1 = *(int *) inst1_addr;
144    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
145      // Stop, found the addis which writes dst.
146      break;
147    }
148    inst1_addr -= BytesPerInstWord;
149  }
150
151  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
152  set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
153  set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
154  return (int)((intptr_t)addr - (intptr_t)inst1_addr);
155}
156
157address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
158  const address inst2_addr = a;
159  const int inst2 = *(int *)inst2_addr;
160
161  // The relocation points to the second instruction, the addi,
162  // and the addi reads and writes the same register dst.
163  const int dst = inv_rt_field(inst2);
164  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
165
166  // Now, find the preceding addis which writes to dst.
167  int inst1 = 0;
168  address inst1_addr = inst2_addr - BytesPerInstWord;
169  while (inst1_addr >= bound) {
170    inst1 = *(int *) inst1_addr;
171    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
172      // stop, found the addis which writes dst
173      break;
174    }
175    inst1_addr -= BytesPerInstWord;
176  }
177
178  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
179
180  int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
181  // -1 is a special case
182  if (offset == -1) {
183    return (address)(intptr_t)-1;
184  } else {
185    return global_toc() + offset;
186  }
187}
188
189#ifdef _LP64
190// Patch compressed oops or klass constants.
191// Assembler sequence is
192// 1) compressed oops:
193//    lis  rx = const.hi
194//    ori rx = rx | const.lo
195// 2) compressed klass:
196//    lis  rx = const.hi
197//    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
198//    ori rx = rx | const.lo
199// The optional clrldi is simply skipped over by the code below.
200int MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
201  assert(UseCompressedOops, "Should only patch compressed oops");
202
203  const address inst2_addr = a;
204  const int inst2 = *(int *)inst2_addr;
205
206  // The relocation points to the second instruction, the ori,
207  // and the ori reads and writes the same register dst.
208  const int dst = inv_rta_field(inst2);
209  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
210  // Now, find the preceding addis which writes to dst.
211  int inst1 = 0;
212  address inst1_addr = inst2_addr - BytesPerInstWord;
213  bool inst1_found = false;
214  while (inst1_addr >= bound) {
215    inst1 = *(int *)inst1_addr;
216    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
217    inst1_addr -= BytesPerInstWord;
218  }
219  assert(inst1_found, "inst is not lis");
220
221  int xc = (data >> 16) & 0xffff;
222  int xd = (data >>  0) & 0xffff;
223
224  set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
225  set_imm((int *)inst2_addr,        (xd)); // unsigned int
226  return (int)((intptr_t)inst2_addr - (intptr_t)inst1_addr);
227}
228
229// Get compressed oop or klass constant.
230narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
231  assert(UseCompressedOops, "Should only patch compressed oops");
232
233  const address inst2_addr = a;
234  const int inst2 = *(int *)inst2_addr;
235
236  // The relocation points to the second instruction, the ori,
237  // and the ori reads and writes the same register dst.
238  const int dst = inv_rta_field(inst2);
239  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
240  // Now, find the preceding lis which writes to dst.
241  int inst1 = 0;
242  address inst1_addr = inst2_addr - BytesPerInstWord;
243  bool inst1_found = false;
244
245  while (inst1_addr >= bound) {
246    inst1 = *(int *) inst1_addr;
247    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
248    inst1_addr -= BytesPerInstWord;
249  }
250  assert(inst1_found, "inst is not lis");
251
252  uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
253  uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);
254
255  return (int) (xl | xh);
256}
257#endif // _LP64
258
259void MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a, Register toc) {
260  int toc_offset = 0;
261  // Use RelocationHolder::none for the constant pool entry, otherwise
262  // we will end up with a failing NativeCall::verify(x) where x is
263  // the address of the constant pool entry.
264  // FIXME: We should insert relocation information for oops at the constant
265  // pool entries instead of inserting it at the loads; patching of a constant
266  // pool entry should be less expensive.
267  address oop_address = address_constant((address)a.value(), RelocationHolder::none);
268  // Relocate at the pc of the load.
269  relocate(a.rspec());
270  toc_offset = (int)(oop_address - code()->consts()->start());
271  ld_largeoffset_unchecked(dst, toc_offset, toc, true);
272}
273
274bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
275  const address inst1_addr = a;
276  const int inst1 = *(int *)inst1_addr;
277
278  // The relocation points to the ld or the addis.
279  return (is_ld(inst1)) ||
280         (is_addis(inst1) && inv_ra_field(inst1) != 0);
281}
282
283int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
284  assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");
285
286  const address inst1_addr = a;
287  const int inst1 = *(int *)inst1_addr;
288
289  if (is_ld(inst1)) {
290    return inv_d1_field(inst1);
291  } else if (is_addis(inst1)) {
292    const int dst = inv_rt_field(inst1);
293
294    // Now, find the succeeding ld which reads and writes to dst.
295    address inst2_addr = inst1_addr + BytesPerInstWord;
296    int inst2 = 0;
297    while (true) {
298      inst2 = *(int *) inst2_addr;
299      if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
300        // Stop, found the ld which reads and writes dst.
301        break;
302      }
303      inst2_addr += BytesPerInstWord;
304    }
305    return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
306  }
307  ShouldNotReachHere();
308  return 0;
309}
310
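// A `load_const' sequence consists of five instructions; the 64-bit value
// is spread over four 16-bit immediate fields. Which slots carry which
// 16-bit chunk depends on whether the sequence was emitted with a second
// (temp) register, recognizable by its second instruction being a lis
// rather than an ori; get_const() and patch_const() handle both layouts.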
311// Get the constant from a `load_const' sequence.
312long MacroAssembler::get_const(address a) {
313  assert(is_load_const_at(a), "not a load of a constant");
314  const int *p = (const int*) a;
315  unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
316  if (is_ori(*(p+1))) {
317    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
318    x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
319    x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
320  } else if (is_lis(*(p+1))) {
321    x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
322    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
323    x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
324  } else {
325    ShouldNotReachHere();
326    return (long) 0;
327  }
328  return (long) x;
329}
330
331// Patch the 64-bit constant of a `load_const' sequence. This is a
332// low-level procedure. It neither flushes the instruction cache nor
333// is it MT-safe.
334void MacroAssembler::patch_const(address a, long x) {
335  assert(is_load_const_at(a), "not a load of a constant");
336  int *p = (int*) a;
337  if (is_ori(*(p+1))) {
338    set_imm(0 + p, (x >> 48) & 0xffff);
339    set_imm(1 + p, (x >> 32) & 0xffff);
340    set_imm(3 + p, (x >> 16) & 0xffff);
341    set_imm(4 + p, x & 0xffff);
342  } else if (is_lis(*(p+1))) {
343    set_imm(0 + p, (x >> 48) & 0xffff);
344    set_imm(2 + p, (x >> 32) & 0xffff);
345    set_imm(1 + p, (x >> 16) & 0xffff);
346    set_imm(3 + p, x & 0xffff);
347  } else {
348    ShouldNotReachHere();
349  }
350}
351
352AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
353  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
354  int index = oop_recorder()->allocate_metadata_index(obj);
355  RelocationHolder rspec = metadata_Relocation::spec(index);
356  return AddressLiteral((address)obj, rspec);
357}
358
359AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
360  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
361  int index = oop_recorder()->find_index(obj);
362  RelocationHolder rspec = metadata_Relocation::spec(index);
363  return AddressLiteral((address)obj, rspec);
364}
365
366AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
367  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
368  int oop_index = oop_recorder()->allocate_oop_index(obj);
369  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
370}
371
372AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
373  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
374  int oop_index = oop_recorder()->find_index(obj);
375  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
376}
377
378RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
379                                                      Register tmp, int offset) {
380  intptr_t value = *delayed_value_addr;
381  if (value != 0) {
382    return RegisterOrConstant(value + offset);
383  }
384
385  // Load indirectly to solve generation ordering problem.
386  // static address, no relocation
387  int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);
388  ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)
389
390  if (offset != 0) {
391    addi(tmp, tmp, offset);
392  }
393
394  return RegisterOrConstant(tmp);
395}
396
397#ifndef PRODUCT
398void MacroAssembler::pd_print_patched_instruction(address branch) {
399  Unimplemented(); // TODO: PPC port
400}
401#endif // ndef PRODUCT
402
403// Conditional far branch for destinations encodable in 24+2 bits.
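// A bc_far always occupies two instruction words. The patching code below
// distinguishes three shapes (sketch):
//   variant 1:  bcxx  DEST ; endgroup        // destination in bcxx range
//   variant 2:  b!cxx SKIP ; bxx DEST        // far destination
//   variant 3:  nop        ; endgroup        // branch to next instruction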
404void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {
405
406  // If requested by flag optimize, relocate the bc_far as a
407  // runtime_call and prepare for optimizing it when the code gets
408  // relocated.
409  if (optimize == bc_far_optimize_on_relocate) {
410    relocate(relocInfo::runtime_call_type);
411  }
412
413  // variant 2:
414  //
415  //    b!cxx SKIP
416  //    bxx   DEST
417  //  SKIP:
418  //
419
420  const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
421                                                opposite_bcond(inv_boint_bcond(boint)));
422
423  // We emit two branches.
424  // First, a conditional branch which jumps around the far branch.
425  const address not_taken_pc = pc() + 2 * BytesPerInstWord;
426  const address bc_pc        = pc();
427  bc(opposite_boint, biint, not_taken_pc);
428
429  const int bc_instr = *(int*)bc_pc;
430  assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
431  assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
432  assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
433                                     opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
434         "postcondition");
435  assert(biint == inv_bi_field(bc_instr), "postcondition");
436
437  // Second, an unconditional far branch which jumps to dest.
438  // Note: target(dest) remembers the current pc (see CodeSection::target)
439  //       and returns the current pc if the label is not bound yet; when
440  //       the label gets bound, the unconditional far branch will be patched.
441  const address target_pc = target(dest);
442  const address b_pc  = pc();
443  b(target_pc);
444
445  assert(not_taken_pc == pc(),                     "postcondition");
446  assert(dest.is_bound() || target_pc == b_pc, "postcondition");
447}
448
449bool MacroAssembler::is_bc_far_at(address instruction_addr) {
450  return is_bc_far_variant1_at(instruction_addr) ||
451         is_bc_far_variant2_at(instruction_addr) ||
452         is_bc_far_variant3_at(instruction_addr);
453}
454
455address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
456  if (is_bc_far_variant1_at(instruction_addr)) {
457    const address instruction_1_addr = instruction_addr;
458    const int instruction_1 = *(int*)instruction_1_addr;
459    return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
460  } else if (is_bc_far_variant2_at(instruction_addr)) {
461    const address instruction_2_addr = instruction_addr + 4;
462    return bxx_destination(instruction_2_addr);
463  } else if (is_bc_far_variant3_at(instruction_addr)) {
464    return instruction_addr + 8;
465  }
466  // variant 4 ???
467  ShouldNotReachHere();
468  return NULL;
469}
470void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {
471
472  if (is_bc_far_variant3_at(instruction_addr)) {
473    // variant 3, far cond branch to the next instruction, already patched to nops:
474    //
475    //    nop
476    //    endgroup
477    //  SKIP/DEST:
478    //
479    return;
480  }
481
482  // first, extract boint and biint from the current branch
483  int boint = 0;
484  int biint = 0;
485
486  ResourceMark rm;
487  const int code_size = 2 * BytesPerInstWord;
488  CodeBuffer buf(instruction_addr, code_size);
489  MacroAssembler masm(&buf);
490  if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
491    // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
492    masm.nop();
493    masm.endgroup();
494  } else {
495    if (is_bc_far_variant1_at(instruction_addr)) {
496      // variant 1, the 1st instruction contains the destination address:
497      //
498      //    bcxx  DEST
499      //    endgroup
500      //
501      const int instruction_1 = *(int*)(instruction_addr);
502      boint = inv_bo_field(instruction_1);
503      biint = inv_bi_field(instruction_1);
504    } else if (is_bc_far_variant2_at(instruction_addr)) {
505      // variant 2, the 2nd instruction contains the destination address:
506      //
507      //    b!cxx SKIP
508      //    bxx   DEST
509      //  SKIP:
510      //
511      const int instruction_1 = *(int*)(instruction_addr);
512      boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
513          opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
514      biint = inv_bi_field(instruction_1);
515    } else {
516      // variant 4???
517      ShouldNotReachHere();
518    }
519
520    // second, set the new branch destination and optimize the code
521    if (dest != instruction_addr + 4 && // the bc_far is still unbound!
522        masm.is_within_range_of_bcxx(dest, instruction_addr)) {
523      // variant 1:
524      //
525      //    bcxx  DEST
526      //    endgroup
527      //
528      masm.bc(boint, biint, dest);
529      masm.endgroup();
530    } else {
531      // variant 2:
532      //
533      //    b!cxx SKIP
534      //    bxx   DEST
535      //  SKIP:
536      //
537      const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
538                                                    opposite_bcond(inv_boint_bcond(boint)));
539      const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
540      masm.bc(opposite_boint, biint, not_taken_pc);
541      masm.b(dest);
542    }
543  }
544  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
545}
546
547// Emit a patchable 64-bit absolute call/jump. Patching is NOT MT-safe.
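// Each call/jump site is bxx64_patchable_size bytes (7 instructions), so
// that one form can later be patched over the other. Sketch:
//   variant 2 (pc-relative):   nop x6 ; bl DEST      (or: b DEST ; nop x6)
//   variant 1b (toc-relative): mr R0,R11 ; addis/addi R11,<global toc offset> ;
//                              mtctr R11 ; mr R11,R0 ; nop ; bctrl (or bctr)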
548void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
549  // get current pc
550  uint64_t start_pc = (uint64_t) pc();
551
552  const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
553  const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first
554
555  // relocate here
556  if (rt != relocInfo::none) {
557    relocate(rt);
558  }
559
560  if ( ReoptimizeCallSequences &&
561       (( link && is_within_range_of_b(dest, pc_of_bl)) ||
562        (!link && is_within_range_of_b(dest, pc_of_b)))) {
563    // variant 2:
564    // Emit an optimized, pc-relative call/jump.
565
566    if (link) {
567      // some padding
568      nop();
569      nop();
570      nop();
571      nop();
572      nop();
573      nop();
574
575      // do the call
576      assert(pc() == pc_of_bl, "just checking");
577      bl(dest, relocInfo::none);
578    } else {
579      // do the jump
580      assert(pc() == pc_of_b, "just checking");
581      b(dest, relocInfo::none);
582
583      // some padding
584      nop();
585      nop();
586      nop();
587      nop();
588      nop();
589      nop();
590    }
591
592    // Assert that we can identify the emitted call/jump.
593    assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
594           "can't identify emitted call");
595  } else {
596    // variant 1:
597    mr(R0, R11);  // spill R11 -> R0.
598
599    // Load the destination address into CTR,
600    // calculate destination relative to global toc.
601    calculate_address_from_global_toc(R11, dest, true, true, false);
602
603    mtctr(R11);
604    mr(R11, R0);  // restore R11 <- R0.
605    nop();
606
607    // do the call/jump
608    if (link) {
609      bctrl();
610    } else {
611      bctr();
612    }
613    // Assert that we can identify the emitted call/jump.
614    assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
615           "can't identify emitted call");
616  }
617
618  // Assert that we can identify the emitted call/jump.
619  assert(is_bxx64_patchable_at((address)start_pc, link),
620         "can't identify emitted call");
621  assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
622         "wrong encoding of dest address");
623}
624
625// Identify a bxx64_patchable instruction.
626bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
627  return is_bxx64_patchable_variant1b_at(instruction_addr, link)
628    //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
629      || is_bxx64_patchable_variant2_at(instruction_addr, link);
630}
631
632// Does the call64_patchable instruction use a pc-relative encoding of
633// the call destination?
634bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
635  // variant 2 is pc-relative
636  return is_bxx64_patchable_variant2_at(instruction_addr, link);
637}
638
639// Identify variant 1.
640bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
641  unsigned int* instr = (unsigned int*) instruction_addr;
642  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
643    && is_mtctr(instr[5]) // mtctr
644    && is_load_const_at(instruction_addr);
645}
646
647// Identify variant 1b: load destination relative to global toc.
648bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
649  unsigned int* instr = (unsigned int*) instruction_addr;
650  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
651    && is_mtctr(instr[3]) // mtctr
652    && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
653}
654
655// Identify variant 2.
656bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
657  unsigned int* instr = (unsigned int*) instruction_addr;
658  if (link) {
659    return is_bl (instr[6])  // bl dest is last
660      && is_nop(instr[0])  // nop
661      && is_nop(instr[1])  // nop
662      && is_nop(instr[2])  // nop
663      && is_nop(instr[3])  // nop
664      && is_nop(instr[4])  // nop
665      && is_nop(instr[5]); // nop
666  } else {
667    return is_b  (instr[0])  // b  dest is first
668      && is_nop(instr[1])  // nop
669      && is_nop(instr[2])  // nop
670      && is_nop(instr[3])  // nop
671      && is_nop(instr[4])  // nop
672      && is_nop(instr[5])  // nop
673      && is_nop(instr[6]); // nop
674  }
675}
676
677// Set dest address of a bxx64_patchable instruction.
678void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
679  ResourceMark rm;
680  int code_size = MacroAssembler::bxx64_patchable_size;
681  CodeBuffer buf(instruction_addr, code_size);
682  MacroAssembler masm(&buf);
683  masm.bxx64_patchable(dest, relocInfo::none, link);
684  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
685}
686
687// Get dest address of a bxx64_patchable instruction.
688address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
689  if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
690    return (address) (unsigned long) get_const(instruction_addr);
691  } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
692    unsigned int* instr = (unsigned int*) instruction_addr;
693    if (link) {
694      const int instr_idx = 6; // bl is last
695      int branchoffset = branch_destination(instr[instr_idx], 0);
696      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
697    } else {
698      const int instr_idx = 0; // b is first
699      int branchoffset = branch_destination(instr[instr_idx], 0);
700      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
701    }
702  // Load dest relative to global toc.
703  } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
704    return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
705                                                               instruction_addr);
706  } else {
707    ShouldNotReachHere();
708    return NULL;
709  }
710}
711
712// Uses ordering which corresponds to ABI:
713//    _savegpr0_14:  std  r14,-144(r1)
714//    _savegpr0_15:  std  r15,-136(r1)
715//    _savegpr0_16:  std  r16,-128(r1)
716void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
717  std(R14, offset, dst);   offset += 8;
718  std(R15, offset, dst);   offset += 8;
719  std(R16, offset, dst);   offset += 8;
720  std(R17, offset, dst);   offset += 8;
721  std(R18, offset, dst);   offset += 8;
722  std(R19, offset, dst);   offset += 8;
723  std(R20, offset, dst);   offset += 8;
724  std(R21, offset, dst);   offset += 8;
725  std(R22, offset, dst);   offset += 8;
726  std(R23, offset, dst);   offset += 8;
727  std(R24, offset, dst);   offset += 8;
728  std(R25, offset, dst);   offset += 8;
729  std(R26, offset, dst);   offset += 8;
730  std(R27, offset, dst);   offset += 8;
731  std(R28, offset, dst);   offset += 8;
732  std(R29, offset, dst);   offset += 8;
733  std(R30, offset, dst);   offset += 8;
734  std(R31, offset, dst);   offset += 8;
735
736  stfd(F14, offset, dst);   offset += 8;
737  stfd(F15, offset, dst);   offset += 8;
738  stfd(F16, offset, dst);   offset += 8;
739  stfd(F17, offset, dst);   offset += 8;
740  stfd(F18, offset, dst);   offset += 8;
741  stfd(F19, offset, dst);   offset += 8;
742  stfd(F20, offset, dst);   offset += 8;
743  stfd(F21, offset, dst);   offset += 8;
744  stfd(F22, offset, dst);   offset += 8;
745  stfd(F23, offset, dst);   offset += 8;
746  stfd(F24, offset, dst);   offset += 8;
747  stfd(F25, offset, dst);   offset += 8;
748  stfd(F26, offset, dst);   offset += 8;
749  stfd(F27, offset, dst);   offset += 8;
750  stfd(F28, offset, dst);   offset += 8;
751  stfd(F29, offset, dst);   offset += 8;
752  stfd(F30, offset, dst);   offset += 8;
753  stfd(F31, offset, dst);
754}
755
756// Uses ordering which corresponds to ABI:
757//    _restgpr0_14:  ld   r14,-144(r1)
758//    _restgpr0_15:  ld   r15,-136(r1)
759//    _restgpr0_16:  ld   r16,-128(r1)
760void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
761  ld(R14, offset, src);   offset += 8;
762  ld(R15, offset, src);   offset += 8;
763  ld(R16, offset, src);   offset += 8;
764  ld(R17, offset, src);   offset += 8;
765  ld(R18, offset, src);   offset += 8;
766  ld(R19, offset, src);   offset += 8;
767  ld(R20, offset, src);   offset += 8;
768  ld(R21, offset, src);   offset += 8;
769  ld(R22, offset, src);   offset += 8;
770  ld(R23, offset, src);   offset += 8;
771  ld(R24, offset, src);   offset += 8;
772  ld(R25, offset, src);   offset += 8;
773  ld(R26, offset, src);   offset += 8;
774  ld(R27, offset, src);   offset += 8;
775  ld(R28, offset, src);   offset += 8;
776  ld(R29, offset, src);   offset += 8;
777  ld(R30, offset, src);   offset += 8;
778  ld(R31, offset, src);   offset += 8;
779
780  // FP registers
781  lfd(F14, offset, src);   offset += 8;
782  lfd(F15, offset, src);   offset += 8;
783  lfd(F16, offset, src);   offset += 8;
784  lfd(F17, offset, src);   offset += 8;
785  lfd(F18, offset, src);   offset += 8;
786  lfd(F19, offset, src);   offset += 8;
787  lfd(F20, offset, src);   offset += 8;
788  lfd(F21, offset, src);   offset += 8;
789  lfd(F22, offset, src);   offset += 8;
790  lfd(F23, offset, src);   offset += 8;
791  lfd(F24, offset, src);   offset += 8;
792  lfd(F25, offset, src);   offset += 8;
793  lfd(F26, offset, src);   offset += 8;
794  lfd(F27, offset, src);   offset += 8;
795  lfd(F28, offset, src);   offset += 8;
796  lfd(F29, offset, src);   offset += 8;
797  lfd(F30, offset, src);   offset += 8;
798  lfd(F31, offset, src);
799}
800
801// For verify_oops.
802void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
803  std(R2,  offset, dst);   offset += 8;
804  std(R3,  offset, dst);   offset += 8;
805  std(R4,  offset, dst);   offset += 8;
806  std(R5,  offset, dst);   offset += 8;
807  std(R6,  offset, dst);   offset += 8;
808  std(R7,  offset, dst);   offset += 8;
809  std(R8,  offset, dst);   offset += 8;
810  std(R9,  offset, dst);   offset += 8;
811  std(R10, offset, dst);   offset += 8;
812  std(R11, offset, dst);   offset += 8;
813  std(R12, offset, dst);
814}
815
816// For verify_oops.
817void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
818  ld(R2,  offset, src);   offset += 8;
819  ld(R3,  offset, src);   offset += 8;
820  ld(R4,  offset, src);   offset += 8;
821  ld(R5,  offset, src);   offset += 8;
822  ld(R6,  offset, src);   offset += 8;
823  ld(R7,  offset, src);   offset += 8;
824  ld(R8,  offset, src);   offset += 8;
825  ld(R9,  offset, src);   offset += 8;
826  ld(R10, offset, src);   offset += 8;
827  ld(R11, offset, src);   offset += 8;
828  ld(R12, offset, src);
829}
830
831void MacroAssembler::save_LR_CR(Register tmp) {
832  mfcr(tmp);
833  std(tmp, _abi(cr), R1_SP);
834  mflr(tmp);
835  std(tmp, _abi(lr), R1_SP);
836  // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
837}
838
839void MacroAssembler::restore_LR_CR(Register tmp) {
840  assert(tmp != R1_SP, "must be distinct");
841  ld(tmp, _abi(lr), R1_SP);
842  mtlr(tmp);
843  ld(tmp, _abi(cr), R1_SP);
844  mtcr(tmp);
845}
846
847address MacroAssembler::get_PC_trash_LR(Register result) {
848  Label L;
849  bl(L);
850  bind(L);
851  address lr_pc = pc();
852  mflr(result);
853  return lr_pc;
854}
855
856void MacroAssembler::resize_frame(Register offset, Register tmp) {
857#ifdef ASSERT
858  assert_different_registers(offset, tmp, R1_SP);
859  andi_(tmp, offset, frame::alignment_in_bytes-1);
860  asm_assert_eq("resize_frame: unaligned", 0x204);
861#endif
862
863  // tmp <- *(SP)
864  ld(tmp, _abi(callers_sp), R1_SP);
865  // addr <- SP + offset;
866  // *(addr) <- tmp;
867  // SP <- addr
868  stdux(tmp, R1_SP, offset);
869}
870
871void MacroAssembler::resize_frame(int offset, Register tmp) {
872  assert(is_simm(offset, 16), "too big an offset");
873  assert_different_registers(tmp, R1_SP);
874  assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
875  // tmp <- *(SP)
876  ld(tmp, _abi(callers_sp), R1_SP);
877  // addr <- SP + offset;
878  // *(addr) <- tmp;
879  // SP <- addr
880  stdu(tmp, offset, R1_SP);
881}
882
883void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
884  // (addr == tmp1) || (addr == tmp2) is allowed here!
885  assert(tmp1 != tmp2, "must be distinct");
886
887  // compute offset w.r.t. current stack pointer
888  // tmp_1 <- addr - SP (!)
889  subf(tmp1, R1_SP, addr);
890
891  // atomically update SP keeping back link.
892  resize_frame(tmp1/* offset */, tmp2/* tmp */);
893}
894
895void MacroAssembler::push_frame(Register bytes, Register tmp) {
896#ifdef ASSERT
897  assert(bytes != R0, "r0 not allowed here");
898  andi_(R0, bytes, frame::alignment_in_bytes-1);
899  asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203);
900#endif
901  neg(tmp, bytes);
902  stdux(R1_SP, R1_SP, tmp);
903}
904
905// Push a frame of size `bytes'.
906void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
907  long offset = align_addr(bytes, frame::alignment_in_bytes);
908  if (is_simm(-offset, 16)) {
909    stdu(R1_SP, -offset, R1_SP);
910  } else {
911    load_const(tmp, -offset);
912    stdux(R1_SP, R1_SP, tmp);
913  }
914}
915
916// Push a frame of size `bytes' plus abi_reg_args on top.
917void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
918  push_frame(bytes + frame::abi_reg_args_size, tmp);
919}
920
921// Set up a new C frame with a spill area for non-volatile GPRs and
922// additional space for local variables.
923void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
924                                                      Register tmp) {
925  push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
926}
927
928// Pop current C frame.
929void MacroAssembler::pop_frame() {
930  ld(R1_SP, _abi(callers_sp), R1_SP);
931}
932
933#if defined(ABI_ELFv2)
934address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
935  // TODO(asmundak): make sure the caller uses R12 as function descriptor
936  // most of the time.
937  if (R12 != r_function_entry) {
938    mr(R12, r_function_entry);
939  }
940  mtctr(R12);
941  // Do a call or a branch.
942  if (and_link) {
943    bctrl();
944  } else {
945    bctr();
946  }
947  _last_calls_return_pc = pc();
948
949  return _last_calls_return_pc;
950}
951
952// Call a C function via a function descriptor and use full C
953// calling conventions. Updates and returns _last_calls_return_pc.
954address MacroAssembler::call_c(Register r_function_entry) {
955  return branch_to(r_function_entry, /*and_link=*/true);
956}
957
958// For tail calls: only branch, don't link, so callee returns to caller of this function.
959address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
960  return branch_to(r_function_entry, /*and_link=*/false);
961}
962
963address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
964  load_const(R12, function_entry, R0);
965  return branch_to(R12,  /*and_link=*/true);
966}
967
968#else
969// Generic version of a call to C function via a function descriptor
970// with variable support for C calling conventions (TOC, ENV, etc.).
971// Updates and returns _last_calls_return_pc.
972address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
973                                  bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
974  // we emit standard ptrgl glue code here
975  assert((function_descriptor != R0), "function_descriptor cannot be R0");
976
977  // retrieve necessary entries from the function descriptor
978  ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
979  mtctr(R0);
980
981  if (load_toc_of_callee) {
982    ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
983  }
984  if (load_env_of_callee) {
985    ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
986  } else if (load_toc_of_callee) {
987    li(R11, 0);
988  }
989
990  // do a call or a branch
991  if (and_link) {
992    bctrl();
993  } else {
994    bctr();
995  }
996  _last_calls_return_pc = pc();
997
998  return _last_calls_return_pc;
999}
1000
1001// Call a C function via a function descriptor and use full C calling
1002// conventions.
1003// We don't use the TOC in generated code, so there is no need to save
1004// and restore its value.
1005address MacroAssembler::call_c(Register fd) {
1006  return branch_to(fd, /*and_link=*/true,
1007                       /*save toc=*/false,
1008                       /*restore toc=*/false,
1009                       /*load toc=*/true,
1010                       /*load env=*/true);
1011}
1012
1013address MacroAssembler::call_c_and_return_to_caller(Register fd) {
1014  return branch_to(fd, /*and_link=*/false,
1015                       /*save toc=*/false,
1016                       /*restore toc=*/false,
1017                       /*load toc=*/true,
1018                       /*load env=*/true);
1019}
1020
1021address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
1022  if (rt != relocInfo::none) {
1023    // this call needs to be relocatable
1024    if (!ReoptimizeCallSequences
1025        || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1026        || fd == NULL   // support code-size estimation
1027        || !fd->is_friend_function()
1028        || fd->entry() == NULL) {
1029      // it's not a friend function as defined by class FunctionDescriptor,
1030      // so do a full call-c here.
1031      load_const(R11, (address)fd, R0);
1032
1033      bool has_env = (fd != NULL && fd->env() != NULL);
1034      return branch_to(R11, /*and_link=*/true,
1035                            /*save toc=*/false,
1036                            /*restore toc=*/false,
1037                            /*load toc=*/true,
1038                            /*load env=*/has_env);
1039    } else {
1040      // It's a friend function. Load the entry point and don't care about
1041      // toc and env. Use an optimizable call instruction, but ensure the
1042      // same code-size as in the case of a non-friend function.
1043      nop();
1044      nop();
1045      nop();
1046      bl64_patchable(fd->entry(), rt);
1047      _last_calls_return_pc = pc();
1048      return _last_calls_return_pc;
1049    }
1050  } else {
1051    // This call does not need to be relocatable, do more aggressive
1052    // optimizations.
1053    if (!ReoptimizeCallSequences
1054      || !fd->is_friend_function()) {
1055      // It's not a friend function as defined by class FunctionDescriptor,
1056      // so do a full call-c here.
1057      load_const(R11, (address)fd, R0);
1058      return branch_to(R11, /*and_link=*/true,
1059                            /*save toc=*/false,
1060                            /*restore toc=*/false,
1061                            /*load toc=*/true,
1062                            /*load env=*/true);
1063    } else {
1064      // it's a friend function, load the entry point and don't care about
1065      // toc and env.
1066      address dest = fd->entry();
1067      if (is_within_range_of_b(dest, pc())) {
1068        bl(dest);
1069      } else {
1070        bl64_patchable(dest, rt);
1071      }
1072      _last_calls_return_pc = pc();
1073      return _last_calls_return_pc;
1074    }
1075  }
1076}
1077
1078// Call a C function.  All constants needed reside in TOC.
1079//
1080// Read the address to call from the TOC.
1081// Read env from TOC, if fd specifies an env.
1082// Read new TOC from TOC.
1083address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
1084                                         relocInfo::relocType rt, Register toc) {
1085  if (!ReoptimizeCallSequences
1086    || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1087    || !fd->is_friend_function()) {
1088    // It's not a friend function as defined by class FunctionDescriptor,
1089    // so do a full call-c here.
1090    assert(fd->entry() != NULL, "function must be linked");
1091
1092    AddressLiteral fd_entry(fd->entry());
1093    load_const_from_method_toc(R11, fd_entry, toc);
1094    mtctr(R11);
1095    if (fd->env() == NULL) {
1096      li(R11, 0);
1097      nop();
1098    } else {
1099      AddressLiteral fd_env(fd->env());
1100      load_const_from_method_toc(R11, fd_env, toc);
1101    }
1102    AddressLiteral fd_toc(fd->toc());
1103    load_toc_from_toc(R2_TOC, fd_toc, toc);
1104    // R2_TOC is killed.
1105    bctrl();
1106    _last_calls_return_pc = pc();
1107  } else {
1108    // It's a friend function, load the entry point and don't care about
1109    // toc and env. Use an optimizable call instruction, but ensure the
1110    // same code-size as in the case of a non-friend function.
1111    nop();
1112    bl64_patchable(fd->entry(), rt);
1113    _last_calls_return_pc = pc();
1114  }
1115  return _last_calls_return_pc;
1116}
1117#endif // ABI_ELFv2
1118
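// Common helper for the call_VM variants below: sets up the last Java
// frame, passes the current thread in R3_ARG1, calls `entry_point' as a
// C function and, if requested, fetches the oop result from the thread,
// e.g. call_VM_base(oop_result, noreg, entry_point, check_exceptions).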
1119void MacroAssembler::call_VM_base(Register oop_result,
1120                                  Register last_java_sp,
1121                                  address  entry_point,
1122                                  bool     check_exceptions) {
1123  BLOCK_COMMENT("call_VM {");
1124  // Determine last_java_sp register.
1125  if (!last_java_sp->is_valid()) {
1126    last_java_sp = R1_SP;
1127  }
1128  set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);
1129
1130  // ARG1 must hold thread address.
1131  mr(R3_ARG1, R16_thread);
1132#if defined(ABI_ELFv2)
1133  address return_pc = call_c(entry_point, relocInfo::none);
1134#else
1135  address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
1136#endif
1137
1138  reset_last_Java_frame();
1139
1140  // Check for pending exceptions.
1141  if (check_exceptions) {
1142    // We don't check for exceptions here.
1143    ShouldNotReachHere();
1144  }
1145
1146  // Get oop result if there is one and reset the value in the thread.
1147  if (oop_result->is_valid()) {
1148    get_vm_result(oop_result);
1149  }
1150
1151  _last_calls_return_pc = return_pc;
1152  BLOCK_COMMENT("} call_VM");
1153}
1154
1155void MacroAssembler::call_VM_leaf_base(address entry_point) {
1156  BLOCK_COMMENT("call_VM_leaf {");
1157#if defined(ABI_ELFv2)
1158  call_c(entry_point, relocInfo::none);
1159#else
1160  call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
1161#endif
1162  BLOCK_COMMENT("} call_VM_leaf");
1163}
1164
1165void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
1166  call_VM_base(oop_result, noreg, entry_point, check_exceptions);
1167}
1168
1169void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
1170                             bool check_exceptions) {
1171  // R3_ARG1 is reserved for the thread.
1172  mr_if_needed(R4_ARG2, arg_1);
1173  call_VM(oop_result, entry_point, check_exceptions);
1174}
1175
1176void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
1177                             bool check_exceptions) {
1178  // R3_ARG1 is reserved for the thread
1179  mr_if_needed(R4_ARG2, arg_1);
1180  assert(arg_2 != R4_ARG2, "smashed argument");
1181  mr_if_needed(R5_ARG3, arg_2);
1182  call_VM(oop_result, entry_point, check_exceptions);
1183}
1184
1185void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
1186                             bool check_exceptions) {
1187  // R3_ARG1 is reserved for the thread
1188  mr_if_needed(R4_ARG2, arg_1);
1189  assert(arg_2 != R4_ARG2, "smashed argument");
1190  mr_if_needed(R5_ARG3, arg_2);
1191  mr_if_needed(R6_ARG4, arg_3);
1192  call_VM(oop_result, entry_point, check_exceptions);
1193}
1194
1195void MacroAssembler::call_VM_leaf(address entry_point) {
1196  call_VM_leaf_base(entry_point);
1197}
1198
1199void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
1200  mr_if_needed(R3_ARG1, arg_1);
1201  call_VM_leaf(entry_point);
1202}
1203
1204void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
1205  mr_if_needed(R3_ARG1, arg_1);
1206  assert(arg_2 != R3_ARG1, "smashed argument");
1207  mr_if_needed(R4_ARG2, arg_2);
1208  call_VM_leaf(entry_point);
1209}
1210
1211void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
1212  mr_if_needed(R3_ARG1, arg_1);
1213  assert(arg_2 != R3_ARG1, "smashed argument");
1214  mr_if_needed(R4_ARG2, arg_2);
1215  assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
1216  mr_if_needed(R5_ARG3, arg_3);
1217  call_VM_leaf(entry_point);
1218}
1219
1220// Check whether instruction is a read access to the polling page
1221// which was emitted by load_from_polling_page(..).
1222bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
1223                                               address* polling_address_ptr) {
1224  if (!is_ld(instruction))
1225    return false; // It's not a ld. Fail.
1226
1227  int rt = inv_rt_field(instruction);
1228  int ra = inv_ra_field(instruction);
1229  int ds = inv_ds_field(instruction);
1230  if (!(ds == 0 && ra != 0 && rt == 0)) {
1231    return false; // It's not a ld(r0, X, ra). Fail.
1232  }
1233
1234  if (!ucontext) {
1235    // Set polling address.
1236    if (polling_address_ptr != NULL) {
1237      *polling_address_ptr = NULL;
1238    }
1239    return true; // No ucontext given. Can't check value of ra. Assume true.
1240  }
1241
1242#ifdef LINUX
1243  // Ucontext given. Check that register ra contains the address of
1244  // the safepoint polling page.
1245  ucontext_t* uc = (ucontext_t*) ucontext;
1246  // Set polling address.
1247  address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1248  if (polling_address_ptr != NULL) {
1249    *polling_address_ptr = addr;
1250  }
1251  return os::is_poll_address(addr);
1252#else
1253  // Not on Linux, ucontext must be NULL.
1254  ShouldNotReachHere();
1255  return false;
1256#endif
1257}
1258
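// Check whether the (faulting) instruction is a 32-bit store to the memory
// serialization page. The effective address is reconstructed from the
// register values in the given ucontext (stw/stwu use ra+d1, stwx/stwux
// use ra+rb).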
1259bool MacroAssembler::is_memory_serialization(int instruction, JavaThread* thread, void* ucontext) {
1260#ifdef LINUX
1261  ucontext_t* uc = (ucontext_t*) ucontext;
1262
1263  if (is_stwx(instruction) || is_stwux(instruction)) {
1264    int ra = inv_ra_field(instruction);
1265    int rb = inv_rb_field(instruction);
1266
1267    // look up content of ra and rb in ucontext
1268    address ra_val=(address)uc->uc_mcontext.regs->gpr[ra];
1269    long rb_val=(long)uc->uc_mcontext.regs->gpr[rb];
1270    return os::is_memory_serialize_page(thread, ra_val+rb_val);
1271  } else if (is_stw(instruction) || is_stwu(instruction)) {
1272    int ra = inv_ra_field(instruction);
1273    int d1 = inv_d1_field(instruction);
1274
1275    // look up content of ra in ucontext
1276    address ra_val=(address)uc->uc_mcontext.regs->gpr[ra];
1277    return os::is_memory_serialize_page(thread, ra_val+d1);
1278  } else {
1279    return false;
1280  }
1281#else
1282  // workaround not needed on !LINUX :-)
1283  ShouldNotCallThis();
1284  return false;
1285#endif
1286}
1287
1288void MacroAssembler::bang_stack_with_offset(int offset) {
1289  // When increasing the stack, the old stack pointer will be written
1290  // to the new top of stack according to the PPC64 abi.
1291  // Therefore, stack banging is not necessary when increasing
1292  // the stack by <= os::vm_page_size() bytes.
1293  // When increasing the stack by a larger amount, this method is
1294  // called repeatedly to bang the intermediate pages.
1295
1296  // Stack grows down, caller passes positive offset.
1297  assert(offset > 0, "must bang with positive offset");
1298
1299  long stdoffset = -offset;
1300
1301  if (is_simm(stdoffset, 16)) {
1302    // Signed 16 bit offset, a simple std is ok.
1303    if (UseLoadInstructionsForStackBangingPPC64) {
1304      ld(R0, (int)(signed short)stdoffset, R1_SP);
1305    } else {
1306      std(R0,(int)(signed short)stdoffset, R1_SP);
1307    }
1308  } else if (is_simm(stdoffset, 31)) {
1309    const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
1310    const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
1311
1312    Register tmp = R11;
1313    addis(tmp, R1_SP, hi);
1314    if (UseLoadInstructionsForStackBangingPPC64) {
1315      ld(R0,  lo, tmp);
1316    } else {
1317      std(R0, lo, tmp);
1318    }
1319  } else {
1320    ShouldNotReachHere();
1321  }
1322}
1323
1324// If instruction is a stack bang of the form
1325//    std    R0,    x(Ry),       (see bang_stack_with_offset())
1326//    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
1327// or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
1328// return the banged address. Otherwise, return 0.
1329address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
1330#ifdef LINUX
1331  ucontext_t* uc = (ucontext_t*) ucontext;
1332  int rs = inv_rs_field(instruction);
1333  int ra = inv_ra_field(instruction);
1334  if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
1335      || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
1336      || (is_stdu(instruction) && rs == 1)) {
1337    int ds = inv_ds_field(instruction);
1338    // return banged address
1339    return ds+(address)uc->uc_mcontext.regs->gpr[ra];
1340  } else if (is_stdux(instruction) && rs == 1) {
1341    int rb = inv_rb_field(instruction);
1342    address sp = (address)uc->uc_mcontext.regs->gpr[1];
1343    long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1344    return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
1345                                  : sp + rb_val; // banged address
1346  }
1347  return NULL; // not a stack bang
1348#else
1349  // workaround not needed on !LINUX :-)
1350  ShouldNotCallThis();
1351  return NULL;
1352#endif
1353}
1354
1355// CmpxchgX sets condition register to cmpX(current, compare).
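// cmpxchgw emulates a 32-bit compare-and-swap with a lwarx/stwcx_ retry
// loop (see cmpxchgd below for the 64-bit variant and a full description
// of the protocol): `semantics' selects release/acquire/fence barriers and
// int_flag_success, if given, receives 1 on success and 0 on failure.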
1356void MacroAssembler::cmpxchgw(ConditionRegister flag, Register dest_current_value,
1357                              Register compare_value, Register exchange_value,
1358                              Register addr_base, int semantics, bool cmpxchgx_hint,
1359                              Register int_flag_success, bool contention_hint) {
1360  Label retry;
1361  Label failed;
1362  Label done;
1363
1364  // Save one branch if result is returned via register and
1365  // result register is different from the other ones.
1366  bool use_result_reg    = (int_flag_success != noreg);
1367  bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
1368                            int_flag_success != exchange_value && int_flag_success != addr_base);
1369
1370  // release/fence semantics
1371  if (semantics & MemBarRel) {
1372    release();
1373  }
1374
1375  if (use_result_reg && preset_result_reg) {
1376    li(int_flag_success, 0); // preset (assume cas failed)
1377  }
1378
1379  // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1380  if (contention_hint) { // Don't try to reserve if cmp fails.
1381    lwz(dest_current_value, 0, addr_base);
1382    cmpw(flag, dest_current_value, compare_value);
1383    bne(flag, failed);
1384  }
1385
1386  // atomic emulation loop
1387  bind(retry);
1388
1389  lwarx(dest_current_value, addr_base, cmpxchgx_hint);
1390  cmpw(flag, dest_current_value, compare_value);
1391  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1392    bne_predict_not_taken(flag, failed);
1393  } else {
1394    bne(                  flag, failed);
1395  }
1396  // branch to failed => (flag == ne), (dest_current_value != compare_value)
1397  // fall through     => (flag == eq), (dest_current_value == compare_value)
1398
1399  stwcx_(exchange_value, addr_base);
1400  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1401    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1402  } else {
1403    bne(                  CCR0, retry); // StXcx_ sets CCR0.
1404  }
1405  // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)
1406
1407  // Result in register (must do this at the end because int_flag_success can be the
1408  // same register as one above).
1409  if (use_result_reg) {
1410    li(int_flag_success, 1);
1411  }
1412
1413  if (semantics & MemBarFenceAfter) {
1414    fence();
1415  } else if (semantics & MemBarAcq) {
1416    isync();
1417  }
1418
1419  if (use_result_reg && !preset_result_reg) {
1420    b(done);
1421  }
1422
1423  bind(failed);
1424  if (use_result_reg && !preset_result_reg) {
1425    li(int_flag_success, 0);
1426  }
1427
1428  bind(done);
1429  // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1430  // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1431}
1432
1433// Performs an atomic compare exchange:
1434//   if (compare_value == *addr_base)
1435//     *addr_base = exchange_value
1436//     int_flag_success = 1;
1437//   else
1438//     int_flag_success = 0;
1439//
1440// ConditionRegister flag       = cmp(compare_value, *addr_base)
1441// Register dest_current_value  = *addr_base
1442// Register compare_value       Used to compare with value in memory
1443// Register exchange_value      Written to memory if compare_value == *addr_base
1444// Register addr_base           The memory location to compareXChange
1445// Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
1446//
1447// To avoid the costly compare exchange the value is tested beforehand.
1448// Several special cases exist to avoid generating unnecessary code.
1449//
1450void MacroAssembler::cmpxchgd(ConditionRegister flag,
1451                              Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
1452                              Register addr_base, int semantics, bool cmpxchgx_hint,
1453                              Register int_flag_success, Label* failed_ext, bool contention_hint) {
1454  Label retry;
1455  Label failed_int;
1456  Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
1457  Label done;
1458
1459  // Save one branch if result is returned via register and result register is different from the other ones.
1460  bool use_result_reg    = (int_flag_success != noreg);
1461  bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value.register_or_noreg() &&
1462                            int_flag_success != exchange_value && int_flag_success != addr_base);
1463  assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");
1464
1465  // release/fence semantics
1466  if (semantics & MemBarRel) {
1467    release();
1468  }
1469
1470  if (use_result_reg && preset_result_reg) {
1471    li(int_flag_success, 0); // preset (assume cas failed)
1472  }
1473
1474  // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1475  if (contention_hint) { // Don't try to reserve if cmp fails.
1476    ld(dest_current_value, 0, addr_base);
1477    cmpd(flag, compare_value, dest_current_value);
1478    bne(flag, failed);
1479  }
1480
1481  // atomic emulation loop
1482  bind(retry);
1483
1484  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1485  cmpd(flag, compare_value, dest_current_value);
1486  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1487    bne_predict_not_taken(flag, failed);
1488  } else {
1489    bne(                  flag, failed);
1490  }
1491
1492  stdcx_(exchange_value, addr_base);
1493  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1494    bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
1495  } else {
1496    bne(                  CCR0, retry); // stXcx_ sets CCR0
1497  }
1498
1499  // result in register (must do this at the end because int_flag_success can be the same register as one above)
1500  if (use_result_reg) {
1501    li(int_flag_success, 1);
1502  }
1503
1504  // POWER6 doesn't need isync in CAS.
1505  // Always emit isync to be on the safe side.
1506  if (semantics & MemBarFenceAfter) {
1507    fence();
1508  } else if (semantics & MemBarAcq) {
1509    isync();
1510  }
1511
1512  if (use_result_reg && !preset_result_reg) {
1513    b(done);
1514  }
1515
1516  bind(failed_int);
1517  if (use_result_reg && !preset_result_reg) {
1518    li(int_flag_success, 0);
1519  }
1520
1521  bind(done);
1522  // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1523  // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1524}
1525
1526// Look up the method for a megamorphic invokeinterface call.
1527// The target method is determined by <intf_klass, itable_index>.
1528// The receiver klass is in recv_klass.
1529// On success, the result will be in method_result, and execution falls through.
1530// On failure, execution transfers to the given label.
1531void MacroAssembler::lookup_interface_method(Register recv_klass,
1532                                             Register intf_klass,
1533                                             RegisterOrConstant itable_index,
1534                                             Register method_result,
1535                                             Register scan_temp,
1536                                             Register sethi_temp,
1537                                             Label& L_no_such_interface) {
1538  assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1539  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
1540         "caller must use same register for non-constant itable index as for method");
1541
1542  // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1543  int vtable_base = InstanceKlass::vtable_start_offset() * wordSize;
1544  int itentry_off = itableMethodEntry::method_offset_in_bytes();
1545  int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
1546  int scan_step   = itableOffsetEntry::size() * wordSize;
1547  int log_vte_size= exact_log2(vtableEntry::size() * wordSize);
1548
1549  lwz(scan_temp, InstanceKlass::vtable_length_offset() * wordSize, recv_klass);
1550  // %%% We should store the aligned, prescaled offset in the klassoop.
1551  // Then the next several instructions would fold away.
1552
1553  sldi(scan_temp, scan_temp, log_vte_size);
1554  addi(scan_temp, scan_temp, vtable_base);
1555  add(scan_temp, recv_klass, scan_temp);
1556
1557  // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1558  if (itable_index.is_register()) {
1559    Register itable_offset = itable_index.as_register();
1560    sldi(itable_offset, itable_offset, logMEsize);
1561    if (itentry_off) addi(itable_offset, itable_offset, itentry_off);
1562    add(recv_klass, itable_offset, recv_klass);
1563  } else {
1564    long itable_offset = (long)itable_index.as_constant();
1565    load_const_optimized(sethi_temp, (itable_offset<<logMEsize)+itentry_off); // static address, no relocation
1566    add(recv_klass, sethi_temp, recv_klass);
1567  }
1568
1569  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
1570  //   if (scan->interface() == intf) {
1571  //     result = (klass + scan->offset() + itable_index);
1572  //   }
1573  // }
1574  Label search, found_method;
1575
1576  for (int peel = 1; peel >= 0; peel--) {
1577    // %%%% Could load both offset and interface in one ldx, if they were
1578    // in the opposite order. This would save a load.
1579    ld(method_result, itableOffsetEntry::interface_offset_in_bytes(), scan_temp);
1580
1581    // Check that this entry is non-null. A null entry means that
1582    // the receiver class doesn't implement the interface, and wasn't the
1583    // same as when the caller was compiled.
1584    cmpd(CCR0, method_result, intf_klass);
1585
1586    if (peel) {
1587      beq(CCR0, found_method);
1588    } else {
1589      bne(CCR0, search);
1590      // (invert the test to fall through to found_method...)
1591    }
1592
1593    if (!peel) break;
1594
1595    bind(search);
1596
1597    cmpdi(CCR0, method_result, 0);
1598    beq(CCR0, L_no_such_interface);
1599    addi(scan_temp, scan_temp, scan_step);
1600  }
1601
1602  bind(found_method);
1603
1604  // Got a hit.
1605  int ito_offset = itableOffsetEntry::offset_offset_in_bytes();
1606  lwz(scan_temp, ito_offset, scan_temp);
1607  ldx(method_result, scan_temp, recv_klass);
1608}
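// The hit case above amounts to, roughly (sketch; accessor names approximate):
//
//   int off = ((itableOffsetEntry*)scan_temp)->offset();
//   method_result = *(Method**)((address)recv_klass_orig + off
//                               + itable_index * itableMethodEntry::size() * wordSize
//                               + itableMethodEntry::method_offset_in_bytes());
//
// where the itable_index part was already folded into recv_klass further up.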
1609
1610// virtual method calling
1611void MacroAssembler::lookup_virtual_method(Register recv_klass,
1612                                           RegisterOrConstant vtable_index,
1613                                           Register method_result) {
1614
1615  assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1616
1617  const int base = InstanceKlass::vtable_start_offset() * wordSize;
1618  assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1619
1620  if (vtable_index.is_register()) {
1621    sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1622    add(recv_klass, vtable_index.as_register(), recv_klass);
1623  } else {
1624    addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1625  }
1626  ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass);
1627}
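// Equivalent lookup in pseudocode (sketch; relies on vtableEntry being one
// word, as asserted above):
//
//   R19_method = *(Method**)((address)recv_klass
//                            + InstanceKlass::vtable_start_offset() * wordSize
//                            + vtable_index * wordSize
//                            + vtableEntry::method_offset_in_bytes());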
1628
1629/////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1630
1631void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1632                                                   Register super_klass,
1633                                                   Register temp1_reg,
1634                                                   Register temp2_reg,
1635                                                   Label& L_success,
1636                                                   Label& L_failure) {
1637
1638  const Register check_cache_offset = temp1_reg;
1639  const Register cached_super       = temp2_reg;
1640
1641  assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1642
1643  int sco_offset = in_bytes(Klass::super_check_offset_offset());
1644  int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());
1645
1646  // If the pointers are equal, we are done (e.g., String[] elements).
1647  // This self-check enables sharing of secondary supertype arrays among
1648  // non-primary types such as array-of-interface. Otherwise, each such
1649  // type would need its own customized SSA.
1650  // We move this check to the front of the fast path because many
1651  // type checks are in fact trivially successful in this manner,
1652  // so we get a nicely predicted branch right at the start of the check.
1653  cmpd(CCR0, sub_klass, super_klass);
1654  beq(CCR0, L_success);
1655
1656  // Check the supertype display:
1657  lwz(check_cache_offset, sco_offset, super_klass);
1658  // The loaded value is the offset from KlassOopDesc.
1659
1660  ldx(cached_super, check_cache_offset, sub_klass);
1661  cmpd(CCR0, cached_super, super_klass);
1662  beq(CCR0, L_success);
1663
1664  // This check has worked decisively for primary supers.
1665  // Secondary supers are sought in the super_cache ('super_cache_addr').
1666  // (Secondary supers are interfaces and very deeply nested subtypes.)
1667  // The same check above covers this case because of a tricky aliasing
1668  // between the super_cache and the primary super display elements.
1669  // (The 'super_check_addr' can address either, as the case requires.)
1670  // Note that the cache is updated below if it does not help us find
1671  // what we need immediately.
1672  // So if it was a primary super, we can just fail immediately.
1673  // Otherwise, it's the slow path for us (no success at this point).
1674
1675  cmpwi(CCR0, check_cache_offset, sc_offset);
1676  bne(CCR0, L_failure);
1677  // bind(slow_path); // fallthru
1678}
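// Summary of the fast path above (sketch):
//
//   if (sub_klass == super_klass)                                goto L_success;
//   sco = super_klass->super_check_offset();
//   if (*(Klass**)((address)sub_klass + sco) == super_klass)     goto L_success;
//   if (sco != in_bytes(Klass::secondary_super_cache_offset()))  goto L_failure;
//   // otherwise fall through to the slow path (secondary supers scan)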
1679
1680void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1681                                                   Register super_klass,
1682                                                   Register temp1_reg,
1683                                                   Register temp2_reg,
1684                                                   Label* L_success,
1685                                                   Register result_reg) {
1686  const Register array_ptr = temp1_reg; // current value from cache array
1687  const Register temp      = temp2_reg;
1688
1689  assert_different_registers(sub_klass, super_klass, array_ptr, temp);
1690
1691  int source_offset = in_bytes(Klass::secondary_supers_offset());
1692  int target_offset = in_bytes(Klass::secondary_super_cache_offset());
1693
1694  int length_offset = Array<Klass*>::length_offset_in_bytes();
1695  int base_offset   = Array<Klass*>::base_offset_in_bytes();
1696
1697  Label hit, loop, failure, fallthru;
1698
1699  ld(array_ptr, source_offset, sub_klass);
1700
1701  //assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
1702  lwz(temp, length_offset, array_ptr);
1703  cmpwi(CCR0, temp, 0);
1704  beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0
1705
1706  mtctr(temp); // load ctr
1707
1708  bind(loop);
1709  // Klass pointers in the table are no longer compressed.
1710  ld(temp, base_offset, array_ptr);
1711  cmpd(CCR0, temp, super_klass);
1712  beq(CCR0, hit);
1713  addi(array_ptr, array_ptr, BytesPerWord);
1714  bdnz(loop);
1715
1716  bind(failure);
1717  if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss)
1718  b(fallthru);
1719
1720  bind(hit);
1721  std(super_klass, target_offset, sub_klass); // save result to cache
1722  if (result_reg != noreg) li(result_reg, 0); // load zero result (indicates a hit)
1723  if (L_success != NULL) b(*L_success);
1724
1725  bind(fallthru);
1726}
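// The slow path above is essentially (sketch; accessor names approximate):
//
//   Array<Klass*>* ss = sub_klass->secondary_supers();
//   for (int i = 0; i < ss->length(); i++) {
//     if (ss->at(i) == super_klass) {
//       sub_klass->set_secondary_super_cache(super_klass); // remember the hit
//       if (result_reg != noreg) result = 0;
//       goto success;
//     }
//   }
//   if (result_reg != noreg) result = 1; // miss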
1727
1728// Try the fast path first, then fall back to the slow path if not successful.
1729void MacroAssembler::check_klass_subtype(Register sub_klass,
1730                         Register super_klass,
1731                         Register temp1_reg,
1732                         Register temp2_reg,
1733                         Label& L_success) {
1734  Label L_failure;
1735  check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, L_success, L_failure);
1736  check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
1737  bind(L_failure); // Fallthru if not successful.
1738}
1739
1740void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg,
1741                                              Register temp_reg,
1742                                              Label& wrong_method_type) {
1743  assert_different_registers(mtype_reg, mh_reg, temp_reg);
1744  // Compare method type against that of the receiver.
1745  load_heap_oop_not_null(temp_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg), mh_reg);
1746  cmpd(CCR0, temp_reg, mtype_reg);
1747  bne(CCR0, wrong_method_type);
1748}
1749
1750RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
1751                                                   Register temp_reg,
1752                                                   int extra_slot_offset) {
1753  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1754  int stackElementSize = Interpreter::stackElementSize;
1755  int offset = extra_slot_offset * stackElementSize;
1756  if (arg_slot.is_constant()) {
1757    offset += arg_slot.as_constant() * stackElementSize;
1758    return offset;
1759  } else {
1760    assert(temp_reg != noreg, "must specify");
1761    sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
1762    if (offset != 0)
1763      addi(temp_reg, temp_reg, offset);
1764    return temp_reg;
1765  }
1766}
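// I.e. (sketch): offset = (arg_slot + extra_slot_offset) * Interpreter::stackElementSize,
// returned as a constant if arg_slot is a constant, otherwise computed into temp_reg.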
1767
1768void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg,
1769                                          Register mark_reg, Register temp_reg,
1770                                          Register temp2_reg, Label& done, Label* slow_case) {
1771  assert(UseBiasedLocking, "why call this otherwise?");
1772
1773#ifdef ASSERT
1774  assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg);
1775#endif
1776
1777  Label cas_label;
1778
1779  // Branch to done if fast path fails and no slow_case provided.
1780  Label *slow_case_int = (slow_case != NULL) ? slow_case : &done;
1781
1782  // Biased locking
1783  // See whether the lock is currently biased toward our thread and
1784  // whether the epoch is still valid
1785  // Note that the runtime guarantees sufficient alignment of JavaThread
1786  // pointers to allow age to be placed into low bits
1787  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits,
1788         "biased locking makes assumptions about bit layout");
1789
1790  if (PrintBiasedLockingStatistics) {
1791    load_const(temp_reg, (address) BiasedLocking::total_entry_count_addr(), temp2_reg);
1792    lwz(temp2_reg, 0, temp_reg);
1793    addi(temp2_reg, temp2_reg, 1);
1794    stw(temp2_reg, 0, temp_reg);
1795  }
1796
1797  andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place);
1798  cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
1799  bne(cr_reg, cas_label);
1800
1801  load_klass(temp_reg, obj_reg);
1802
1803  load_const_optimized(temp2_reg, ~((int) markOopDesc::age_mask_in_place));
1804  ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
1805  orr(temp_reg, R16_thread, temp_reg);
1806  xorr(temp_reg, mark_reg, temp_reg);
1807  andr(temp_reg, temp_reg, temp2_reg);
1808  cmpdi(cr_reg, temp_reg, 0);
1809  if (PrintBiasedLockingStatistics) {
1810    Label l;
1811    bne(cr_reg, l);
1812    load_const(mark_reg, (address) BiasedLocking::biased_lock_entry_count_addr());
1813    lwz(temp2_reg, 0, mark_reg);
1814    addi(temp2_reg, temp2_reg, 1);
1815    stw(temp2_reg, 0, mark_reg);
1816    // restore mark_reg
1817    ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
1818    bind(l);
1819  }
1820  beq(cr_reg, done);
1821
1822  Label try_revoke_bias;
1823  Label try_rebias;
1824
1825  // At this point we know that the header has the bias pattern and
1826  // that we are not the bias owner in the current epoch. We need to
1827  // figure out more details about the state of the header in order to
1828  // know what operations can be legally performed on the object's
1829  // header.
1830
1831  // If the low three bits in the xor result aren't clear, that means
1832  // the prototype header is no longer biased and we have to revoke
1833  // the bias on this object.
1834  andi(temp2_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
1835  cmpwi(cr_reg, temp2_reg, 0);
1836  bne(cr_reg, try_revoke_bias);
1837
1838  // Biasing is still enabled for this data type. See whether the
1839  // epoch of the current bias is still valid, meaning that the epoch
1840  // bits of the mark word are equal to the epoch bits of the
1841  // prototype header. (Note that the prototype header's epoch bits
1842  // only change at a safepoint.) If not, attempt to rebias the object
1843  // toward the current thread. Note that we must be absolutely sure
1844  // that the current epoch is invalid in order to do this because
1845  // otherwise the manipulations it performs on the mark word are
1846  // illegal.
1847
1848  int shift_amount = 64 - markOopDesc::epoch_shift;
1849  // rotate epoch bits to right (little) end and set other bits to 0
1850  // [ big part | epoch | little part ] -> [ 0..0 | epoch ]
1851  rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markOopDesc::epoch_bits);
1852  // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented
1853  bne(CCR0, try_rebias);
1854
1855  // The epoch of the current bias is still valid but we know nothing
1856  // about the owner; it might be set or it might be clear. Try to
1857  // acquire the bias of the object using an atomic operation. If this
1858  // fails we will go in to the runtime to revoke the object's bias.
1859  // Note that we first construct the presumed unbiased header so we
1860  // don't accidentally blow away another thread's valid bias.
1861  andi(mark_reg, mark_reg, (markOopDesc::biased_lock_mask_in_place |
1862                                markOopDesc::age_mask_in_place |
1863                                markOopDesc::epoch_mask_in_place));
1864  orr(temp_reg, R16_thread, mark_reg);
1865
1866  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
1867
1868  // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
1869  cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
1870           /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
1871           /*where=*/obj_reg,
1872           MacroAssembler::MemBarAcq,
1873           MacroAssembler::cmpxchgx_hint_acquire_lock(),
1874           noreg, slow_case_int); // bail out if failed
1875
1876  // If the biasing toward our thread failed, this means that
1877  // another thread succeeded in biasing it toward itself and we
1878  // need to revoke that bias. The revocation will occur in the
1879  // interpreter runtime in the slow case.
1880  if (PrintBiasedLockingStatistics) {
1881    load_const(temp_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp2_reg);
1882    lwz(temp2_reg, 0, temp_reg);
1883    addi(temp2_reg, temp2_reg, 1);
1884    stw(temp2_reg, 0, temp_reg);
1885  }
1886  b(done);
1887
1888  bind(try_rebias);
1889  // At this point we know the epoch has expired, meaning that the
1890  // current "bias owner", if any, is actually invalid. Under these
1891  // circumstances _only_, we are allowed to use the current header's
1892  // value as the comparison value when doing the cas to acquire the
1893  // bias in the current epoch. In other words, we allow transfer of
1894  // the bias from one thread to another directly in this situation.
1895  andi(temp_reg, mark_reg, markOopDesc::age_mask_in_place);
1896  orr(temp_reg, R16_thread, temp_reg);
1897  load_klass(temp2_reg, obj_reg);
1898  ld(temp2_reg, in_bytes(Klass::prototype_header_offset()), temp2_reg);
1899  orr(temp_reg, temp_reg, temp2_reg);
1900
1901  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
1902
1903  // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
1904  cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
1905                 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
1906                 /*where=*/obj_reg,
1907                 MacroAssembler::MemBarAcq,
1908                 MacroAssembler::cmpxchgx_hint_acquire_lock(),
1909                 noreg, slow_case_int); // bail out if failed
1910
1911  // If the biasing toward our thread failed, this means that
1912  // another thread succeeded in biasing it toward itself and we
1913  // need to revoke that bias. The revocation will occur in the
1914  // interpreter runtime in the slow case.
1915  if (PrintBiasedLockingStatistics) {
1916    load_const(temp_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp2_reg);
1917    lwz(temp2_reg, 0, temp_reg);
1918    addi(temp2_reg, temp2_reg, 1);
1919    stw(temp2_reg, 0, temp_reg);
1920  }
1921  b(done);
1922
1923  bind(try_revoke_bias);
1924  // The prototype mark in the klass doesn't have the bias bit set any
1925  // more, indicating that objects of this data type are not supposed
1926  // to be biased any more. We are going to try to reset the mark of
1927  // this object to the prototype value and fall through to the
1928  // CAS-based locking scheme. Note that if our CAS fails, it means
1929  // that another thread raced us for the privilege of revoking the
1930  // bias of this particular object, so it's okay to continue in the
1931  // normal locking code.
1932  load_klass(temp_reg, obj_reg);
1933  ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
1934  andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
1935  orr(temp_reg, temp_reg, temp2_reg);
1936
1937  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
1938
1939  // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
1940  cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
1941                 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
1942                 /*where=*/obj_reg,
1943                 MacroAssembler::MemBarAcq,
1944                 MacroAssembler::cmpxchgx_hint_acquire_lock());
1945
1946  // reload markOop in mark_reg before continuing with lightweight locking
1947  ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
1948
1949  // Fall through to the normal CAS-based lock, because no matter what
1950  // the result of the above CAS, some thread must have succeeded in
1951  // removing the bias bit from the object's header.
1952  if (PrintBiasedLockingStatistics) {
1953    Label l;
1954    bne(cr_reg, l);
1955    load_const(temp_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp2_reg);
1956    lwz(temp2_reg, 0, temp_reg);
1957    addi(temp2_reg, temp2_reg, 1);
1958    stw(temp2_reg, 0, temp_reg);
1959    bind(l);
1960  }
1961
1962  bind(cas_label);
1963}
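// Decision structure of biased_locking_enter above, roughly (sketch; masks and
// accessor names approximate):
//
//   if ((mark & biased_lock_mask) != biased_lock_pattern)   goto cas_label;       // not biased
//   x = mark ^ (current_thread | klass->prototype_header());
//   if ((x & ~age_mask) == 0)                               goto done;            // biased to us
//   if ((x & biased_lock_mask) != 0)                        goto try_revoke_bias;
//   if ((x & epoch_mask) != 0)                              goto try_rebias;
//   // anonymously biased with a valid epoch: CAS our thread id into the mark,
//   // bail out to the slow case if the CAS fails.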
1964
1965void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) {
1966  // Check for biased locking unlock case, which is a no-op
1967  // Note: we do not have to check the thread ID for two reasons.
1968  // First, the interpreter checks for IllegalMonitorStateException at
1969  // a higher level. Second, if the bias was revoked while we held the
1970  // lock, the object could not be rebiased toward another thread, so
1971  // the bias bit would be clear.
1972
1973  ld(temp_reg, 0, mark_addr);
1974  andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
1975
1976  cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
1977  beq(cr_reg, done);
1978}
1979
1980// TM on PPC64.
1981void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) {
1982  Label retry;
1983  bind(retry);
1984  ldarx(result, addr, /*hint*/ false);
1985  addi(result, result, simm16);
1986  stdcx_(result, addr);
1987  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1988    bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
1989  } else {
1990    bne(                  CCR0, retry); // stXcx_ sets CCR0
1991  }
1992}
1993
1994void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) {
1995  Label retry;
1996  bind(retry);
1997  lwarx(result, addr, /*hint*/ false);
1998  ori(result, result, uimm16);
1999  stwcx_(result, addr);
2000  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2001    bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2002  } else {
2003    bne(                  CCR0, retry); // stXcx_ sets CCR0
2004  }
2005}
2006
2007#if INCLUDE_RTM_OPT
2008
2009// Update rtm_counters based on abort status
2010// input: abort_status
2011//        rtm_counters (RTMLockingCounters*)
2012void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) {
2013  // Mapping to keep PreciseRTMLockingStatistics similar to x86.
2014  // x86 ppc (! means inverted, ? means not the same)
2015  //  0   31  Set if abort caused by XABORT instruction.
2016  //  1  ! 7  If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set.
2017  //  2   13  Set if another logical processor conflicted with a memory address that was part of the transaction that aborted.
2018  //  3   10  Set if an internal buffer overflowed.
2019  //  4  ?12  Set if a debug breakpoint was hit.
2020  //  5  ?32  Set if an abort occurred during execution of a nested transaction.
2021  const  int tm_failure_bit[] = {Assembler::tm_tabort, // Note: Seems like signal handler sets this, too.
2022                                 Assembler::tm_failure_persistent, // inverted: transient
2023                                 Assembler::tm_trans_cf,
2024                                 Assembler::tm_footprint_of,
2025                                 Assembler::tm_non_trans_cf,
2026                                 Assembler::tm_suspended};
2027  const bool tm_failure_inv[] = {false, true, false, false, false, false};
2028  assert(sizeof(tm_failure_bit)/sizeof(int) == RTMLockingCounters::ABORT_STATUS_LIMIT, "adapt mapping!");
2029
2030  const Register addr_Reg = R0;
2031  // Keep track of offset to where rtm_counters_Reg had pointed to.
2032  int counters_offs = RTMLockingCounters::abort_count_offset();
2033  addi(addr_Reg, rtm_counters_Reg, counters_offs);
2034  const Register temp_Reg = rtm_counters_Reg;
2035
2036  //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
2037  ldx(temp_Reg, addr_Reg);
2038  addi(temp_Reg, temp_Reg, 1);
2039  stdx(temp_Reg, addr_Reg);
2040
2041  if (PrintPreciseRTMLockingStatistics) {
2042    int counters_offs_delta = RTMLockingCounters::abortX_count_offset() - counters_offs;
2043
2044    //mftexasr(abort_status); done by caller
2045    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
2046      counters_offs += counters_offs_delta;
2047      li(temp_Reg, counters_offs_delta); // can't use addi with R0
2048      add(addr_Reg, addr_Reg, temp_Reg); // point to next counter
2049      counters_offs_delta = sizeof(uintx);
2050
2051      Label check_abort;
2052      rldicr_(temp_Reg, abort_status, tm_failure_bit[i], 0);
2053      if (tm_failure_inv[i]) {
2054        bne(CCR0, check_abort);
2055      } else {
2056        beq(CCR0, check_abort);
2057      }
2058      //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
2059      ldx(temp_Reg, addr_Reg);
2060      addi(temp_Reg, temp_Reg, 1);
2061      stdx(temp_Reg, addr_Reg);
2062      bind(check_abort);
2063    }
2064  }
2065  li(temp_Reg, -counters_offs); // can't use addi with R0
2066  add(rtm_counters_Reg, addr_Reg, temp_Reg); // restore
2067}
2068
2069// Branch if (random & (count-1) != 0), count is 2^n
2070// tmp and CR0 are killed
2071void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) {
2072  mftb(tmp);
2073  andi_(tmp, tmp, count-1);
2074  bne(CCR0, brLabel);
2075}
2076
2077// Perform abort ratio calculation, set no_rtm bit if high ratio.
2078// input:  rtm_counters_Reg (RTMLockingCounters* address) - KILLED
2079void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg,
2080                                                 RTMLockingCounters* rtm_counters,
2081                                                 Metadata* method_data) {
2082  Label L_done, L_check_always_rtm1, L_check_always_rtm2;
2083
2084  if (RTMLockingCalculationDelay > 0) {
2085    // Delay calculation.
2086    ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr());
2087    cmpdi(CCR0, rtm_counters_Reg, 0);
2088    beq(CCR0, L_done);
2089    load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2090  }
2091  // Abort ratio calculation only if abort_count >= RTMAbortThreshold.
2092  //   Aborted transactions = abort_count * 100
2093  //   All transactions = total_count *  RTMTotalCountIncrRate
2094  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
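  // Example (sketch): with RTMAbortRatio = 50 and RTMTotalCountIncrRate = 1 the
  // no_rtm bit is set once abort_count * 100 >= total_count * 1 * 50, i.e. once
  // at least half of all transactions have aborted.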
2095  ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg);
2096  cmpdi(CCR0, R0, RTMAbortThreshold);
2097  blt(CCR0, L_check_always_rtm2);
2098  mulli(R0, R0, 100);
2099
2100  const Register tmpReg = rtm_counters_Reg;
2101  ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2102  mulli(tmpReg, tmpReg, RTMTotalCountIncrRate);
2103  mulli(tmpReg, tmpReg, RTMAbortRatio);
2104  cmpd(CCR0, R0, tmpReg);
2105  blt(CCR0, L_check_always_rtm1); // jump to reload
2106  if (method_data != NULL) {
2107    // Set rtm_state to "no rtm" in MDO.
2108    // Not using a metadata relocation. Method and Class Loader are kept alive anyway.
2109    // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.)
2110    load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2111    atomic_ori_int(R0, tmpReg, NoRTM);
2112  }
2113  b(L_done);
2114
2115  bind(L_check_always_rtm1);
2116  load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2117  bind(L_check_always_rtm2);
2118  ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2119  cmpdi(CCR0, tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
2120  blt(CCR0, L_done);
2121  if (method_data != NULL) {
2122    // Set rtm_state to "always rtm" in MDO.
2123    // Not using a metadata relocation. See above.
2124    load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2125    atomic_ori_int(R0, tmpReg, UseRTM);
2126  }
2127  bind(L_done);
2128}
2129
2130// Update counters and perform abort ratio calculation.
2131// input: abort_status_Reg
2132void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg,
2133                                   RTMLockingCounters* rtm_counters,
2134                                   Metadata* method_data,
2135                                   bool profile_rtm) {
2136
2137  assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2138  // Update rtm counters based on state at abort.
2139  // Reads abort_status_Reg, updates flags.
2140  assert_different_registers(abort_status_Reg, temp_Reg);
2141  load_const_optimized(temp_Reg, (address)rtm_counters, R0);
2142  rtm_counters_update(abort_status_Reg, temp_Reg);
2143  if (profile_rtm) {
2144    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2145    rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data);
2146  }
2147}
2148
2149// Retry on abort if abort's status indicates non-persistent failure.
2150// inputs: retry_count_Reg
2151//       : abort_status_Reg
2152// output: retry_count_Reg decremented by 1
2153void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg,
2154                                             Label& retryLabel, Label* checkRetry) {
2155  Label doneRetry;
2156  rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0);
2157  bne(CCR0, doneRetry);
2158  if (checkRetry) { bind(*checkRetry); }
2159  addic_(retry_count_Reg, retry_count_Reg, -1);
2160  blt(CCR0, doneRetry);
2161  smt_yield(); // Can't use wait(). No permission (SIGILL).
2162  b(retryLabel);
2163  bind(doneRetry);
2164}
2165
2166// Spin and retry if lock is busy.
2167// inputs: owner_addr_Reg (address of the monitor's owner field)
2168//       : retry_count_Reg
2169// output: retry_count_Reg decremented by 1
2170// CTR is killed
2171void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) {
2172  Label SpinLoop, doneRetry;
2173  addic_(retry_count_Reg, retry_count_Reg, -1);
2174  blt(CCR0, doneRetry);
2175  li(R0, RTMSpinLoopCount);
2176  mtctr(R0);
2177
2178  bind(SpinLoop);
2179  smt_yield(); // Can't use waitrsv(). No permission (SIGILL).
2180  bdz(retryLabel);
2181  ld(R0, 0, owner_addr_Reg);
2182  cmpdi(CCR0, R0, 0);
2183  bne(CCR0, SpinLoop);
2184  b(retryLabel);
2185
2186  bind(doneRetry);
2187}
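// I.e. (sketch): give up once the retry budget is exhausted; otherwise spin
// (yielding the SMT thread between probes) until either the monitor's owner
// field reads 0 or RTMSpinLoopCount probes have been made, then branch back
// to retryLabel.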
2188
2189// Use RTM for normal stack locks.
2190// Input: objReg (object to lock)
2191void MacroAssembler::rtm_stack_locking(ConditionRegister flag,
2192                                       Register obj, Register mark_word, Register tmp,
2193                                       Register retry_on_abort_count_Reg,
2194                                       RTMLockingCounters* stack_rtm_counters,
2195                                       Metadata* method_data, bool profile_rtm,
2196                                       Label& DONE_LABEL, Label& IsInflated) {
2197  assert(UseRTMForStackLocks, "why call this otherwise?");
2198  assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2199  Label L_rtm_retry, L_decrement_retry, L_on_abort;
2200
2201  if (RTMRetryCount > 0) {
2202    load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
2203    bind(L_rtm_retry);
2204  }
2205  andi_(R0, mark_word, markOopDesc::monitor_value);  // inflated vs stack-locked|neutral|biased
2206  bne(CCR0, IsInflated);
2207
2208  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2209    Label L_noincrement;
2210    if (RTMTotalCountIncrRate > 1) {
2211      branch_on_random_using_tb(tmp, (int)RTMTotalCountIncrRate, L_noincrement);
2212    }
2213    assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
2214    load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0);
2215    //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically
2216    ldx(mark_word, tmp);
2217    addi(mark_word, mark_word, 1);
2218    stdx(mark_word, tmp);
2219    bind(L_noincrement);
2220  }
2221  tbegin_();
2222  beq(CCR0, L_on_abort);
2223  ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);         // Reload in transaction, conflicts need to be tracked.
2224  andi(R0, mark_word, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2225  cmpwi(flag, R0, markOopDesc::unlocked_value);                // bits = 001 unlocked
2226  beq(flag, DONE_LABEL);                                       // all done if unlocked
2227
2228  if (UseRTMXendForLockBusy) {
2229    tend_();
2230    b(L_decrement_retry);
2231  } else {
2232    tabort_();
2233  }
2234  bind(L_on_abort);
2235  const Register abort_status_Reg = tmp;
2236  mftexasr(abort_status_Reg);
2237  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2238    rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm);
2239  }
2240  ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload
2241  if (RTMRetryCount > 0) {
2242    // Retry on lock abort if abort status is not permanent.
2243    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry);
2244  } else {
2245    bind(L_decrement_retry);
2246  }
2247}
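// Roughly, the transactional stack-lock attempt above is (sketch):
//
//   if (obj->mark() & monitor_value)                       goto IsInflated;  // not a stack lock
//   tbegin();
//   if (transaction failed to start)                       goto on_abort;
//   if ((obj->mark() & biased_lock_mask) == unlocked_value) goto DONE;       // lock elided in tx
//   UseRTMXendForLockBusy ? tend() : tabort();                               // already locked
//   on_abort: read TEXASR, profile, and retry while the failure is transient.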
2248
2249// Use RTM for inflated locks.
2250// inputs: obj       (object to lock)
2251//         mark_word (current header - KILLED)
2252//         boxReg    (on-stack box address (displaced header location) - KILLED)
2253void MacroAssembler::rtm_inflated_locking(ConditionRegister flag,
2254                                          Register obj, Register mark_word, Register boxReg,
2255                                          Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg,
2256                                          RTMLockingCounters* rtm_counters,
2257                                          Metadata* method_data, bool profile_rtm,
2258                                          Label& DONE_LABEL) {
2259  assert(UseRTMLocking, "why call this otherwise?");
2260  Label L_rtm_retry, L_decrement_retry, L_on_abort;
2261  // Clean monitor_value bit to get valid pointer.
2262  int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value;
2263
2264  // Store non-null, using boxReg instead of (intptr_t)markOopDesc::unused_mark().
2265  std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg);
2266  const Register tmpReg = boxReg;
2267  const Register owner_addr_Reg = mark_word;
2268  addi(owner_addr_Reg, mark_word, owner_offset);
2269
2270  if (RTMRetryCount > 0) {
2271    load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy.
2272    load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort.
2273    bind(L_rtm_retry);
2274  }
2275  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2276    Label L_noincrement;
2277    if (RTMTotalCountIncrRate > 1) {
2278      branch_on_random_using_tb(R0, (int)RTMTotalCountIncrRate, L_noincrement);
2279    }
2280    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2281    load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg);
2282    //atomic_inc_ptr(R0, tmpReg); We don't increment atomically
2283    ldx(tmpReg, R0);
2284    addi(tmpReg, tmpReg, 1);
2285    stdx(tmpReg, R0);
2286    bind(L_noincrement);
2287  }
2288  tbegin_();
2289  beq(CCR0, L_on_abort);
2290  // We don't reload mark word. Will only be reset at safepoint.
2291  ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked.
2292  cmpdi(flag, R0, 0);
2293  beq(flag, DONE_LABEL);
2294
2295  if (UseRTMXendForLockBusy) {
2296    tend_();
2297    b(L_decrement_retry);
2298  } else {
2299    tabort_();
2300  }
2301  bind(L_on_abort);
2302  const Register abort_status_Reg = tmpReg;
2303  mftexasr(abort_status_Reg);
2304  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2305    rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm);
2306    // Restore owner_addr_Reg
2307    ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);
2308#ifdef ASSERT
2309    andi_(R0, mark_word, markOopDesc::monitor_value);
2310    asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint.
2311#endif
2312    addi(owner_addr_Reg, mark_word, owner_offset);
2313  }
2314  if (RTMRetryCount > 0) {
2315    // Retry on lock abort if abort status is not permanent.
2316    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
2317  }
2318
2319  // Appears unlocked - try to swing _owner from null to non-null.
2320  cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg,
2321           MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2322           MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true);
2323
2324  if (RTMRetryCount > 0) {
2325    // Success: done. Otherwise retry.
2326    b(DONE_LABEL);
2327    bind(L_decrement_retry);
2328    // Spin and retry if lock is busy.
2329    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry);
2330  } else {
2331    bind(L_decrement_retry);
2332  }
2333}
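// Roughly (sketch): inside the transaction only ObjectMonitor::_owner is read;
// if it is NULL the lock is successfully elided, otherwise the transaction is
// ended (or aborted) and, once the retry budgets are exhausted, _owner is
// CASed from NULL to the current thread just like in the non-RTM path.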
2334
2335#endif //  INCLUDE_RTM_OPT
2336
2337// "The box" is the space on the stack where we copy the object mark.
2338void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
2339                                               Register temp, Register displaced_header, Register current_header,
2340                                               bool try_bias,
2341                                               RTMLockingCounters* rtm_counters,
2342                                               RTMLockingCounters* stack_rtm_counters,
2343                                               Metadata* method_data,
2344                                               bool use_rtm, bool profile_rtm) {
2345  assert_different_registers(oop, box, temp, displaced_header, current_header);
2346  assert(flag != CCR0, "bad condition register");
2347  Label cont;
2348  Label object_has_monitor;
2349  Label cas_failed;
2350
2351  // Load markOop from object into displaced_header.
2352  ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
2353
2354
2355  // Always do locking in runtime.
2356  if (EmitSync & 0x01) {
2357    cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2358    return;
2359  }
2360
2361  if (try_bias) {
2362    biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont);
2363  }
2364
2365#if INCLUDE_RTM_OPT
2366  if (UseRTMForStackLocks && use_rtm) {
2367    rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,
2368                      stack_rtm_counters, method_data, profile_rtm,
2369                      cont, object_has_monitor);
2370  }
2371#endif // INCLUDE_RTM_OPT
2372
2373  // Handle existing monitor.
2374  if ((EmitSync & 0x02) == 0) {
2375    // The object has an existing monitor iff (mark & monitor_value) != 0.
2376    andi_(temp, displaced_header, markOopDesc::monitor_value);
2377    bne(CCR0, object_has_monitor);
2378  }
2379
2380  // Set displaced_header to be (markOop of object | UNLOCK_VALUE).
2381  ori(displaced_header, displaced_header, markOopDesc::unlocked_value);
2382
2383  // Load Compare Value application register.
2384
2385  // Initialize the box. (Must happen before we update the object mark!)
2386  std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2387
2388  // Must fence, otherwise, preceding store(s) may float below cmpxchg.
2389  // Compare object markOop with mark and if equal exchange scratch1 with object markOop.
2390  // CmpxchgX sets cr_reg to cmpX(current, displaced).
2391  membar(Assembler::StoreStore);
2392  cmpxchgd(/*flag=*/flag,
2393           /*current_value=*/current_header,
2394           /*compare_value=*/displaced_header,
2395           /*exchange_value=*/box,
2396           /*where=*/oop,
2397           MacroAssembler::MemBarAcq,
2398           MacroAssembler::cmpxchgx_hint_acquire_lock(),
2399           noreg,
2400           &cas_failed);
2401  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2402
2403  // If the compare-and-exchange succeeded, then we found an unlocked
2404  // object and we have now locked it.
2405  b(cont);
2406
2407  bind(cas_failed);
2408  // We did not see an unlocked object so try the fast recursive case.
2409
2410  // Check if the owner is self by comparing the value in the markOop of object
2411  // (current_header) with the stack pointer.
2412  sub(current_header, current_header, R1_SP);
2413  load_const_optimized(temp, (address) (~(os::vm_page_size()-1) |
2414                                        markOopDesc::lock_mask_in_place));
2415
2416  and_(R0/*==0?*/, current_header, temp);
2417  // If the condition holds we own the lock already (recursive case) and can
2418  // store 0 as the displaced header in the box to indicate a recursive lock.
2419  mcrf(flag,CCR0);
2420  std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2421
2422  // Handle existing monitor.
2423  if ((EmitSync & 0x02) == 0) {
2424    b(cont);
2425
2426    bind(object_has_monitor);
2427    // The object's monitor m is unlocked iff m->owner == NULL,
2428    // otherwise m->owner may contain a thread or a stack address.
2429
2430#if INCLUDE_RTM_OPT
2431    // Use the same RTM locking code in 32- and 64-bit VM.
2432    if (use_rtm) {
2433      rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
2434                           rtm_counters, method_data, profile_rtm, cont);
2435    } else {
2436#endif // INCLUDE_RTM_OPT
2437
2438    // Try to CAS m->owner from NULL to current thread.
2439    addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value);
2440    li(displaced_header, 0);
2441    // CmpxchgX sets flag to cmpX(current, displaced).
2442    cmpxchgd(/*flag=*/flag,
2443             /*current_value=*/current_header,
2444             /*compare_value=*/(intptr_t)0,
2445             /*exchange_value=*/R16_thread,
2446             /*where=*/temp,
2447             MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2448             MacroAssembler::cmpxchgx_hint_acquire_lock());
2449
2450    // Store a non-null value into the box.
2451    std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2452
2453#   ifdef ASSERT
2454    bne(flag, cont);
2455    // We have acquired the monitor, check some invariants.
2456    addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes());
2457    // Invariant 1: _recursions should be 0.
2458    //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size");
2459    asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp,
2460                            "monitor->_recursions should be 0", -1);
2461    // Invariant 2: OwnerIsThread shouldn't be 0.
2462    //assert(ObjectMonitor::OwnerIsThread_size_in_bytes() == 4, "unexpected size");
2463    //asm_assert_mem4_isnot_zero(ObjectMonitor::OwnerIsThread_offset_in_bytes(), temp,
2464    //                           "monitor->OwnerIsThread shouldn't be 0", -1);
2465#   endif
2466
2467#if INCLUDE_RTM_OPT
2468    } // use_rtm()
2469#endif
2470  }
2471
2472  bind(cont);
2473  // flag == EQ indicates success
2474  // flag == NE indicates failure
2475}
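// Overall protocol of the fast-lock sequence above, in rough pseudocode
// (sketch; masks approximate):
//
//   mark = obj->mark();
//   if (mark & monitor_value)                          goto inflated;
//   box->displaced_header = mark | unlocked_value;
//   if (CAS(&obj->mark, mark | unlocked_value, box))   success;   // stack-locked
//   mark = value seen by the failed CAS;
//   if (((mark - SP) & (~(page_size - 1) | lock_mask)) == 0) {
//     box->displaced_header = 0;                       success;   // recursive
//   } else                                             failure;
//   inflated:
//     CAS(&monitor->_owner, NULL, current_thread) decides success.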
2476
2477void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2478                                                 Register temp, Register displaced_header, Register current_header,
2479                                                 bool try_bias, bool use_rtm) {
2480  assert_different_registers(oop, box, temp, displaced_header, current_header);
2481  assert(flag != CCR0, "bad condition register");
2482  Label cont;
2483  Label object_has_monitor;
2484
2485  // Always do locking in runtime.
2486  if (EmitSync & 0x01) {
2487    cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2488    return;
2489  }
2490
2491  if (try_bias) {
2492    biased_locking_exit(flag, oop, current_header, cont);
2493  }
2494
2495#if INCLUDE_RTM_OPT
2496  if (UseRTMForStackLocks && use_rtm) {
2497    assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2498    Label L_regular_unlock;
2499    ld(current_header, oopDesc::mark_offset_in_bytes(), oop);         // fetch markword
2500    andi(R0, current_header, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2501    cmpwi(flag, R0, markOopDesc::unlocked_value);                     // bits = 001 unlocked
2502    bne(flag, L_regular_unlock);                                      // else RegularLock
2503    tend_();                                                          // otherwise end...
2504    b(cont);                                                          // ... and we're done
2505    bind(L_regular_unlock);
2506  }
2507#endif
2508
2509  // Find the lock address and load the displaced header from the stack.
2510  ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2511
2512  // If the displaced header is 0, we have a recursive unlock.
2513  cmpdi(flag, displaced_header, 0);
2514  beq(flag, cont);
2515
2516  // Handle existing monitor.
2517  if ((EmitSync & 0x02) == 0) {
2518    // The object has an existing monitor iff (mark & monitor_value) != 0.
2519    RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
2520    ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2521    andi_(R0, current_header, markOopDesc::monitor_value);
2522    bne(CCR0, object_has_monitor);
2523  }
2524
2525  // Check if it is still a lightweight lock; this is true if we see
2526  // the stack address of the basicLock in the markOop of the object.
2527  // Cmpxchg sets flag to cmpd(current_header, box).
2528  cmpxchgd(/*flag=*/flag,
2529           /*current_value=*/current_header,
2530           /*compare_value=*/box,
2531           /*exchange_value=*/displaced_header,
2532           /*where=*/oop,
2533           MacroAssembler::MemBarRel,
2534           MacroAssembler::cmpxchgx_hint_release_lock(),
2535           noreg,
2536           &cont);
2537
2538  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2539
2540  // Handle existing monitor.
2541  if ((EmitSync & 0x02) == 0) {
2542    b(cont);
2543
2544    bind(object_has_monitor);
2545    addi(current_header, current_header, -markOopDesc::monitor_value); // monitor
2546    ld(temp,             ObjectMonitor::owner_offset_in_bytes(), current_header);
2547
2548    // It's inflated.
2549#if INCLUDE_RTM_OPT
2550    if (use_rtm) {
2551      Label L_regular_inflated_unlock;
2552      // Clean monitor_value bit to get valid pointer
2553      cmpdi(flag, temp, 0);
2554      bne(flag, L_regular_inflated_unlock);
2555      tend_();
2556      b(cont);
2557      bind(L_regular_inflated_unlock);
2558    }
2559#endif
2560
2561    ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
2562    xorr(temp, R16_thread, temp);      // Will be 0 if we are the owner.
2563    orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions.
2564    cmpdi(flag, temp, 0);
2565    bne(flag, cont);
2566
2567    ld(temp,             ObjectMonitor::EntryList_offset_in_bytes(), current_header);
2568    ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);
2569    orr(temp, temp, displaced_header); // Will be 0 if both are 0.
2570    cmpdi(flag, temp, 0);
2571    bne(flag, cont);
2572    release();
2573    std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);
2574  }
2575
2576  bind(cont);
2577  // flag == EQ indicates success
2578  // flag == NE indicates failure
2579}
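// Overall protocol of the fast-unlock sequence above, in rough pseudocode
// (sketch):
//
//   dh = box->displaced_header;
//   if (dh == 0)                                        success;   // recursive unlock
//   if (!(obj->mark() & monitor_value)) {
//     CAS(&obj->mark, box, dh) decides success;                    // stack-locked
//   } else {                                                       // inflated
//     if (owner != current_thread || recursions != 0)   failure;
//     if (EntryList != NULL || cxq != NULL)             failure;
//     release barrier; monitor->_owner = NULL;          success;
//   }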
2580
2581// Write the serialization page so the VM thread can do a pseudo remote membar.
2582// We use the current thread pointer to calculate a thread-specific
2583// offset to write to within the page. This minimizes bus traffic
2584// due to cache line collisions.
2585void MacroAssembler::serialize_memory(Register thread, Register tmp1, Register tmp2) {
2586  srdi(tmp2, thread, os::get_serialize_page_shift_count());
2587
2588  int mask = os::vm_page_size() - sizeof(int);
2589  if (Assembler::is_simm(mask, 16)) {
2590    andi(tmp2, tmp2, mask);
2591  } else {
2592    lis(tmp1, (int)((signed short) (mask >> 16)));
2593    ori(tmp1, tmp1, mask & 0x0000ffff);
2594    andr(tmp2, tmp2, tmp1);
2595  }
2596
2597  load_const(tmp1, (long) os::get_memory_serialize_page());
2598  release();
2599  stwx(R0, tmp1, tmp2);
2600}
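// Effectively (sketch) the store address computed above is
//   get_memory_serialize_page()
//     + ((thread >> get_serialize_page_shift_count()) & (vm_page_size() - sizeof(int)))
// so each thread writes a different word of the shared serialization page.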
2601
2602
2603// GC barrier helper macros
2604
2605// Write the card table byte if needed.
2606void MacroAssembler::card_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp) {
2607  CardTableModRefBS* bs =
2608    barrier_set_cast<CardTableModRefBS>(Universe::heap()->barrier_set());
2609  assert(bs->kind() == BarrierSet::CardTableForRS ||
2610         bs->kind() == BarrierSet::CardTableExtension, "wrong barrier");
2611#ifdef ASSERT
2612  cmpdi(CCR0, Rnew_val, 0);
2613  asm_assert_ne("null oop not allowed", 0x321);
2614#endif
2615  card_table_write(bs->byte_map_base, Rtmp, Rstore_addr);
2616}
2617
2618// Write the card table byte.
2619void MacroAssembler::card_table_write(jbyte* byte_map_base, Register Rtmp, Register Robj) {
2620  assert_different_registers(Robj, Rtmp, R0);
2621  load_const_optimized(Rtmp, (address)byte_map_base, R0);
2622  srdi(Robj, Robj, CardTableModRefBS::card_shift);
2623  li(R0, 0); // dirty
2624  if (UseConcMarkSweepGC) membar(Assembler::StoreStore);
2625  stbx(R0, Rtmp, Robj);
2626}
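// I.e. (sketch): byte_map_base[(uintptr_t)obj >> card_shift] = 0 (dirty),
// preceded by a StoreStore barrier when running with CMS.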
2627
2628#if INCLUDE_ALL_GCS
2629// General G1 pre-barrier generator.
2630// Goal: record the previous value if it is not null.
2631void MacroAssembler::g1_write_barrier_pre(Register Robj, RegisterOrConstant offset, Register Rpre_val,
2632                                          Register Rtmp1, Register Rtmp2, bool needs_frame) {
2633  Label runtime, filtered;
2634
2635  // Is marking active?
2636  if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
2637    lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_active()), R16_thread);
2638  } else {
2639    guarantee(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
2640    lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_active()), R16_thread);
2641  }
2642  cmpdi(CCR0, Rtmp1, 0);
2643  beq(CCR0, filtered);
2644
2645  // Do we need to load the previous value?
2646  if (Robj != noreg) {
2647    // Load the previous value...
2648    if (UseCompressedOops) {
2649      lwz(Rpre_val, offset, Robj);
2650    } else {
2651      ld(Rpre_val, offset, Robj);
2652    }
2653    // Previous value has been loaded into Rpre_val.
2654  }
2655  assert(Rpre_val != noreg, "must have a real register");
2656
2657  // Is the previous value null?
2658  cmpdi(CCR0, Rpre_val, 0);
2659  beq(CCR0, filtered);
2660
2661  if (Robj != noreg && UseCompressedOops) {
2662    decode_heap_oop_not_null(Rpre_val);
2663  }
2664
2665  // OK, it's not filtered, so we'll need to call enqueue. In the normal
2666  // case, pre_val will be a scratch G-reg, but there are some cases in
2667  // which it's an O-reg. In the first case, do a normal call. In the
2668  // latter, do a save here and call the frameless version.
2669
2670  // Can we store original value in the thread's buffer?
2671  // Is index == 0?
2672  // (The index field is typed as size_t.)
2673  const Register Rbuffer = Rtmp1, Rindex = Rtmp2;
2674
2675  ld(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_index()), R16_thread);
2676  cmpdi(CCR0, Rindex, 0);
2677  beq(CCR0, runtime); // If index == 0, goto runtime.
2678  ld(Rbuffer, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_buf()), R16_thread);
2679
2680  addi(Rindex, Rindex, -wordSize); // Decrement index.
2681  std(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_index()), R16_thread);
2682
2683  // Record the previous value.
2684  stdx(Rpre_val, Rbuffer, Rindex);
2685  b(filtered);
2686
2687  bind(runtime);
2688
2689  // VM call needs a frame to access (write) O registers.
2690  if (needs_frame) {
2691    save_LR_CR(Rtmp1);
2692    push_frame_reg_args(0, Rtmp2);
2693  }
2694
2695  if (Rpre_val->is_volatile() && Robj == noreg) mr(R31, Rpre_val); // Save pre_val across C call if it was preloaded.
2696  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), Rpre_val, R16_thread);
2697  if (Rpre_val->is_volatile() && Robj == noreg) mr(Rpre_val, R31); // restore
2698
2699  if (needs_frame) {
2700    pop_frame();
2701    restore_LR_CR(Rtmp1);
2702  }
2703
2704  bind(filtered);
2705}
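// SATB pre-barrier above in pseudocode (sketch; queue accessors approximate):
//
//   if (!thread->satb_mark_queue().is_active())           return;
//   pre_val = (Robj != noreg) ? *(Robj + offset) : Rpre_val;
//   if (pre_val == NULL)                                  return;
//   if (index != 0) { index -= wordSize; buf[index] = pre_val; }   // enqueue locally
//   else            call g1_wb_pre(pre_val, thread);               // runtime slow path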
2706
2707// General G1 post-barrier generator
2708// Store cross-region card.
2709void MacroAssembler::g1_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp1, Register Rtmp2, Register Rtmp3, Label *filtered_ext) {
2710  Label runtime, filtered_int;
2711  Label& filtered = (filtered_ext != NULL) ? *filtered_ext : filtered_int;
2712  assert_different_registers(Rstore_addr, Rnew_val, Rtmp1, Rtmp2);
2713
2714  G1SATBCardTableLoggingModRefBS* bs =
2715    barrier_set_cast<G1SATBCardTableLoggingModRefBS>(Universe::heap()->barrier_set());
2716
2717  // Does store cross heap regions?
2718  if (G1RSBarrierRegionFilter) {
2719    xorr(Rtmp1, Rstore_addr, Rnew_val);
2720    srdi_(Rtmp1, Rtmp1, HeapRegion::LogOfHRGrainBytes);
2721    beq(CCR0, filtered);
2722  }
2723
2724  // Crosses regions, storing NULL?
2725#ifdef ASSERT
2726  cmpdi(CCR0, Rnew_val, 0);
2727  asm_assert_ne("null oop not allowed (G1)", 0x322); // Checked by caller on PPC64, so following branch is obsolete:
2728  //beq(CCR0, filtered);
2729#endif
2730
2731  // Storing region crossing non-NULL, is card already dirty?
2732  assert(sizeof(*bs->byte_map_base) == sizeof(jbyte), "adjust this code");
2733  const Register Rcard_addr = Rtmp1;
2734  Register Rbase = Rtmp2;
2735  load_const_optimized(Rbase, (address)bs->byte_map_base, /*temp*/ Rtmp3);
2736
2737  srdi(Rcard_addr, Rstore_addr, CardTableModRefBS::card_shift);
2738
2739  // Get the address of the card.
2740  lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);
2741  cmpwi(CCR0, Rtmp3, (int)G1SATBCardTableModRefBS::g1_young_card_val());
2742  beq(CCR0, filtered);
2743
2744  membar(Assembler::StoreLoad);
2745  lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);  // Reload after membar.
2746  cmpwi(CCR0, Rtmp3 /* card value */, CardTableModRefBS::dirty_card_val());
2747  beq(CCR0, filtered);
2748
2749  // Storing a region crossing, non-NULL oop, card is clean.
2750  // Dirty card and log.
2751  li(Rtmp3, CardTableModRefBS::dirty_card_val());
2752  //release(); // G1: oops are allowed to get visible after dirty marking.
2753  stbx(Rtmp3, Rbase, Rcard_addr);
2754
2755  add(Rcard_addr, Rbase, Rcard_addr); // This is the address which needs to get enqueued.
2756  Rbase = noreg; // end of lifetime
2757
2758  const Register Rqueue_index = Rtmp2,
2759                 Rqueue_buf   = Rtmp3;
2760  ld(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + PtrQueue::byte_offset_of_index()), R16_thread);
2761  cmpdi(CCR0, Rqueue_index, 0);
2762  beq(CCR0, runtime); // If index == 0, goto runtime.
2763  ld(Rqueue_buf, in_bytes(JavaThread::dirty_card_queue_offset() + PtrQueue::byte_offset_of_buf()), R16_thread);
2764
2765  addi(Rqueue_index, Rqueue_index, -wordSize); // decrement index
2766  std(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + PtrQueue::byte_offset_of_index()), R16_thread);
2767
2768  stdx(Rcard_addr, Rqueue_buf, Rqueue_index); // store card
2769  b(filtered);
2770
2771  bind(runtime);
2772
2773  // Save the live input values.
2774  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), Rcard_addr, R16_thread);
2775
2776  bind(filtered_int);
2777}
2778#endif // INCLUDE_ALL_GCS
2779
2780// Values for last_Java_pc and last_Java_sp must comply with the rules
2781// in frame_ppc.hpp.
2782void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
2783  // Always set last_Java_pc and flags first because once last_Java_sp
2784  // is visible, has_last_Java_frame is true and users will look at the
2785  // rest of the fields. (Note: flags should always be zero before we
2786  // get here, so they don't need to be set.)
2787
2788  // Verify that last_Java_pc was zeroed on return to Java
2789  asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
2790                          "last_Java_pc not zeroed before leaving Java", 0x200);
2791
2792  // When returning from calling out from Java mode, the frame anchor's
2793  // last_Java_pc will always be set to NULL. It is set here so that,
2794  // if we are doing a call to native (not VM) code, we capture the
2795  // known pc and don't have to rely on the native call having a
2796  // standard frame linkage where we can find the pc.
2797  if (last_Java_pc != noreg)
2798    std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
2799
2800  // Set last_Java_sp last.
2801  std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
2802}
2803
2804void MacroAssembler::reset_last_Java_frame(void) {
2805  asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
2806                             R16_thread, "SP was not set, still zero", 0x202);
2807
2808  BLOCK_COMMENT("reset_last_Java_frame {");
2809  li(R0, 0);
2810
2811  // _last_Java_sp = 0
2812  std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
2813
2814  // _last_Java_pc = 0
2815  std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
2816  BLOCK_COMMENT("} reset_last_Java_frame");
2817}
2818
2819void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
2820  assert_different_registers(sp, tmp1);
2821
2822  // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
2823  // TOP_IJAVA_FRAME_ABI.
2824  // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
2825#ifdef CC_INTERP
2826  ld(tmp1/*pc*/, _top_ijava_frame_abi(frame_manager_lr), sp);
2827#else
2828  address entry = pc();
2829  load_const_optimized(tmp1, entry);
2830#endif
2831
2832  set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
2833}
2834
2835void MacroAssembler::get_vm_result(Register oop_result) {
2836  // Read:
2837  //   R16_thread
2838  //   R16_thread->in_bytes(JavaThread::vm_result_offset())
2839  //
2840  // Updated:
2841  //   oop_result
2842  //   R16_thread->in_bytes(JavaThread::vm_result_offset())
2843
2844  verify_thread();
2845
2846  ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
2847  li(R0, 0);
2848  std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);
2849
2850  verify_oop(oop_result);
2851}
2852
2853void MacroAssembler::get_vm_result_2(Register metadata_result) {
2854  // Read:
2855  //   R16_thread
2856  //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
2857  //
2858  // Updated:
2859  //   metadata_result
2860  //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
2861
2862  ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
2863  li(R0, 0);
2864  std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
2865}
2866
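// Compressed class pointer encoding, sketched in C for reference (illustrative;
// the actual constants come from Universe::narrow_klass_base()/shift()):
//   narrowKlass encode(Klass* k)      { return (narrowKlass)(((uintptr_t)k - base) >> shift); }
//   Klass*      decode(narrowKlass n) { return (Klass*)(((uintptr_t)n << shift) + base); }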
2867Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
2868  Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
2869  if (Universe::narrow_klass_base() != 0) {
2870    // Use dst as temp if it is free.
2871    sub_const_optimized(dst, current, Universe::narrow_klass_base(), R0);
2872    current = dst;
2873  }
2874  if (Universe::narrow_klass_shift() != 0) {
2875    srdi(dst, current, Universe::narrow_klass_shift());
2876    current = dst;
2877  }
2878  return current;
2879}
2880
2881void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
2882  if (UseCompressedClassPointers) {
2883    Register compressedKlass = encode_klass_not_null(ck, klass);
2884    stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
2885  } else {
2886    std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
2887  }
2888}
2889
2890void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
2891  if (UseCompressedClassPointers) {
2892    if (val == noreg) {
2893      val = R0;
2894      li(val, 0);
2895    }
2896    stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
2897  }
2898}
2899
2900int MacroAssembler::instr_size_for_decode_klass_not_null() {
2901  if (!UseCompressedClassPointers) return 0;
2902  int num_instrs = 1;  // shift or move
2903  if (Universe::narrow_klass_base() != 0) num_instrs = 7;  // shift + load const + add
2904  return num_instrs * BytesPerInstWord;
2905}
2906
2907void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
2908  assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
2909  if (src == noreg) src = dst;
2910  Register shifted_src = src;
2911  if (Universe::narrow_klass_shift() != 0 ||
2912      (Universe::narrow_klass_base() == 0 && src != dst)) {  // Move required.
2913    shifted_src = dst;
2914    sldi(shifted_src, src, Universe::narrow_klass_shift());
2915  }
2916  if (Universe::narrow_klass_base() != 0) {
2917    add_const_optimized(dst, shifted_src, Universe::narrow_klass_base(), R0);
2918  }
2919}
2920
2921void MacroAssembler::load_klass(Register dst, Register src) {
2922  if (UseCompressedClassPointers) {
2923    lwz(dst, oopDesc::klass_offset_in_bytes(), src);
2924    // Attention: no null check here!
2925    decode_klass_not_null(dst, dst);
2926  } else {
2927    ld(dst, oopDesc::klass_offset_in_bytes(), src);
2928  }
2929}
2930
2931void MacroAssembler::load_klass_with_trap_null_check(Register dst, Register src) {
2932  if (!os::zero_page_read_protected()) {
2933    if (TrapBasedNullChecks) {
2934      trap_null_check(src);
2935    }
2936  }
2937  load_klass(dst, src);
2938}
2939
2940void MacroAssembler::reinit_heapbase(Register d, Register tmp) {
2941  if (Universe::heap() != NULL) {
2942    load_const_optimized(R30, Universe::narrow_ptrs_base(), tmp);
2943  } else {
2944    // Heap not yet allocated. Load indirectly.
2945    int simm16_offset = load_const_optimized(R30, Universe::narrow_ptrs_base_addr(), tmp, true);
2946    ld(R30, simm16_offset, R30);
2947  }
2948}
2949
2950// Clear Array
2951// Kills both input registers. tmp == R0 is allowed.
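// Rough C outline of the strategy below (illustrative only; the real code also
// bails out to the simple loop for counts too small to cover a full cache line):
//   while (base_ptr is not cache-line aligned) { *base_ptr++ = 0; cnt_dwords--; }
//   while (cnt_dwords >= cl_dwords)            { dcbz(base_ptr); base_ptr += cl_dwords; cnt_dwords -= cl_dwords; }
//   while (cnt_dwords-- > 0)                   { *base_ptr++ = 0; }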
2952void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp) {
2953  // Procedure for large arrays (uses data cache block zero instruction).
2954    Label startloop, fast, fastloop, small_rest, restloop, done;
2955    const int cl_size         = VM_Version::get_cache_line_size(),
2956              cl_dwords       = cl_size>>3,
2957              cl_dw_addr_bits = exact_log2(cl_dwords),
2958              dcbz_min        = 1;                     // Min count of dcbz executions, needs to be >0.
2959
2960//2:
2961    cmpdi(CCR1, cnt_dwords, ((dcbz_min+1)<<cl_dw_addr_bits)-1); // Big enough? (ensure >=dcbz_min lines included).
2962    blt(CCR1, small_rest);                                      // Too small.
2963    rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits);           // Extract dword offset within first cache line.
2964    beq(CCR0, fast);                                            // Already 128byte aligned.
2965
2966    subfic(tmp, tmp, cl_dwords);
2967    mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
2968    subf(cnt_dwords, tmp, cnt_dwords); // rest.
2969    li(tmp, 0);
2970//10:
2971  bind(startloop);                     // Clear at the beginning to reach 128byte boundary.
2972    std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
2973    addi(base_ptr, base_ptr, 8);
2974    bdnz(startloop);
2975//13:
2976  bind(fast);                                  // Clear 128byte blocks.
2977    srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128byte loop (>0).
2978    andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
2979    mtctr(tmp);                                // Load counter.
2980//16:
2981  bind(fastloop);
2982    dcbz(base_ptr);                    // Clear 128byte aligned block.
2983    addi(base_ptr, base_ptr, cl_size);
2984    bdnz(fastloop);
2985    if (InsertEndGroupPPC64) { endgroup(); } else { nop(); }
2986//20:
2987  bind(small_rest);
2988    cmpdi(CCR0, cnt_dwords, 0);        // size 0?
2989    beq(CCR0, done);                   // rest == 0
2990    li(tmp, 0);
2991    mtctr(cnt_dwords);                 // Load counter.
2992//24:
2993  bind(restloop);                      // Clear rest.
2994    std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
2995    addi(base_ptr, base_ptr, 8);
2996    bdnz(restloop);
2997//27:
2998  bind(done);
2999}
3000
3001/////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3002
3003// Search for a single jchar in a jchar[].
3004//
3005// Assumes that result differs from all other registers.
3006//
3007// Haystack, needle are the addresses of jchar-arrays.
3008// NeedleChar is needle[0] if it is known at compile time.
3009// Haycnt is the length of the haystack. We assume haycnt >=1.
3010//
3011// Preserves haystack, haycnt, kills all other registers.
3012//
3013// If needle == R0, we search for the constant needleChar.
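// Java-like reference of what is emitted (illustrative only):
//   for (int i = 0; i < haycnt; i++) {
//     if (haystack[i] == needleChar) return i;
//   }
//   return -1;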
3014void MacroAssembler::string_indexof_1(Register result, Register haystack, Register haycnt,
3015                                      Register needle, jchar needleChar,
3016                                      Register tmp1, Register tmp2) {
3017
3018  assert_different_registers(result, haystack, haycnt, needle, tmp1, tmp2);
3019
3020  Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_Found3, L_NotFound, L_End;
3021  Register needle0 = needle, // Contains needle[0].
3022           addr = tmp1,
3023           ch1 = tmp2,
3024           ch2 = R0;
3025
3026//2 (variable) or 3 (const):
3027   if (needle != R0) lhz(needle0, 0, needle); // Preload needle character, needle has len==1.
3028   dcbtct(haystack, 0x00);                        // Indicate R/O access to haystack.
3029
3030   srwi_(tmp2, haycnt, 1);   // Shift right by exact_log2(UNROLL_FACTOR).
3031   mr(addr, haystack);
3032   beq(CCR0, L_FinalCheck);
3033   mtctr(tmp2);              // Move to count register.
3034//8:
3035  bind(L_InnerLoop);             // Main work horse (2x unrolled search loop).
3036   lhz(ch1, 0, addr);        // Load characters from haystack.
3037   lhz(ch2, 2, addr);
3038   (needle != R0) ? cmpw(CCR0, ch1, needle0) : cmplwi(CCR0, ch1, needleChar);
3039   (needle != R0) ? cmpw(CCR1, ch2, needle0) : cmplwi(CCR1, ch2, needleChar);
3040   beq(CCR0, L_Found1);   // Did we find the needle?
3041   beq(CCR1, L_Found2);
3042   addi(addr, addr, 4);
3043   bdnz(L_InnerLoop);
3044//16:
3045  bind(L_FinalCheck);
3046   andi_(R0, haycnt, 1);
3047   beq(CCR0, L_NotFound);
3048   lhz(ch1, 0, addr);        // One position left at which we have to compare.
3049   (needle != R0) ? cmpw(CCR1, ch1, needle0) : cmplwi(CCR1, ch1, needleChar);
3050   beq(CCR1, L_Found3);
3051//21:
3052  bind(L_NotFound);
3053   li(result, -1);           // Not found.
3054   b(L_End);
3055
3056  bind(L_Found2);
3057   addi(addr, addr, 2);
3058//24:
3059  bind(L_Found1);
3060  bind(L_Found3);                  // Return index ...
3061   subf(addr, haystack, addr); // relative to haystack,
3062   srdi(result, addr, 1);      // in characters.
3063  bind(L_End);
3064}
3065
3066
3067// Implementation of IndexOf for jchar arrays.
3068//
3069// The lengths of haystack and needle are not constant, i.e. they are passed in registers.
3070//
3071// Preserves registers haystack, needle.
3072// Kills registers haycnt, needlecnt.
3073// Assumes that result differs from all other registers.
3074// Haystack, needle are the addresses of jchar-arrays.
3075// Haycnt, needlecnt are the lengths of them, respectively.
3076//
3077// Needlecntval must be zero, or a 15-bit unsigned immediate greater than 1.
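// Java-like reference of what is emitted (illustrative only; the emitted code
// unrolls the comparison of the first two needle characters):
//   outer:
//   for (int i = 0; i <= haycnt - needlecnt; i++) {
//     for (int j = 0; j < needlecnt; j++) {
//       if (haystack[i + j] != needle[j]) continue outer;
//     }
//     return i;
//   }
//   return -1;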
3078void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt,
3079                                    Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval,
3080                                    Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
3081
3082  // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite!
3083  Label L_TooShort, L_Found, L_NotFound, L_End;
3084  Register last_addr = haycnt, // Kill haycnt at the beginning.
3085           addr      = tmp1,
3086           n_start   = tmp2,
3087           ch1       = tmp3,
3088           ch2       = R0;
3089
3090  // **************************************************************************************************
3091  // Prepare for main loop: optimized for needle count >=2, bail out otherwise.
3092  // **************************************************************************************************
3093
3094//1 (variable) or 3 (const):
3095   dcbtct(needle, 0x00);    // Indicate R/O access to str1.
3096   dcbtct(haystack, 0x00);  // Indicate R/O access to str2.
3097
3098  // Compute last haystack addr to use if no match gets found.
3099  if (needlecntval == 0) { // variable needlecnt
3100//3:
3101   subf(ch1, needlecnt, haycnt);      // Last character index to compare is haycnt-needlecnt.
3102   addi(addr, haystack, -2);          // Accesses use pre-increment.
3103   cmpwi(CCR6, needlecnt, 2);
3104   blt(CCR6, L_TooShort);          // Variable needlecnt: handle short needle separately.
3105   slwi(ch1, ch1, 1);                 // Scale to number of bytes.
3106   lwz(n_start, 0, needle);           // Load first 2 characters of needle.
3107   add(last_addr, haystack, ch1);     // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
3108   addi(needlecnt, needlecnt, -2);    // Rest of needle.
3109  } else { // constant needlecnt
3110  guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately");
3111  assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate");
3112//5:
3113   addi(ch1, haycnt, -needlecntval);  // Last character index to compare is haycnt-needlecnt.
3114   lwz(n_start, 0, needle);           // Load first 2 characters of needle.
3115   addi(addr, haystack, -2);          // Accesses use pre-increment.
3116   slwi(ch1, ch1, 1);                 // Scale to number of bytes.
3117   add(last_addr, haystack, ch1);     // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
3118   li(needlecnt, needlecntval-2);     // Rest of needle.
3119  }
3120
3121  // Main Loop (now we have at least 3 characters).
3122//11:
3123  Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2, L_Comp3;
3124  bind(L_OuterLoop); // Search for 1st 2 characters.
3125  Register addr_diff = tmp4;
3126   subf(addr_diff, addr, last_addr); // Difference between already checked address and last address to check.
3127   addi(addr, addr, 2);              // This is the new address we want to use for comparing.
3128   srdi_(ch2, addr_diff, 2);
3129   beq(CCR0, L_FinalCheck);       // 2 characters left?
3130   mtctr(ch2);                       // addr_diff/4
3131//16:
3132  bind(L_InnerLoop);                // Main work horse (2x unrolled search loop)
3133   lwz(ch1, 0, addr);           // Load 2 characters of haystack (ignore alignment).
3134   lwz(ch2, 2, addr);
3135   cmpw(CCR0, ch1, n_start); // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop).
3136   cmpw(CCR1, ch2, n_start);
3137   beq(CCR0, L_Comp1);       // Did we find the needle start?
3138   beq(CCR1, L_Comp2);
3139   addi(addr, addr, 4);
3140   bdnz(L_InnerLoop);
3141//24:
3142  bind(L_FinalCheck);
3143   rldicl_(addr_diff, addr_diff, 64-1, 63); // Remaining characters not covered by InnerLoop: (addr_diff>>1)&1.
3144   beq(CCR0, L_NotFound);
3145   lwz(ch1, 0, addr);                       // One position left at which we have to compare.
3146   cmpw(CCR1, ch1, n_start);
3147   beq(CCR1, L_Comp3);
3148//29:
3149  bind(L_NotFound);
3150   li(result, -1); // not found
3151   b(L_End);
3152
3153
3154   // **************************************************************************************************
3155   // Special Case: unfortunately, the variable needle case can be called with needlecnt<2
3156   // **************************************************************************************************
3157//31:
3158 if ((needlecntval >> 1) != 1) { // Skip this block if const needlecnt is 2 or 3 (reduces code size).
3159  int nopcnt = 5;
3160  if (needlecntval != 0) ++nopcnt; // Balance alignment (other case: see below).
3161  if (needlecntval == 0) {         // We have to handle these cases separately.
3162  Label L_OneCharLoop;
3163  bind(L_TooShort);
3164   mtctr(haycnt);
3165   lhz(n_start, 0, needle);    // First character of needle
3166  bind(L_OneCharLoop);
3167   lhzu(ch1, 2, addr);
3168   cmpw(CCR1, ch1, n_start);
3169   beq(CCR1, L_Found);      // Did we find the one character needle?
3170   bdnz(L_OneCharLoop);
3171   li(result, -1);             // Not found.
3172   b(L_End);
3173  } // 8 instructions, so no impact on alignment.
3174  for (int x = 0; x < nopcnt; ++x) nop();
3175 }
3176
3177  // **************************************************************************************************
3178  // Regular Case Part II: compare rest of needle (first 2 characters have been compared already)
3179  // **************************************************************************************************
3180
3181  // Compare the rest
3182//36 if needlecntval==0, else 37:
3183  bind(L_Comp2);
3184   addi(addr, addr, 2); // First comparison has failed, 2nd one hit.
3185  bind(L_Comp1);            // Addr points to possible needle start.
3186  bind(L_Comp3);            // Could have created a copy and used a different return address, but we save code size here.
3187  if (needlecntval != 2) {  // Const needlecnt==2?
3188   if (needlecntval != 3) {
3189    if (needlecntval == 0) beq(CCR6, L_Found); // Variable needlecnt==2?
3190    Register ind_reg = tmp4;
3191    li(ind_reg, 2*2);   // First 2 characters are already compared, use index 2.
3192    mtctr(needlecnt);   // Decremented by 2, still > 0.
3193//40:
3194   Label L_CompLoop;
3195   bind(L_CompLoop);
3196    lhzx(ch2, needle, ind_reg);
3197    lhzx(ch1, addr, ind_reg);
3198    cmpw(CCR1, ch1, ch2);
3199    bne(CCR1, L_OuterLoop);
3200    addi(ind_reg, ind_reg, 2);
3201    bdnz(L_CompLoop);
3202   } else { // No loop required if there's only one needle character left.
3203    lhz(ch2, 2*2, needle);
3204    lhz(ch1, 2*2, addr);
3205    cmpw(CCR1, ch1, ch2);
3206    bne(CCR1, L_OuterLoop);
3207   }
3208  }
3209  // Return index ...
3210//46:
3211  bind(L_Found);
3212   subf(addr, haystack, addr); // relative to haystack, ...
3213   srdi(result, addr, 1);      // in characters.
3214//48:
3215  bind(L_End);
3216}
3217
3218// Implementation of Compare for jchar arrays.
3219//
3220// Kills the registers str1, str2, cnt1, cnt2.
3221// Kills cr0, ctr.
3222// Assumes that result differs from the input registers.
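// Java-like reference of what is emitted (illustrative only):
//   int lim = Math.min(cnt1, cnt2);
//   for (int i = 0; i < lim; i++) {
//     if (str1[i] != str2[i]) return str1[i] - str2[i];
//   }
//   return cnt1 - cnt2;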
3223void MacroAssembler::string_compare(Register str1_reg, Register str2_reg, Register cnt1_reg, Register cnt2_reg,
3224                                    Register result_reg, Register tmp_reg) {
3225   assert_different_registers(result_reg, str1_reg, str2_reg, cnt1_reg, cnt2_reg, tmp_reg);
3226
3227   Label Ldone, Lslow_case, Lslow_loop, Lfast_loop;
3228   Register cnt_diff = R0,
3229            limit_reg = cnt1_reg,
3230            chr1_reg = result_reg,
3231            chr2_reg = cnt2_reg,
3232            addr_diff = str2_reg;
3233
3234   // Offset 0 should be 32 byte aligned.
3235//-4:
3236    dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
3237    dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
3238//-2:
3239   // Compute min(cnt1, cnt2) and check if 0 (bail out if we don't need to compare characters).
3240    subf(result_reg, cnt2_reg, cnt1_reg);  // difference between cnt1/2
3241    subf_(addr_diff, str1_reg, str2_reg);  // alias?
3242    beq(CCR0, Ldone);                   // return cnt difference if both ones are identical
3243    srawi(limit_reg, result_reg, 31);      // generate signmask (cnt1/2 must be non-negative so cnt_diff can't overflow)
3244    mr(cnt_diff, result_reg);
3245    andr(limit_reg, result_reg, limit_reg); // difference or zero (negative): cnt1<cnt2 ? cnt1-cnt2 : 0
3246    add_(limit_reg, cnt2_reg, limit_reg);  // min(cnt1, cnt2)==0?
3247    beq(CCR0, Ldone);                   // return cnt difference if one has 0 length
3248
3249    lhz(chr1_reg, 0, str1_reg);            // optional: early out if first characters mismatch
3250    lhzx(chr2_reg, str1_reg, addr_diff);   // optional: early out if first characters mismatch
3251    addi(tmp_reg, limit_reg, -1);          // min(cnt1, cnt2)-1
3252    subf_(result_reg, chr2_reg, chr1_reg); // optional: early out if first characters mismatch
3253    bne(CCR0, Ldone);                   // optional: early out if first characters mismatch
3254
3255   // Set loop counter by scaling down tmp_reg
3256    srawi_(chr2_reg, tmp_reg, exact_log2(4)); // (min(cnt1, cnt2)-1)/4
3257    ble(CCR0, Lslow_case);                 // need >4 characters for fast loop
3258    andi(limit_reg, tmp_reg, 4-1);            // remaining characters
3259
3260   // Adapt str1_reg str2_reg for the first loop iteration
3261    mtctr(chr2_reg);                 // (min(cnt1, cnt2)-1)/4
3262    addi(limit_reg, limit_reg, 4+1); // compare last 5-8 characters in slow_case if mismatch found in fast_loop
3263//16:
3264   // Compare the rest of the characters
3265   bind(Lfast_loop);
3266    ld(chr1_reg, 0, str1_reg);
3267    ldx(chr2_reg, str1_reg, addr_diff);
3268    cmpd(CCR0, chr2_reg, chr1_reg);
3269    bne(CCR0, Lslow_case); // return chr1_reg
3270    addi(str1_reg, str1_reg, 4*2);
3271    bdnz(Lfast_loop);
3272    addi(limit_reg, limit_reg, -4); // no mismatch found in fast_loop, only 1-4 characters missing
3273//23:
3274   bind(Lslow_case);
3275    mtctr(limit_reg);
3276//24:
3277   bind(Lslow_loop);
3278    lhz(chr1_reg, 0, str1_reg);
3279    lhzx(chr2_reg, str1_reg, addr_diff);
3280    subf_(result_reg, chr2_reg, chr1_reg);
3281    bne(CCR0, Ldone); // return chr1_reg
3282    addi(str1_reg, str1_reg, 1*2);
3283    bdnz(Lslow_loop);
3284//30:
3285   // If strings are equal up to min length, return the length difference.
3286    mr(result_reg, cnt_diff);
3287    nop(); // alignment
3288//32:
3289   // Otherwise, return the difference between the first mismatched chars.
3290   bind(Ldone);
3291}
3292
3293
3294// Compare char[] arrays.
3295//
3296// str1_reg   USE only
3297// str2_reg   USE only
3298// cnt_reg    USE_DEF, due to tmp reg shortage
3299// result_reg DEF only, might compromise USE only registers
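// Java-like reference of what is emitted (illustrative only; cnt is the number
// of jchars, and the main loop below is 4x unrolled via 8-byte loads):
//   for (int i = 0; i < cnt; i++) {
//     if (str1[i] != str2[i]) return false;
//   }
//   return true;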
3300void MacroAssembler::char_arrays_equals(Register str1_reg, Register str2_reg, Register cnt_reg, Register result_reg,
3301                                        Register tmp1_reg, Register tmp2_reg, Register tmp3_reg, Register tmp4_reg,
3302                                        Register tmp5_reg) {
3303
3304  // Str1 may be the same register as str2, which can occur e.g. after scalar replacement.
3305  assert_different_registers(result_reg, str1_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg);
3306  assert_different_registers(result_reg, str2_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg);
3307
3308  // Offset 0 should be 32 byte aligned.
3309  Label Linit_cbc, Lcbc, Lloop, Ldone_true, Ldone_false;
3310  Register index_reg = tmp5_reg;
3311  Register cbc_iter  = tmp4_reg;
3312
3313//-1:
3314  dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
3315  dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
3316//1:
3317  andi(cbc_iter, cnt_reg, 4-1);            // Remaining iterations after 4 java characters per iteration loop.
3318  li(index_reg, 0); // init
3319  li(result_reg, 0); // assume false
3320  srwi_(tmp2_reg, cnt_reg, exact_log2(4)); // Div: 4 java characters per iteration (main loop).
3321
3322  cmpwi(CCR1, cbc_iter, 0);             // CCR1 = (cbc_iter==0)
3323  beq(CCR0, Linit_cbc);                 // too short
3324    mtctr(tmp2_reg);
3325//8:
3326    bind(Lloop);
3327      ldx(tmp1_reg, str1_reg, index_reg);
3328      ldx(tmp2_reg, str2_reg, index_reg);
3329      cmpd(CCR0, tmp1_reg, tmp2_reg);
3330      bne(CCR0, Ldone_false);  // Unequal char pair found -> done.
3331      addi(index_reg, index_reg, 4*sizeof(jchar));
3332      bdnz(Lloop);
3333//14:
3334  bind(Linit_cbc);
3335  beq(CCR1, Ldone_true);
3336    mtctr(cbc_iter);
3337//16:
3338    bind(Lcbc);
3339      lhzx(tmp1_reg, str1_reg, index_reg);
3340      lhzx(tmp2_reg, str2_reg, index_reg);
3341      cmpw(CCR0, tmp1_reg, tmp2_reg);
3342      bne(CCR0, Ldone_false);  // Unequal char pair found -> done.
3343      addi(index_reg, index_reg, 1*sizeof(jchar));
3344      bdnz(Lcbc);
3345    nop();
3346  bind(Ldone_true);
3347  li(result_reg, 1);
3348//24:
3349  bind(Ldone_false);
3350}
3351
3352
3353void MacroAssembler::char_arrays_equalsImm(Register str1_reg, Register str2_reg, int cntval, Register result_reg,
3354                                           Register tmp1_reg, Register tmp2_reg) {
3355  // Str1 may be the same register as str2, which can occur e.g. after scalar replacement.
3356  assert_different_registers(result_reg, str1_reg, tmp1_reg, tmp2_reg);
3357  assert_different_registers(result_reg, str2_reg, tmp1_reg, tmp2_reg);
3358  assert(sizeof(jchar) == 2, "must be");
3359  assert(cntval >= 0 && ((cntval & 0x7fff) == cntval), "wrong immediate");
3360
3361  Label Ldone_false;
3362
3363  if (cntval < 16) { // short case
3364    if (cntval != 0) li(result_reg, 0); // assume false
3365
3366    const int num_bytes = cntval*sizeof(jchar);
3367    int index = 0;
3368    for (int next_index; (next_index = index + 8) <= num_bytes; index = next_index) {
3369      ld(tmp1_reg, index, str1_reg);
3370      ld(tmp2_reg, index, str2_reg);
3371      cmpd(CCR0, tmp1_reg, tmp2_reg);
3372      bne(CCR0, Ldone_false);
3373    }
3374    if (cntval & 2) {
3375      lwz(tmp1_reg, index, str1_reg);
3376      lwz(tmp2_reg, index, str2_reg);
3377      cmpw(CCR0, tmp1_reg, tmp2_reg);
3378      bne(CCR0, Ldone_false);
3379      index += 4;
3380    }
3381    if (cntval & 1) {
3382      lhz(tmp1_reg, index, str1_reg);
3383      lhz(tmp2_reg, index, str2_reg);
3384      cmpw(CCR0, tmp1_reg, tmp2_reg);
3385      bne(CCR0, Ldone_false);
3386    }
3387    // fallthrough: true
3388  } else {
3389    Label Lloop;
3390    Register index_reg = tmp1_reg;
3391    const int loopcnt = cntval/4;
3392    assert(loopcnt > 0, "must be");
3393    // Offset 0 should be 32 byte aligned.
3394    //2:
3395    dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
3396    dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
3397    li(tmp2_reg, loopcnt);
3398    li(index_reg, 0); // init
3399    li(result_reg, 0); // assume false
3400    mtctr(tmp2_reg);
3401    //8:
3402    bind(Lloop);
3403    ldx(R0, str1_reg, index_reg);
3404    ldx(tmp2_reg, str2_reg, index_reg);
3405    cmpd(CCR0, R0, tmp2_reg);
3406    bne(CCR0, Ldone_false);  // Unequal char pair found -> done.
3407    addi(index_reg, index_reg, 4*sizeof(jchar));
3408    bdnz(Lloop);
3409    //14:
3410    if (cntval & 2) {
3411      lwzx(R0, str1_reg, index_reg);
3412      lwzx(tmp2_reg, str2_reg, index_reg);
3413      cmpw(CCR0, R0, tmp2_reg);
3414      bne(CCR0, Ldone_false);
3415      if (cntval & 1) addi(index_reg, index_reg, 2*sizeof(jchar));
3416    }
3417    if (cntval & 1) {
3418      lhzx(R0, str1_reg, index_reg);
3419      lhzx(tmp2_reg, str2_reg, index_reg);
3420      cmpw(CCR0, R0, tmp2_reg);
3421      bne(CCR0, Ldone_false);
3422    }
3423    // fallthru: true
3424  }
3425  li(result_reg, 1);
3426  bind(Ldone_false);
3427}
3428
3429// Helpers for Intrinsic Emitters
3430//
3431// Revert the byte order of a 32bit value in a register
3432//   src: 0x44556677
3433//   dst: 0x77665544
3434// Three steps to obtain the result:
3435//  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
3436//     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
3437//     This value initializes dst.
3438//  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
3439//     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
3440//     This value is mask inserted into dst with a [0..23] mask of 1s.
3441//  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
3442//     This value is mask inserted into dst with a [8..15] mask of 1s.
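// For reference, the C equivalent of the byte swap performed below (illustrative):
//   dst = ((src & 0xff000000) >> 24) | ((src & 0x00ff0000) >> 8) |
//         ((src & 0x0000ff00) <<  8) | ((src & 0x000000ff) << 24);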
3443void MacroAssembler::load_reverse_32(Register dst, Register src) {
3444  assert_different_registers(dst, src);
3445
3446  rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
3447  rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
3448  rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
3449}
3450
3451// Calculate the column addresses of the crc32 lookup table into distinct registers.
3452// This loop-invariant calculation is moved out of the loop body, reducing the loop
3453// body size from 20 to 16 instructions.
3454// Returns the offset that was used to calculate the address of column tc3.
3455// Due to register shortage, setting tc3 may overwrite table. With the return offset
3456// at hand, the original table address can be easily reconstructed.
3457int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
3458
3459#ifdef VM_LITTLE_ENDIAN
3460  // This is what we implement (the DOLIT4 part):
3461  // ========================================================================= */
3462  // #define DOLIT4 c ^= *buf4++; \
3463  //         c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
3464  //             crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
3465  // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
3466  // ========================================================================= */
3467  const int ix0 = 3*(4*CRC32_COLUMN_SIZE);
3468  const int ix1 = 2*(4*CRC32_COLUMN_SIZE);
3469  const int ix2 = 1*(4*CRC32_COLUMN_SIZE);
3470  const int ix3 = 0*(4*CRC32_COLUMN_SIZE);
3471#else
3472  // This is what we implement (the DOBIG4 part):
3473  // =========================================================================
3474  // #define DOBIG4 c ^= *++buf4; \
3475  //         c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
3476  //             crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
3477  // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
3478  // =========================================================================
3479  const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
3480  const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
3481  const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
3482  const int ix3 = 7*(4*CRC32_COLUMN_SIZE);
3483#endif
3484  assert_different_registers(table, tc0, tc1, tc2);
3485  assert(table == tc3, "must be!");
3486
3487  if (ix0 != 0) addi(tc0, table, ix0);
3488  if (ix1 != 0) addi(tc1, table, ix1);
3489  if (ix2 != 0) addi(tc2, table, ix2);
3490  if (ix3 != 0) addi(tc3, table, ix3);
3491
3492  return ix3;
3493}
3494
3495/**
3496 * uint32_t crc;
3497 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
3498 */
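// Note (illustrative): the rlwinm below forms ((val & 0xff) << 2), i.e. the byte
// value scaled to a 4-byte table-entry offset, so the net effect is
//   crc = table[val & 0xff] ^ (crc >> 8);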
3499void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
3500  assert_different_registers(crc, table, tmp);
3501  assert_different_registers(val, table);
3502
3503  if (crc == val) {                   // Must rotate first to use the unmodified value.
3504    rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3505                                      // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
3506    srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3507  } else {
3508    srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3509    rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3510  }
3511  lwzx(tmp, table, tmp);
3512  xorr(crc, crc, tmp);
3513}
3514
3515/**
3516 * uint32_t crc;
3517 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
3518 */
3519void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
3520  fold_byte_crc32(crc, crc, table, tmp);
3521}
3522
3523/**
3524 * Emits code to update CRC-32 with a byte value according to constants in table.
3525 *
3526 * @param [in,out]crc   Register containing the crc.
3527 * @param [in]val       Register containing the byte to fold into the CRC.
3528 * @param [in]table     Register containing the table of crc constants.
3529 *
3530 * uint32_t crc;
3531 * val = crc_table[(val ^ crc) & 0xFF];
3532 * crc = val ^ (crc >> 8);
3533 */
3534void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3535  BLOCK_COMMENT("update_byte_crc32:");
3536  xorr(val, val, crc);
3537  fold_byte_crc32(crc, val, table, val);
3538}
3539
3540/**
3541 * @param crc   register containing existing CRC (32-bit)
3542 * @param buf   register pointing to input byte buffer (byte*)
3543 * @param len   register containing number of bytes
3544 * @param table register pointing to CRC table
3545 */
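// C-like sketch of the emitted loop (illustrative only; update_byte_crc32 stands
// for the single-byte update defined above):
//   if (invertCRC) crc = ~crc;
//   while (len-- > 0) { crc = update_byte_crc32(crc, *buf++, table); }
//   if (invertCRC) crc = ~crc;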
3546void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
3547                                           Register data, bool loopAlignment, bool invertCRC) {
3548  assert_different_registers(crc, buf, len, table, data);
3549
3550  Label L_mainLoop, L_done;
3551  const int mainLoop_stepping  = 1;
3552  const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
3553
3554  // Process all bytes in a single-byte loop.
3555  cmpdi(CCR0, len, 0);                           // Anything to do?
3556  mtctr(len);
3557  beq(CCR0, L_done);
3558
3559  if (invertCRC) {
3560    nand(crc, crc, crc);                         // ~c
3561  }
3562
3563  align(mainLoop_alignment);
3564  BIND(L_mainLoop);
3565    lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
3566    addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
3567    update_byte_crc32(crc, data, table);
3568    bdnz(L_mainLoop);                            // Iterate.
3569
3570  if (invertCRC) {
3571    nand(crc, crc, crc);                         // ~c
3572  }
3573
3574  bind(L_done);
3575}
3576
3577/**
3578 * Emits code to update CRC-32 with a 4-byte value according to constants in table
3579 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
3580 */
3581// A note on the lookup table address(es):
3582// The lookup table consists of two sets of four columns each.
3583// The columns {0..3} are used for little-endian machines.
3584// The columns {4..7} are used for big-endian machines.
3585// To save the effort of adding the column offset to the table address each time
3586// a table element is looked up, it is possible to pass the pre-calculated
3587// column addresses.
3588// Uses R9..R12 as work registers. They must be saved/restored by the caller, if necessary.
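// C-like sketch of one iteration (illustrative; tc0..tc3 are the pre-computed
// column pointers set up by crc32_table_columns):
//   c ^= *(uint32_t*)buf;
//   c  = tc0[c & 0xff] ^ tc1[(c >> 8) & 0xff] ^ tc2[(c >> 16) & 0xff] ^ tc3[c >> 24];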
3589void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
3590                                        Register t0,  Register t1,  Register t2,  Register t3,
3591                                        Register tc0, Register tc1, Register tc2, Register tc3) {
3592  assert_different_registers(crc, t3);
3593
3594  // XOR crc with next four bytes of buffer.
3595  lwz(t3, bufDisp, buf);
3596  if (bufInc != 0) {
3597    addi(buf, buf, bufInc);
3598  }
3599  xorr(t3, t3, crc);
3600
3601  // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
3602  rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t1 >>  0) & 0xff) << 2
3603  rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t1 >>  8) & 0xff) << 2
3604  rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t1 >> 16) & 0xff) << 2
3605  rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t1 >> 24) & 0xff) << 2
3606
3607  // Use the pre-calculated column addresses.
3608  // Load pre-calculated table values.
3609  lwzx(t0, tc0, t0);
3610  lwzx(t1, tc1, t1);
3611  lwzx(t2, tc2, t2);
3612  lwzx(t3, tc3, t3);
3613
3614  // Calculate new crc from table values.
3615  xorr(t0,  t0, t1);
3616  xorr(t2,  t2, t3);
3617  xorr(crc, t0, t2);  // Now crc contains the final checksum value.
3618}
3619
3620/**
3621 * @param crc   register containing existing CRC (32-bit)
3622 * @param buf   register pointing to input byte buffer (byte*)
3623 * @param len   register containing number of bytes
3624 * @param table register pointing to CRC table
3625 *
3626 * Uses R9..R12 as work registers. They must be saved/restored by the caller!
3627 */
3628void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
3629                                        Register t0,  Register t1,  Register t2,  Register t3,
3630                                        Register tc0, Register tc1, Register tc2, Register tc3) {
3631  assert_different_registers(crc, buf, len, table);
3632
3633  Label L_mainLoop, L_tail;
3634  Register  tmp  = t0;
3635  Register  data = t0;
3636  Register  tmp2 = t1;
3637  const int mainLoop_stepping  = 8;
3638  const int tailLoop_stepping  = 1;
3639  const int log_stepping       = exact_log2(mainLoop_stepping);
3640  const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
3641  const int complexThreshold   = 2*mainLoop_stepping;
3642
3643  // Don't test for len <= 0 here. This pathological case should not occur anyway.
3644  // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
3645  // The situation itself is detected and handled correctly by the conditional branches
3646  // following the length checks and adjustments below.
3647  assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
3648
3649  BLOCK_COMMENT("kernel_crc32_2word {");
3650
3651  nand(crc, crc, crc);                           // ~c
3652
3653  // Check for short (<mainLoop_stepping) buffer.
3654  cmpdi(CCR0, len, complexThreshold);
3655  blt(CCR0, L_tail);
3656
3657  // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
3658  // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
3659  {
3660    // Align buf addr to mainLoop_stepping boundary.
3661    neg(tmp2, buf);                           // Calculate # preLoop iterations for alignment.
3662    rldicl(tmp2, tmp2, 0, 64-log_stepping);   // Keep only the low log_stepping bits: tmp2 = (-buf) & (mainLoop_stepping-1).
3663
3664    if (complexThreshold > mainLoop_stepping) {
3665      sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3666    } else {
3667      sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
3668      cmpdi(CCR0, tmp, mainLoop_stepping);
3669      blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
3670      mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3671    }
3672    update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
3673  }
3674
3675  srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
3676  andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
3677  mtctr(tmp2);
3678
3679#ifdef VM_LITTLE_ENDIAN
3680  Register crc_rv = crc;
3681#else
3682  Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
3683                                                 // Occupies tmp, but frees up crc.
3684  load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
3685  tmp = crc;
3686#endif
3687
3688  int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3689
3690  align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
3691  BIND(L_mainLoop);
3692    update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3693    update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3694    bdnz(L_mainLoop);
3695
3696#ifndef VM_LITTLE_ENDIAN
3697  load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
3698  tmp = crc_rv;                                  // Tmp uses its original register again.
3699#endif
3700
3701  // Restore original table address for tailLoop.
3702  if (reconstructTableOffset != 0) {
3703    addi(table, table, -reconstructTableOffset);
3704  }
3705
3706  // Process last few (<complexThreshold) bytes of buffer.
3707  BIND(L_tail);
3708  update_byteLoop_crc32(crc, buf, len, table, data, false, false);
3709
3710  nand(crc, crc, crc);                           // ~c
3711  BLOCK_COMMENT("} kernel_crc32_2word");
3712}
3713
3714/**
3715 * @param crc   register containing existing CRC (32-bit)
3716 * @param buf   register pointing to input byte buffer (byte*)
3717 * @param len   register containing number of bytes
3718 * @param table register pointing to CRC table
3719 *
3720 * Uses R9..R12 as work registers. They must be saved/restored by the caller!
3721 */
3722void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
3723                                        Register t0,  Register t1,  Register t2,  Register t3,
3724                                        Register tc0, Register tc1, Register tc2, Register tc3) {
3725  assert_different_registers(crc, buf, len, table);
3726
3727  Label L_mainLoop, L_tail;
3728  Register  tmp          = t0;
3729  Register  data         = t0;
3730  Register  tmp2         = t1;
3731  const int mainLoop_stepping  = 4;
3732  const int tailLoop_stepping  = 1;
3733  const int log_stepping       = exact_log2(mainLoop_stepping);
3734  const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
3735  const int complexThreshold   = 2*mainLoop_stepping;
3736
3737  // Don't test for len <= 0 here. This pathological case should not occur anyway.
3738  // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
3739  // The situation itself is detected and handled correctly by the conditional branches
3740  // following the length checks and adjustments below.
3741  assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
3742
3743  BLOCK_COMMENT("kernel_crc32_1word {");
3744
3745  nand(crc, crc, crc);                           // ~c
3746
3747  // Check for short (<mainLoop_stepping) buffer.
3748  cmpdi(CCR0, len, complexThreshold);
3749  blt(CCR0, L_tail);
3750
3751  // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
3752  // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
3753  {
3754    // Align buf addr to mainLoop_stepping boundary.
3755    neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
3756    rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63.
3757
3758    if (complexThreshold > mainLoop_stepping) {
3759      sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3760    } else {
3761      sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
3762      cmpdi(CCR0, tmp, mainLoop_stepping);
3763      blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
3764      mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3765    }
3766    update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
3767  }
3768
3769  srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
3770  andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
3771  mtctr(tmp2);
3772
3773#ifdef VM_LITTLE_ENDIAN
3774  Register crc_rv = crc;
3775#else
3776  Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
3777                                                 // Occupies tmp, but frees up crc.
3778  load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
3779  tmp = crc;
3780#endif
3781
3782  int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3783
3784  align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
3785  BIND(L_mainLoop);
3786    update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3787    bdnz(L_mainLoop);
3788
3789#ifndef VM_LITTLE_ENDIAN
3790  load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
3791  tmp = crc_rv;                                  // Tmp uses its original register again.
3792#endif
3793
3794  // Restore original table address for tailLoop.
3795  if (reconstructTableOffset != 0) {
3796    addi(table, table, -reconstructTableOffset);
3797  }
3798
3799  // Process last few (<complexThreshold) bytes of buffer.
3800  BIND(L_tail);
3801  update_byteLoop_crc32(crc, buf, len, table, data, false, false);
3802
3803  nand(crc, crc, crc);                           // ~c
3804  BLOCK_COMMENT("} kernel_crc32_1word");
3805}
3806
3807/**
3808 * @param crc   register containing existing CRC (32-bit)
3809 * @param buf   register pointing to input byte buffer (byte*)
3810 * @param len   register containing number of bytes
3811 * @param table register pointing to CRC table
3812 *
3813 * Uses R7_ARG5, R8_ARG6 as work registers.
3814 */
3815void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
3816                                        Register t0,  Register t1,  Register t2,  Register t3) {
3817  assert_different_registers(crc, buf, len, table);
3818
3819  Register  data = t0;                   // Holds the current byte to be folded into crc.
3820
3821  BLOCK_COMMENT("kernel_crc32_1byte {");
3822
3823  // Process all bytes in a single-byte loop.
3824  update_byteLoop_crc32(crc, buf, len, table, data, true, true);
3825
3826  BLOCK_COMMENT("} kernel_crc32_1byte");
3827}
3828
3829void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp) {
3830  assert_different_registers(crc, buf, /* len,  not used!! */ table, tmp);
3831
3832  BLOCK_COMMENT("kernel_crc32_singleByte:");
3833  nand(crc, crc, crc);       // ~c
3834
3835  lbz(tmp, 0, buf);          // Byte from buffer, zero-extended.
3836  update_byte_crc32(crc, tmp, table);
3837
3838  nand(crc, crc, crc);       // ~c
3839}
3840
3841// dest_lo += src1 + src2
3842// dest_hi += carry1 + carry2
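// Viewed as 128-bit arithmetic (illustrative):
//   (dest_hi : dest_lo) += (0 : src1);
//   (dest_hi : dest_lo) += (0 : src2);
// Each carry out of the low doubleword is propagated into dest_hi via addc/adde.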
3843void MacroAssembler::add2_with_carry(Register dest_hi,
3844                                     Register dest_lo,
3845                                     Register src1, Register src2) {
3846  li(R0, 0);
3847  addc(dest_lo, dest_lo, src1);
3848  adde(dest_hi, dest_hi, R0);
3849  addc(dest_lo, dest_lo, src2);
3850  adde(dest_hi, dest_hi, R0);
3851}
3852
3853// Multiply 64 bit by 64 bit first loop.
3854void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
3855                                           Register x_xstart,
3856                                           Register y, Register y_idx,
3857                                           Register z,
3858                                           Register carry,
3859                                           Register product_high, Register product,
3860                                           Register idx, Register kdx,
3861                                           Register tmp) {
3862  //  jlong carry, x[], y[], z[];
3863  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3864  //    huge_128 product = y[idx] * x[xstart] + carry;
3865  //    z[kdx] = (jlong)product;
3866  //    carry  = (jlong)(product >>> 64);
3867  //  }
3868  //  z[xstart] = carry;
3869
3870  Label L_first_loop, L_first_loop_exit;
3871  Label L_one_x, L_one_y, L_multiply;
3872
3873  addic_(xstart, xstart, -1);
3874  blt(CCR0, L_one_x);   // Special case: length of x is 1.
3875
3876  // Load next two integers of x.
3877  sldi(tmp, xstart, LogBytesPerInt);
3878  ldx(x_xstart, x, tmp);
3879#ifdef VM_LITTLE_ENDIAN
3880  rldicl(x_xstart, x_xstart, 32, 0);
3881#endif
3882
3883  align(32, 16);
3884  bind(L_first_loop);
3885
3886  cmpdi(CCR0, idx, 1);
3887  blt(CCR0, L_first_loop_exit);
3888  addi(idx, idx, -2);
3889  beq(CCR0, L_one_y);
3890
3891  // Load next two integers of y.
3892  sldi(tmp, idx, LogBytesPerInt);
3893  ldx(y_idx, y, tmp);
3894#ifdef VM_LITTLE_ENDIAN
3895  rldicl(y_idx, y_idx, 32, 0);
3896#endif
3897
3898
3899  bind(L_multiply);
3900  multiply64(product_high, product, x_xstart, y_idx);
3901
3902  li(tmp, 0);
3903  addc(product, product, carry);         // Add carry to result.
3904  adde(product_high, product_high, tmp); // Add carry of the last addition.
3905  addi(kdx, kdx, -2);
3906
3907  // Store result.
3908#ifdef VM_LITTLE_ENDIAN
3909  rldicl(product, product, 32, 0);
3910#endif
3911  sldi(tmp, kdx, LogBytesPerInt);
3912  stdx(product, z, tmp);
3913  mr_if_needed(carry, product_high);
3914  b(L_first_loop);
3915
3916
3917  bind(L_one_y); // Load one 32 bit portion of y as (0,value).
3918
3919  lwz(y_idx, 0, y);
3920  b(L_multiply);
3921
3922
3923  bind(L_one_x); // Load one 32 bit portion of x as (0,value).
3924
3925  lwz(x_xstart, 0, x);
3926  b(L_first_loop);
3927
3928  bind(L_first_loop_exit);
3929}
3930
3931// Multiply 64 bit by 64 bit and add 128 bit.
3932void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
3933                                            Register z, Register yz_idx,
3934                                            Register idx, Register carry,
3935                                            Register product_high, Register product,
3936                                            Register tmp, int offset) {
3937
3938  //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
3939  //  z[kdx] = (jlong)product;
3940
3941  sldi(tmp, idx, LogBytesPerInt);
3942  if (offset) {
3943    addi(tmp, tmp, offset);
3944  }
3945  ldx(yz_idx, y, tmp);
3946#ifdef VM_LITTLE_ENDIAN
3947  rldicl(yz_idx, yz_idx, 32, 0);
3948#endif
3949
3950  multiply64(product_high, product, x_xstart, yz_idx);
3951  ldx(yz_idx, z, tmp);
3952#ifdef VM_LITTLE_ENDIAN
3953  rldicl(yz_idx, yz_idx, 32, 0);
3954#endif
3955
3956  add2_with_carry(product_high, product, carry, yz_idx);
3957
3958  sldi(tmp, idx, LogBytesPerInt);
3959  if (offset) {
3960    addi(tmp, tmp, offset);
3961  }
3962#ifdef VM_LITTLE_ENDIAN
3963  rldicl(product, product, 32, 0);
3964#endif
3965  stdx(product, z, tmp);
3966}
3967
3968// Multiply 128 bit by 128 bit. Unrolled inner loop.
3969void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
3970                                             Register y, Register z,
3971                                             Register yz_idx, Register idx, Register carry,
3972                                             Register product_high, Register product,
3973                                             Register carry2, Register tmp) {
3974
3975  //  jlong carry, x[], y[], z[];
3976  //  int kdx = ystart+1;
3977  //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
3978  //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
3979  //    z[kdx+idx+1] = (jlong)product;
3980  //    jlong carry2 = (jlong)(product >>> 64);
3981  //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
3982  //    z[kdx+idx] = (jlong)product;
3983  //    carry = (jlong)(product >>> 64);
3984  //  }
3985  //  idx += 2;
3986  //  if (idx > 0) {
3987  //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
3988  //    z[kdx+idx] = (jlong)product;
3989  //    carry = (jlong)(product >>> 64);
3990  //  }
3991
3992  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
3993  const Register jdx = R0;
3994
3995  // Scale the index.
3996  srdi_(jdx, idx, 2);
3997  beq(CCR0, L_third_loop_exit);
3998  mtctr(jdx);
3999
4000  align(32, 16);
4001  bind(L_third_loop);
4002
4003  addi(idx, idx, -4);
4004
4005  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
4006  mr_if_needed(carry2, product_high);
4007
4008  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
4009  mr_if_needed(carry, product_high);
4010  bdnz(L_third_loop);
4011
4012  bind(L_third_loop_exit);  // Handle any left-over operand parts.
4013
4014  andi_(idx, idx, 0x3);
4015  beq(CCR0, L_post_third_loop_done);
4016
4017  Label L_check_1;
4018
4019  addic_(idx, idx, -2);
4020  blt(CCR0, L_check_1);
4021
4022  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
4023  mr_if_needed(carry, product_high);
4024
4025  bind(L_check_1);
4026
4027  addi(idx, idx, 0x2);
4028  andi_(idx, idx, 0x1) ;
4029  addic_(idx, idx, -1);
4030  blt(CCR0, L_post_third_loop_done);
4031
4032  sldi(tmp, idx, LogBytesPerInt);
4033  lwzx(yz_idx, y, tmp);
4034  multiply64(product_high, product, x_xstart, yz_idx);
4035  lwzx(yz_idx, z, tmp);
4036
4037  add2_with_carry(product_high, product, yz_idx, carry);
4038
4039  sldi(tmp, idx, LogBytesPerInt);
4040  stwx(product, z, tmp);
4041  srdi(product, product, 32);
4042
4043  sldi(product_high, product_high, 32);
4044  orr(product, product, product_high);
4045  mr_if_needed(carry, product);
4046
4047  bind(L_post_third_loop_done);
4048}   // multiply_128_x_128_loop
4049
4050void MacroAssembler::multiply_to_len(Register x, Register xlen,
4051                                     Register y, Register ylen,
4052                                     Register z, Register zlen,
4053                                     Register tmp1, Register tmp2,
4054                                     Register tmp3, Register tmp4,
4055                                     Register tmp5, Register tmp6,
4056                                     Register tmp7, Register tmp8,
4057                                     Register tmp9, Register tmp10,
4058                                     Register tmp11, Register tmp12,
4059                                     Register tmp13) {
4060
4061  ShortBranchVerifier sbv(this);
4062
4063  assert_different_registers(x, xlen, y, ylen, z, zlen,
4064                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
4065  assert_different_registers(x, xlen, y, ylen, z, zlen,
4066                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
4067  assert_different_registers(x, xlen, y, ylen, z, zlen,
4068                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
4069
4070  const Register idx = tmp1;
4071  const Register kdx = tmp2;
4072  const Register xstart = tmp3;
4073
4074  const Register y_idx = tmp4;
4075  const Register carry = tmp5;
4076  const Register product = tmp6;
4077  const Register product_high = tmp7;
4078  const Register x_xstart = tmp8;
4079  const Register tmp = tmp9;
4080
4081  // First Loop.
4082  //
4083  //  final static long LONG_MASK = 0xffffffffL;
4084  //  int xstart = xlen - 1;
4085  //  int ystart = ylen - 1;
4086  //  long carry = 0;
4087  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4088  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
4089  //    z[kdx] = (int)product;
4090  //    carry = product >>> 32;
4091  //  }
4092  //  z[xstart] = (int)carry;
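  //
  // Illustrative sketch only (not emitted code): the same first loop in plain C
  // over hypothetical 32-bit limb arrays x[], y[], z[]. The generated code
  // consumes the operands in larger chunks (see multiply_64_x_64_loop) but
  // computes the same result.
  //
  //   unsigned long long carry = 0;
  //   for (int idx = ystart, kdx = ystart + 1 + xstart; idx >= 0; idx--, kdx--) {
  //     unsigned long long product = (unsigned long long)y[idx] * x[xstart] + carry;
  //     z[kdx] = (unsigned int)product;   // low 32 bits into z[]
  //     carry  = product >> 32;           // high 32 bits carry into the next limb
  //   }
  //   z[xstart] = (unsigned int)carry;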
4093
4094  mr_if_needed(idx, ylen);        // idx = ylen
4095  mr_if_needed(kdx, zlen);        // kdx = zlen (= xlen + ylen)
4096  li(carry, 0);                   // carry = 0
4097
4098  Label L_done;
4099
4100  addic_(xstart, xlen, -1);
4101  blt(CCR0, L_done);
4102
4103  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
4104                        carry, product_high, product, idx, kdx, tmp);
4105
4106  Label L_second_loop;
4107
4108  cmpdi(CCR0, kdx, 0);
4109  beq(CCR0, L_second_loop);
4110
4111  Label L_carry;
4112
4113  addic_(kdx, kdx, -1);
4114  beq(CCR0, L_carry);
4115
4116  // Store lower 32 bits of carry.
4117  sldi(tmp, kdx, LogBytesPerInt);
4118  stwx(carry, z, tmp);
4119  srdi(carry, carry, 32);
4120  addi(kdx, kdx, -1);
4121
4122
4123  bind(L_carry);
4124
4125  // Store upper 32 bits of carry.
4126  sldi(tmp, kdx, LogBytesPerInt);
4127  stwx(carry, z, tmp);
4128
4129  // Second and third (nested) loops.
4130  //
4131  //  for (int i = xstart-1; i >= 0; i--) { // Second loop
4132  //    carry = 0;
4133  //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
4134  //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
4135  //                     (z[k] & LONG_MASK) + carry;
4136  //      z[k] = (int)product;
4137  //      carry = product >>> 32;
4138  //    }
4139  //    z[i] = (int)carry;
4140  //  }
4141  //
4142  //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = x_xstart (rdx in the x86 version of this code)
4143
4144  bind(L_second_loop);
4145
4146  li(carry, 0);                   // carry = 0;
4147
4148  addic_(xstart, xstart, -1);     // i = xstart-1;
4149  blt(CCR0, L_done);
4150
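  // z, x, xstart and ylen are clobbered below (z is advanced to the current
  // store position, multiply_128_x_128_loop reuses x as carry2 and ylen as its
  // index), so preserve them in tmp10..tmp13 around the inner loop.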
4151  Register zsave = tmp10;
4152
4153  mr(zsave, z);
4154
4155
4156  Label L_last_x;
4157
4158  sldi(tmp, xstart, LogBytesPerInt);
4159  add(z, z, tmp);                 // z = z + k - j
4160  addi(z, z, 4);
4161  addic_(xstart, xstart, -1);     // i = xstart-1;
4162  blt(CCR0, L_last_x);
4163
4164  sldi(tmp, xstart, LogBytesPerInt);
4165  ldx(x_xstart, x, tmp);
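  // On little-endian the two 32-bit limbs within the loaded 64-bit word are
  // swapped relative to the big-endian int[] order the algorithm assumes;
  // rotating by 32 restores the expected limb order.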
4166#ifdef VM_LITTLE_ENDIAN
4167  rldicl(x_xstart, x_xstart, 32, 0);
4168#endif
4169
4170
4171  Label L_third_loop_prologue;
4172
4173  bind(L_third_loop_prologue);
4174
4175  Register xsave = tmp11;
4176  Register xlensave = tmp12;
4177  Register ylensave = tmp13;
4178
4179  mr(xsave, x);
4180  mr(xlensave, xstart);
4181  mr(ylensave, ylen);
4182
4183
4184  multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
4185                          carry, product_high, product, x, tmp);
4186
4187  mr(z, zsave);
4188  mr(x, xsave);
4189  mr(xlen, xlensave);   // This is the decrement of the loop counter!
4190  mr(ylen, ylensave);
4191
4192  addi(tmp3, xlen, 1);
4193  sldi(tmp, tmp3, LogBytesPerInt);
4194  stwx(carry, z, tmp);
4195  addic_(tmp3, tmp3, -1);
4196  blt(CCR0, L_done);
4197
4198  srdi(carry, carry, 32);
4199  sldi(tmp, tmp3, LogBytesPerInt);
4200  stwx(carry, z, tmp);
4201  b(L_second_loop);
4202
4203  // The following infrequently executed code has been moved out of the loops.
4204  bind(L_last_x);
4205
4206  lwz(x_xstart, 0, x);
4207  b(L_third_loop_prologue);
4208
4209  bind(L_done);
4210}   // multiply_to_len
4211
4212void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {
4213#ifdef ASSERT
4214  Label ok;
4215  if (check_equal) {
4216    beq(CCR0, ok);
4217  } else {
4218    bne(CCR0, ok);
4219  }
4220  stop(msg, id);
4221  bind(ok);
4222#endif
4223}
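// Hypothetical usage sketch: asm_assert() only tests CCR0, so a compare must
// set it first, e.g.
//   cmpdi(CCR0, reg, 0);
//   asm_assert(/*check_equal=*/true, "reg expected to be zero", 0x123);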
4224
4225void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
4226                                          Register mem_base, const char* msg, int id) {
4227#ifdef ASSERT
4228  switch (size) {
4229    case 4:
4230      lwz(R0, mem_offset, mem_base);
4231      cmpwi(CCR0, R0, 0);
4232      break;
4233    case 8:
4234      ld(R0, mem_offset, mem_base);
4235      cmpdi(CCR0, R0, 0);
4236      break;
4237    default:
4238      ShouldNotReachHere();
4239  }
4240  asm_assert(check_equal, msg, id);
4241#endif // ASSERT
4242}
4243
4244void MacroAssembler::verify_thread() {
4245  if (VerifyThread) {
4246    unimplemented("'VerifyThread' currently not implemented on PPC");
4247  }
4248}
4249
4250// READ: oop. KILL: R0. May also clobber volatile floating-point registers.
4251void MacroAssembler::verify_oop(Register oop, const char* msg) {
4252  if (!VerifyOops) {
4253    return;
4254  }
4255
4256  address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4257  const Register tmp = R11; // Will be preserved.
4258  const int nbytes_save = 11*8; // Volatile gprs except R0.
4259  save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4260
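  // If the oop happens to live in tmp (R11), move it into the argument register
  // now, before tmp is used as scratch for the frame setup below.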
4261  if (oop == tmp) mr(R4_ARG2, oop);
4262  save_LR_CR(tmp); // save in old frame
4263  push_frame_reg_args(nbytes_save, tmp);
4264  // load FunctionDescriptor** / entry_address *
4265  load_const_optimized(tmp, fd, R0);
4266  // load FunctionDescriptor* / entry_address
4267  ld(tmp, 0, tmp);
4268  if (oop != tmp) mr_if_needed(R4_ARG2, oop);
4269  load_const_optimized(R3_ARG1, (address)msg, R0);
4270  // Call destination for its side effect.
4271  call_c(tmp);
4272
4273  pop_frame();
4274  restore_LR_CR(tmp);
4275  restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4276}
4277
4278const char* stop_types[] = {
4279  "stop",
4280  "untested",
4281  "unimplemented",
4282  "shouldnotreachhere"
4283};
4284
4285static void stop_on_request(int tp, const char* msg) {
4286  tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg);
4287  guarantee(false, "PPC assembly code requires stop: %s", msg);
4288}
4289
4290// Call a C-function that prints output.
4291void MacroAssembler::stop(int type, const char* msg, int id) {
4292#ifndef PRODUCT
4293  block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg));
4294#else
4295  block_comment("stop {");
4296#endif
4297
4298  // setup arguments
4299  load_const_optimized(R3_ARG1, type);
4300  load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0);
4301  call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2);
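  // The trap is followed by the stop id embedded as inline data, so the
  // particular stop site can be identified when the trap is taken.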
4302  illtrap();
4303  emit_int32(id);
4304  block_comment("} stop;");
4305}
4306
4307#ifndef PRODUCT
4308// Write the pattern 0x0101010101010101 to the memory region [low - before*BytesPerWord, high + after*BytesPerWord].
4309// val and addr are temp registers.
4310// If low == addr, addr is killed.
4311// high is preserved.
4312void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
4313  if (!ZapMemory) return;
4314
4315  assert_different_registers(low, val);
4316
4317  BLOCK_COMMENT("zap memory region {");
4318  load_const_optimized(val, 0x0101010101010101);
4319  int size = before + after;
4320  if (low == high && size < 5 && size > 0) {
4321    int offset = -before*BytesPerWord;
4322    for (int i = 0; i < size; ++i) {
4323      std(val, offset, low);
4324      offset += (1*BytesPerWord);
4325    }
4326  } else {
4327    addi(addr, low, -before*BytesPerWord);
4328    assert_different_registers(high, val);
4329    if (after) addi(high, high, after * BytesPerWord);
4330    Label loop;
4331    bind(loop);
4332    std(val, 0, addr);
4333    addi(addr, addr, 8);
4334    cmpd(CCR6, addr, high);
4335    ble(CCR6, loop);
4336    if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
4337  }
4338  BLOCK_COMMENT("} zap memory region");
4339}
4340
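// Hypothetical usage sketch (debug builds only): zap the region from Rlow to
// Rhigh plus one guard word on each side, using R11/R12 as val/addr scratch:
//
//   zap_from_to(Rlow, 1, Rhigh, 1, R11, R12);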
4341#endif // !PRODUCT
4342
4343SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
4344  int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
4345  assert(sizeof(bool) == 1, "PowerPC ABI");
4346  masm->lbz(temp, simm16_offset, temp);
4347  masm->cmpwi(CCR0, temp, 0);
4348  masm->beq(CCR0, _label);
4349}
4350
4351SkipIfEqualZero::~SkipIfEqualZero() {
4352  _masm->bind(_label);
4353}
4354
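// Hypothetical usage sketch: the guarded code is emitted unconditionally but is
// skipped at run time whenever the native bool flag reads zero; the destructor
// binds the skip label. SomeBoolFlag is a made-up flag name.
//
//   {
//     SkipIfEqualZero skip(this, R11_scratch1, &SomeBoolFlag);
//     // ... code executed only if SomeBoolFlag is true ...
//   }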