1/*
2 * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26#include <sys/types.h>
27
28#include "precompiled.hpp"
29#include "asm/assembler.hpp"
30#include "asm/assembler.inline.hpp"
31#include "interpreter/interpreter.hpp"
32
33#include "compiler/disassembler.hpp"
34#include "memory/resourceArea.hpp"
35#include "nativeInst_aarch64.hpp"
36#include "oops/klass.inline.hpp"
37#include "oops/oop.inline.hpp"
38#include "opto/compile.hpp"
39#include "opto/intrinsicnode.hpp"
40#include "opto/node.hpp"
41#include "runtime/biasedLocking.hpp"
42#include "runtime/icache.hpp"
43#include "runtime/interfaceSupport.hpp"
44#include "runtime/sharedRuntime.hpp"
45#include "runtime/thread.hpp"
46
47#if INCLUDE_ALL_GCS
48#include "gc/g1/g1CollectedHeap.inline.hpp"
49#include "gc/g1/g1SATBCardTableModRefBS.hpp"
50#include "gc/g1/heapRegion.hpp"
51#endif
52
53#ifdef PRODUCT
54#define BLOCK_COMMENT(str) /* nothing */
55#define STOP(error) stop(error)
56#else
57#define BLOCK_COMMENT(str) block_comment(str)
58#define STOP(error) block_comment(error); stop(error)
59#endif
60
61#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
62
63// Patch any kind of instruction; there may be several instructions.
64// Return the total length (in bytes) of the instructions.
65int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
66  int instructions = 1;
67  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
68  long offset = (target - branch) >> 2;
69  unsigned insn = *(unsigned*)branch;
70  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
71    // Load register (literal)
72    Instruction_aarch64::spatch(branch, 23, 5, offset);
73  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
74    // Unconditional branch (immediate)
75    Instruction_aarch64::spatch(branch, 25, 0, offset);
76  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
77    // Conditional branch (immediate)
78    Instruction_aarch64::spatch(branch, 23, 5, offset);
79  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
80    // Compare & branch (immediate)
81    Instruction_aarch64::spatch(branch, 23, 5, offset);
82  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
83    // Test & branch (immediate)
84    Instruction_aarch64::spatch(branch, 18, 5, offset);
85  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
86    // PC-rel. addressing
87    offset = target-branch;
88    int shift = Instruction_aarch64::extract(insn, 31, 31);
89    if (shift) {
90      u_int64_t dest = (u_int64_t)target;
91      uint64_t pc_page = (uint64_t)branch >> 12;
92      uint64_t adr_page = (uint64_t)target >> 12;
93      unsigned offset_lo = dest & 0xfff;
94      offset = adr_page - pc_page;
95
96      // We handle 4 types of PC relative addressing
97      //   1 - adrp    Rx, target_page
98      //       ldr/str Ry, [Rx, #offset_in_page]
99      //   2 - adrp    Rx, target_page
100      //       add     Ry, Rx, #offset_in_page
101      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
102      //       movk    Rx, #imm16<<32
103      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
104      // In the first 3 cases we must check that Rx is the same in the adrp and the
105      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
106      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
107      // to be followed by a random unrelated ldr/str, add or movk instruction.
108      //
109      unsigned insn2 = ((unsigned*)branch)[1];
110      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
111                Instruction_aarch64::extract(insn, 4, 0) ==
112                        Instruction_aarch64::extract(insn2, 9, 5)) {
113        // Load/store register (unsigned immediate)
114        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
115        Instruction_aarch64::patch(branch + sizeof (unsigned),
116                                    21, 10, offset_lo >> size);
117        guarantee(((dest >> size) << size) == dest, "misaligned target");
118        instructions = 2;
119      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
120                Instruction_aarch64::extract(insn, 4, 0) ==
121                        Instruction_aarch64::extract(insn2, 4, 0)) {
122        // add (immediate)
123        Instruction_aarch64::patch(branch + sizeof (unsigned),
124                                   21, 10, offset_lo);
125        instructions = 2;
126      } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
127                   Instruction_aarch64::extract(insn, 4, 0) ==
128                     Instruction_aarch64::extract(insn2, 4, 0)) {
129        // movk #imm16<<32
130        Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
131        long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
132        long pc_page = (long)branch >> 12;
133        long adr_page = (long)dest >> 12;
134        offset = adr_page - pc_page;
135        instructions = 2;
136      }
137    }
138    int offset_lo = offset & 3;
139    offset >>= 2;
140    Instruction_aarch64::spatch(branch, 23, 5, offset);
141    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
142  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
143    u_int64_t dest = (u_int64_t)target;
144    // Move wide constant
145    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
146    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
147    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
148    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
149    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
150    assert(target_addr_for_insn(branch) == target, "should be");
151    instructions = 3;
152  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
153             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
154    // nothing to do
155    assert(target == 0, "did not expect to relocate target for polling page load");
156  } else {
157    ShouldNotReachHere();
158  }
159  return instructions * NativeInstruction::instruction_size;
160}
161
162int MacroAssembler::patch_oop(address insn_addr, address o) {
163  int instructions;
164  unsigned insn = *(unsigned*)insn_addr;
165  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");
166
167  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
168  // narrow OOPs by setting the upper 16 bits in the first
169  // instruction.
170  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
171    // Move narrow OOP
172    narrowOop n = oopDesc::encode_heap_oop((oop)o);
173    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
174    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
175    instructions = 2;
176  } else {
177    // Move wide OOP
178    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
179    uintptr_t dest = (uintptr_t)o;
180    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
181    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
182    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
183    instructions = 3;
184  }
185  return instructions * NativeInstruction::instruction_size;
186}
187
188int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
189  // Metatdata pointers are either narrow (32 bits) or wide (48 bits).
190  // We encode narrow ones by setting the upper 16 bits in the first
191  // instruction.
192  NativeInstruction *insn = nativeInstruction_at(insn_addr);
193  assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
194         nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");
195
196  Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
197  Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
198  return 2 * NativeInstruction::instruction_size;
199}
200
201address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
202  long offset = 0;
203  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
204    // Load register (literal)
205    offset = Instruction_aarch64::sextract(insn, 23, 5);
206    return address(((uint64_t)insn_addr + (offset << 2)));
207  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
208    // Unconditional branch (immediate)
209    offset = Instruction_aarch64::sextract(insn, 25, 0);
210  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
211    // Conditional branch (immediate)
212    offset = Instruction_aarch64::sextract(insn, 23, 5);
213  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
214    // Compare & branch (immediate)
215    offset = Instruction_aarch64::sextract(insn, 23, 5);
216   } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
217    // Test & branch (immediate)
218    offset = Instruction_aarch64::sextract(insn, 18, 5);
219  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
220    // PC-rel. addressing
221    offset = Instruction_aarch64::extract(insn, 30, 29);
222    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
223    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
224    if (shift) {
225      offset <<= shift;
226      uint64_t target_page = ((uint64_t)insn_addr) + offset;
227      target_page &= ((uint64_t)-1) << shift;
228      // Return the target address for the following sequences
229      //   1 - adrp    Rx, target_page
230      //       ldr/str Ry, [Rx, #offset_in_page]
231      //   2 - adrp    Rx, target_page
232      //       add     Ry, Rx, #offset_in_page
233      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
234      //       movk    Rx, #imm12<<32
235      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
236      //
237      // In the first two cases  we check that the register is the same and
238      // return the target_page + the offset within the page.
239      // Otherwise we assume it is a page aligned relocation and return
240      // the target page only.
241      //
242      unsigned insn2 = ((unsigned*)insn_addr)[1];
243      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
244                Instruction_aarch64::extract(insn, 4, 0) ==
245                        Instruction_aarch64::extract(insn2, 9, 5)) {
246        // Load/store register (unsigned immediate)
247        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
248        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
249        return address(target_page + (byte_offset << size));
250      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
251                Instruction_aarch64::extract(insn, 4, 0) ==
252                        Instruction_aarch64::extract(insn2, 4, 0)) {
253        // add (immediate)
254        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
255        return address(target_page + byte_offset);
256      } else {
257        if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110  &&
258               Instruction_aarch64::extract(insn, 4, 0) ==
259                 Instruction_aarch64::extract(insn2, 4, 0)) {
260          target_page = (target_page & 0xffffffff) |
261                         ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
262        }
263        return (address)target_page;
264      }
265    } else {
266      ShouldNotReachHere();
267    }
268  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
269    u_int32_t *insns = (u_int32_t *)insn_addr;
270    // Move wide constant: movz, movk, movk.  See movptr().
271    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
272    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
273    return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
274                   + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
275                   + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
276  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
277             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
278    return 0;
279  } else {
280    ShouldNotReachHere();
281  }
282  return address(((uint64_t)insn_addr + (offset << 2)));
283}
284
285void MacroAssembler::serialize_memory(Register thread, Register tmp) {
286  dsb(Assembler::SY);
287}
288
289
290void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
291  // we must set sp to zero to clear frame
292  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));
293
294  // must clear fp, so that compiled frames are not confused; it is
295  // possible that we need it only for debugging
296  if (clear_fp) {
297    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
298  }
299
300  // Always clear the pc because it could have been set by make_walkable()
301  str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
302}
303
304// Calls to C land
305//
306// When entering C land, the rfp, & resp of the last Java frame have to be recorded
307// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
308// has to be reset to 0. This is required to allow proper stack traversal.
309void MacroAssembler::set_last_Java_frame(Register last_java_sp,
310                                         Register last_java_fp,
311                                         Register last_java_pc,
312                                         Register scratch) {
313
314  if (last_java_pc->is_valid()) {
315      str(last_java_pc, Address(rthread,
316                                JavaThread::frame_anchor_offset()
317                                + JavaFrameAnchor::last_Java_pc_offset()));
318    }
319
320  // determine last_java_sp register
321  if (last_java_sp == sp) {
322    mov(scratch, sp);
323    last_java_sp = scratch;
324  } else if (!last_java_sp->is_valid()) {
325    last_java_sp = esp;
326  }
327
328  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));
329
330  // last_java_fp is optional
331  if (last_java_fp->is_valid()) {
332    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
333  }
334}
335
336void MacroAssembler::set_last_Java_frame(Register last_java_sp,
337                                         Register last_java_fp,
338                                         address  last_java_pc,
339                                         Register scratch) {
340  if (last_java_pc != NULL) {
341    adr(scratch, last_java_pc);
342  } else {
343    // FIXME: This is almost never correct.  We should delete all
344    // cases of set_last_Java_frame with last_java_pc=NULL and use the
345    // correct return address instead.
346    adr(scratch, pc());
347  }
348
349  str(scratch, Address(rthread,
350                       JavaThread::frame_anchor_offset()
351                       + JavaFrameAnchor::last_Java_pc_offset()));
352
353  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
354}
355
356void MacroAssembler::set_last_Java_frame(Register last_java_sp,
357                                         Register last_java_fp,
358                                         Label &L,
359                                         Register scratch) {
360  if (L.is_bound()) {
361    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
362  } else {
363    InstructionMark im(this);
364    L.add_patch_at(code(), locator());
365    set_last_Java_frame(last_java_sp, last_java_fp, (address)NULL, scratch);
366  }
367}
368
369void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
370  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
371  assert(CodeCache::find_blob(entry.target()) != NULL,
372         "destination of far call not found in code cache");
373  if (far_branches()) {
374    unsigned long offset;
375    // We can use ADRP here because we know that the total size of
376    // the code cache cannot exceed 2Gb.
377    adrp(tmp, entry, offset);
378    add(tmp, tmp, offset);
379    if (cbuf) cbuf->set_insts_mark();
380    blr(tmp);
381  } else {
382    if (cbuf) cbuf->set_insts_mark();
383    bl(entry);
384  }
385}
386
387void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
388  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
389  assert(CodeCache::find_blob(entry.target()) != NULL,
390         "destination of far call not found in code cache");
391  if (far_branches()) {
392    unsigned long offset;
393    // We can use ADRP here because we know that the total size of
394    // the code cache cannot exceed 2Gb.
395    adrp(tmp, entry, offset);
396    add(tmp, tmp, offset);
397    if (cbuf) cbuf->set_insts_mark();
398    br(tmp);
399  } else {
400    if (cbuf) cbuf->set_insts_mark();
401    b(entry);
402  }
403}
404
405void MacroAssembler::reserved_stack_check() {
406    // testing if reserved zone needs to be enabled
407    Label no_reserved_zone_enabling;
408
409    ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
410    cmp(sp, rscratch1);
411    br(Assembler::LO, no_reserved_zone_enabling);
412
413    enter();   // LR and FP are live.
414    lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
415    mov(c_rarg0, rthread);
416    blr(rscratch1);
417    leave();
418
419    // We have already removed our own frame.
420    // throw_delayed_StackOverflowError will think that it's been
421    // called by our caller.
422    lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
423    br(rscratch1);
424    should_not_reach_here();
425
426    bind(no_reserved_zone_enabling);
427}
428
429int MacroAssembler::biased_locking_enter(Register lock_reg,
430                                         Register obj_reg,
431                                         Register swap_reg,
432                                         Register tmp_reg,
433                                         bool swap_reg_contains_mark,
434                                         Label& done,
435                                         Label* slow_case,
436                                         BiasedLockingCounters* counters) {
437  assert(UseBiasedLocking, "why call this otherwise?");
438  assert_different_registers(lock_reg, obj_reg, swap_reg);
439
440  if (PrintBiasedLockingStatistics && counters == NULL)
441    counters = BiasedLocking::counters();
442
443  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
444  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
445  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
446  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
447  Address saved_mark_addr(lock_reg, 0);
448
449  // Biased locking
450  // See whether the lock is currently biased toward our thread and
451  // whether the epoch is still valid
452  // Note that the runtime guarantees sufficient alignment of JavaThread
453  // pointers to allow age to be placed into low bits
454  // First check to see whether biasing is even enabled for this object
455  Label cas_label;
456  int null_check_offset = -1;
457  if (!swap_reg_contains_mark) {
458    null_check_offset = offset();
459    ldr(swap_reg, mark_addr);
460  }
461  andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
462  cmp(tmp_reg, markOopDesc::biased_lock_pattern);
463  br(Assembler::NE, cas_label);
464  // The bias pattern is present in the object's header. Need to check
465  // whether the bias owner and the epoch are both still current.
466  load_prototype_header(tmp_reg, obj_reg);
467  orr(tmp_reg, tmp_reg, rthread);
468  eor(tmp_reg, swap_reg, tmp_reg);
469  andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
470  if (counters != NULL) {
471    Label around;
472    cbnz(tmp_reg, around);
473    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
474    b(done);
475    bind(around);
476  } else {
477    cbz(tmp_reg, done);
478  }
479
480  Label try_revoke_bias;
481  Label try_rebias;
482
483  // At this point we know that the header has the bias pattern and
484  // that we are not the bias owner in the current epoch. We need to
485  // figure out more details about the state of the header in order to
486  // know what operations can be legally performed on the object's
487  // header.
488
489  // If the low three bits in the xor result aren't clear, that means
490  // the prototype header is no longer biased and we have to revoke
491  // the bias on this object.
492  andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
493  cbnz(rscratch1, try_revoke_bias);
494
495  // Biasing is still enabled for this data type. See whether the
496  // epoch of the current bias is still valid, meaning that the epoch
497  // bits of the mark word are equal to the epoch bits of the
498  // prototype header. (Note that the prototype header's epoch bits
499  // only change at a safepoint.) If not, attempt to rebias the object
500  // toward the current thread. Note that we must be absolutely sure
501  // that the current epoch is invalid in order to do this because
502  // otherwise the manipulations it performs on the mark word are
503  // illegal.
504  andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
505  cbnz(rscratch1, try_rebias);
506
507  // The epoch of the current bias is still valid but we know nothing
508  // about the owner; it might be set or it might be clear. Try to
509  // acquire the bias of the object using an atomic operation. If this
510  // fails we will go in to the runtime to revoke the object's bias.
511  // Note that we first construct the presumed unbiased header so we
512  // don't accidentally blow away another thread's valid bias.
513  {
514    Label here;
515    mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
516    andr(swap_reg, swap_reg, rscratch1);
517    orr(tmp_reg, swap_reg, rthread);
518    cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
519    // If the biasing toward our thread failed, this means that
520    // another thread succeeded in biasing it toward itself and we
521    // need to revoke that bias. The revocation will occur in the
522    // interpreter runtime in the slow case.
523    bind(here);
524    if (counters != NULL) {
525      atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
526                  tmp_reg, rscratch1, rscratch2);
527    }
528  }
529  b(done);
530
531  bind(try_rebias);
532  // At this point we know the epoch has expired, meaning that the
533  // current "bias owner", if any, is actually invalid. Under these
534  // circumstances _only_, we are allowed to use the current header's
535  // value as the comparison value when doing the cas to acquire the
536  // bias in the current epoch. In other words, we allow transfer of
537  // the bias from one thread to another directly in this situation.
538  //
539  // FIXME: due to a lack of registers we currently blow away the age
540  // bits in this situation. Should attempt to preserve them.
541  {
542    Label here;
543    load_prototype_header(tmp_reg, obj_reg);
544    orr(tmp_reg, rthread, tmp_reg);
545    cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
546    // If the biasing toward our thread failed, then another thread
547    // succeeded in biasing it toward itself and we need to revoke that
548    // bias. The revocation will occur in the runtime in the slow case.
549    bind(here);
550    if (counters != NULL) {
551      atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
552                  tmp_reg, rscratch1, rscratch2);
553    }
554  }
555  b(done);
556
557  bind(try_revoke_bias);
558  // The prototype mark in the klass doesn't have the bias bit set any
559  // more, indicating that objects of this data type are not supposed
560  // to be biased any more. We are going to try to reset the mark of
561  // this object to the prototype value and fall through to the
562  // CAS-based locking scheme. Note that if our CAS fails, it means
563  // that another thread raced us for the privilege of revoking the
564  // bias of this particular object, so it's okay to continue in the
565  // normal locking code.
566  //
567  // FIXME: due to a lack of registers we currently blow away the age
568  // bits in this situation. Should attempt to preserve them.
569  {
570    Label here, nope;
571    load_prototype_header(tmp_reg, obj_reg);
572    cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
573    bind(here);
574
575    // Fall through to the normal CAS-based lock, because no matter what
576    // the result of the above CAS, some thread must have succeeded in
577    // removing the bias bit from the object's header.
578    if (counters != NULL) {
579      atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
580                  rscratch1, rscratch2);
581    }
582    bind(nope);
583  }
584
585  bind(cas_label);
586
587  return null_check_offset;
588}
589
590void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
591  assert(UseBiasedLocking, "why call this otherwise?");
592
593  // Check for biased locking unlock case, which is a no-op
594  // Note: we do not have to check the thread ID for two reasons.
595  // First, the interpreter checks for IllegalMonitorStateException at
596  // a higher level. Second, if the bias was revoked while we held the
597  // lock, the object could not be rebiased toward another thread, so
598  // the bias bit would be clear.
599  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
600  andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
601  cmp(temp_reg, markOopDesc::biased_lock_pattern);
602  br(Assembler::EQ, done);
603}
604
605static void pass_arg0(MacroAssembler* masm, Register arg) {
606  if (c_rarg0 != arg ) {
607    masm->mov(c_rarg0, arg);
608  }
609}
610
611static void pass_arg1(MacroAssembler* masm, Register arg) {
612  if (c_rarg1 != arg ) {
613    masm->mov(c_rarg1, arg);
614  }
615}
616
617static void pass_arg2(MacroAssembler* masm, Register arg) {
618  if (c_rarg2 != arg ) {
619    masm->mov(c_rarg2, arg);
620  }
621}
622
623static void pass_arg3(MacroAssembler* masm, Register arg) {
624  if (c_rarg3 != arg ) {
625    masm->mov(c_rarg3, arg);
626  }
627}
628
629void MacroAssembler::call_VM_base(Register oop_result,
630                                  Register java_thread,
631                                  Register last_java_sp,
632                                  address  entry_point,
633                                  int      number_of_arguments,
634                                  bool     check_exceptions) {
635   // determine java_thread register
636  if (!java_thread->is_valid()) {
637    java_thread = rthread;
638  }
639
640  // determine last_java_sp register
641  if (!last_java_sp->is_valid()) {
642    last_java_sp = esp;
643  }
644
645  // debugging support
646  assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
647  assert(java_thread == rthread, "unexpected register");
648#ifdef ASSERT
649  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
650  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
651#endif // ASSERT
652
653  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
654  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
655
656  // push java thread (becomes first argument of C function)
657
658  mov(c_rarg0, java_thread);
659
660  // set last Java frame before call
661  assert(last_java_sp != rfp, "can't use rfp");
662
663  Label l;
664  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);
665
666  // do the call, remove parameters
667  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);
668
669  // reset last Java frame
670  // Only interpreter should have to clear fp
671  reset_last_Java_frame(true);
672
673   // C++ interp handles this in the interpreter
674  check_and_handle_popframe(java_thread);
675  check_and_handle_earlyret(java_thread);
676
677  if (check_exceptions) {
678    // check for pending exceptions (java_thread is set upon return)
679    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
680    Label ok;
681    cbz(rscratch1, ok);
682    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
683    br(rscratch1);
684    bind(ok);
685  }
686
687  // get oop result if there is one and reset the value in the thread
688  if (oop_result->is_valid()) {
689    get_vm_result(oop_result, java_thread);
690  }
691}
692
693void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
694  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
695}
696
697// Maybe emit a call via a trampoline.  If the code cache is small
698// trampolines won't be emitted.
699
700address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
701  assert(entry.rspec().type() == relocInfo::runtime_call_type
702         || entry.rspec().type() == relocInfo::opt_virtual_call_type
703         || entry.rspec().type() == relocInfo::static_call_type
704         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");
705
706  unsigned int start_offset = offset();
707  if (far_branches() && !Compile::current()->in_scratch_emit_size()) {
708    address stub = emit_trampoline_stub(start_offset, entry.target());
709    if (stub == NULL) {
710      return NULL; // CodeCache is full
711    }
712  }
713
714  if (cbuf) cbuf->set_insts_mark();
715  relocate(entry.rspec());
716  if (!far_branches()) {
717    bl(entry.target());
718  } else {
719    bl(pc());
720  }
721  // just need to return a non-null address
722  return pc();
723}
724
725
726// Emit a trampoline stub for a call to a target which is too far away.
727//
728// code sequences:
729//
730// call-site:
731//   branch-and-link to <destination> or <trampoline stub>
732//
733// Related trampoline stub for this call site in the stub section:
734//   load the call target from the constant pool
735//   branch (LR still points to the call site above)
736
737address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
738                                             address dest) {
739  address stub = start_a_stub(Compile::MAX_stubs_size/2);
740  if (stub == NULL) {
741    return NULL;  // CodeBuffer::expand failed
742  }
743
744  // Create a trampoline stub relocation which relates this trampoline stub
745  // with the call instruction at insts_call_instruction_offset in the
746  // instructions code-section.
747  align(wordSize);
748  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
749                                            + insts_call_instruction_offset));
750  const int stub_start_offset = offset();
751
752  // Now, create the trampoline stub's code:
753  // - load the call
754  // - call
755  Label target;
756  ldr(rscratch1, target);
757  br(rscratch1);
758  bind(target);
759  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
760         "should be");
761  emit_int64((int64_t)dest);
762
763  const address stub_start_addr = addr_at(stub_start_offset);
764
765  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
766
767  end_a_stub();
768  return stub;
769}
770
771address MacroAssembler::ic_call(address entry, jint method_index) {
772  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
773  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
774  // unsigned long offset;
775  // ldr_constant(rscratch2, const_ptr);
776  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
777  return trampoline_call(Address(entry, rh));
778}
779
780// Implementation of call_VM versions
781
782void MacroAssembler::call_VM(Register oop_result,
783                             address entry_point,
784                             bool check_exceptions) {
785  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
786}
787
788void MacroAssembler::call_VM(Register oop_result,
789                             address entry_point,
790                             Register arg_1,
791                             bool check_exceptions) {
792  pass_arg1(this, arg_1);
793  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
794}
795
796void MacroAssembler::call_VM(Register oop_result,
797                             address entry_point,
798                             Register arg_1,
799                             Register arg_2,
800                             bool check_exceptions) {
801  assert(arg_1 != c_rarg2, "smashed arg");
802  pass_arg2(this, arg_2);
803  pass_arg1(this, arg_1);
804  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
805}
806
807void MacroAssembler::call_VM(Register oop_result,
808                             address entry_point,
809                             Register arg_1,
810                             Register arg_2,
811                             Register arg_3,
812                             bool check_exceptions) {
813  assert(arg_1 != c_rarg3, "smashed arg");
814  assert(arg_2 != c_rarg3, "smashed arg");
815  pass_arg3(this, arg_3);
816
817  assert(arg_1 != c_rarg2, "smashed arg");
818  pass_arg2(this, arg_2);
819
820  pass_arg1(this, arg_1);
821  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
822}
823
824void MacroAssembler::call_VM(Register oop_result,
825                             Register last_java_sp,
826                             address entry_point,
827                             int number_of_arguments,
828                             bool check_exceptions) {
829  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
830}
831
832void MacroAssembler::call_VM(Register oop_result,
833                             Register last_java_sp,
834                             address entry_point,
835                             Register arg_1,
836                             bool check_exceptions) {
837  pass_arg1(this, arg_1);
838  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
839}
840
841void MacroAssembler::call_VM(Register oop_result,
842                             Register last_java_sp,
843                             address entry_point,
844                             Register arg_1,
845                             Register arg_2,
846                             bool check_exceptions) {
847
848  assert(arg_1 != c_rarg2, "smashed arg");
849  pass_arg2(this, arg_2);
850  pass_arg1(this, arg_1);
851  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
852}
853
854void MacroAssembler::call_VM(Register oop_result,
855                             Register last_java_sp,
856                             address entry_point,
857                             Register arg_1,
858                             Register arg_2,
859                             Register arg_3,
860                             bool check_exceptions) {
861  assert(arg_1 != c_rarg3, "smashed arg");
862  assert(arg_2 != c_rarg3, "smashed arg");
863  pass_arg3(this, arg_3);
864  assert(arg_1 != c_rarg2, "smashed arg");
865  pass_arg2(this, arg_2);
866  pass_arg1(this, arg_1);
867  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
868}
869
870
871void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
872  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
873  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
874  verify_oop(oop_result, "broken oop in call_VM_base");
875}
876
877void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
878  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
879  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
880}
881
882void MacroAssembler::align(int modulus) {
883  while (offset() % modulus != 0) nop();
884}
885
886// these are no-ops overridden by InterpreterMacroAssembler
887
888void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }
889
890void MacroAssembler::check_and_handle_popframe(Register java_thread) { }
891
892
893RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
894                                                      Register tmp,
895                                                      int offset) {
896  intptr_t value = *delayed_value_addr;
897  if (value != 0)
898    return RegisterOrConstant(value + offset);
899
900  // load indirectly to solve generation ordering problem
901  ldr(tmp, ExternalAddress((address) delayed_value_addr));
902
903  if (offset != 0)
904    add(tmp, tmp, offset);
905
906  return RegisterOrConstant(tmp);
907}
908
909
910void MacroAssembler:: notify(int type) {
911  if (type == bytecode_start) {
912    // set_last_Java_frame(esp, rfp, (address)NULL);
913    Assembler:: notify(type);
914    // reset_last_Java_frame(true);
915  }
916  else
917    Assembler:: notify(type);
918}
919
920// Look up the method for a megamorphic invokeinterface call.
921// The target method is determined by <intf_klass, itable_index>.
922// The receiver klass is in recv_klass.
923// On success, the result will be in method_result, and execution falls through.
924// On failure, execution transfers to the given label.
925void MacroAssembler::lookup_interface_method(Register recv_klass,
926                                             Register intf_klass,
927                                             RegisterOrConstant itable_index,
928                                             Register method_result,
929                                             Register scan_temp,
930                                             Label& L_no_such_interface) {
931  assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
932  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
933         "caller must use same register for non-constant itable index as for method");
934
935  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
936  int vtable_base = in_bytes(Klass::vtable_start_offset());
937  int itentry_off = itableMethodEntry::method_offset_in_bytes();
938  int scan_step   = itableOffsetEntry::size() * wordSize;
939  int vte_size    = vtableEntry::size_in_bytes();
940  assert(vte_size == wordSize, "else adjust times_vte_scale");
941
942  ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
943
944  // %%% Could store the aligned, prescaled offset in the klassoop.
945  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
946  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
947  add(scan_temp, scan_temp, vtable_base);
948
949  // Adjust recv_klass by scaled itable_index, so we can free itable_index.
950  assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
951  // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
952  lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
953  if (itentry_off)
954    add(recv_klass, recv_klass, itentry_off);
955
956  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
957  //   if (scan->interface() == intf) {
958  //     result = (klass + scan->offset() + itable_index);
959  //   }
960  // }
961  Label search, found_method;
962
963  for (int peel = 1; peel >= 0; peel--) {
964    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
965    cmp(intf_klass, method_result);
966
967    if (peel) {
968      br(Assembler::EQ, found_method);
969    } else {
970      br(Assembler::NE, search);
971      // (invert the test to fall through to found_method...)
972    }
973
974    if (!peel)  break;
975
976    bind(search);
977
978    // Check that the previous entry is non-null.  A null entry means that
979    // the receiver class doesn't implement the interface, and wasn't the
980    // same as when the caller was compiled.
981    cbz(method_result, L_no_such_interface);
982    add(scan_temp, scan_temp, scan_step);
983  }
984
985  bind(found_method);
986
987  // Got a hit.
988  ldr(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
989  ldr(method_result, Address(recv_klass, scan_temp));
990}
991
992// virtual method calling
993void MacroAssembler::lookup_virtual_method(Register recv_klass,
994                                           RegisterOrConstant vtable_index,
995                                           Register method_result) {
996  const int base = in_bytes(Klass::vtable_start_offset());
997  assert(vtableEntry::size() * wordSize == 8,
998         "adjust the scaling in the code below");
999  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();
1000
1001  if (vtable_index.is_register()) {
1002    lea(method_result, Address(recv_klass,
1003                               vtable_index.as_register(),
1004                               Address::lsl(LogBytesPerWord)));
1005    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
1006  } else {
1007    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
1008    ldr(method_result, Address(recv_klass, vtable_offset_in_bytes));
1009  }
1010}
1011
1012void MacroAssembler::check_klass_subtype(Register sub_klass,
1013                           Register super_klass,
1014                           Register temp_reg,
1015                           Label& L_success) {
1016  Label L_failure;
1017  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
1018  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
1019  bind(L_failure);
1020}
1021
1022
1023void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1024                                                   Register super_klass,
1025                                                   Register temp_reg,
1026                                                   Label* L_success,
1027                                                   Label* L_failure,
1028                                                   Label* L_slow_path,
1029                                        RegisterOrConstant super_check_offset) {
1030  assert_different_registers(sub_klass, super_klass, temp_reg);
1031  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1032  if (super_check_offset.is_register()) {
1033    assert_different_registers(sub_klass, super_klass,
1034                               super_check_offset.as_register());
1035  } else if (must_load_sco) {
1036    assert(temp_reg != noreg, "supply either a temp or a register offset");
1037  }
1038
1039  Label L_fallthrough;
1040  int label_nulls = 0;
1041  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1042  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1043  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
1044  assert(label_nulls <= 1, "at most one NULL in the batch");
1045
1046  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1047  int sco_offset = in_bytes(Klass::super_check_offset_offset());
1048  Address super_check_offset_addr(super_klass, sco_offset);
1049
1050  // Hacked jmp, which may only be used just before L_fallthrough.
1051#define final_jmp(label)                                                \
1052  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
1053  else                            b(label)                /*omit semi*/
1054
1055  // If the pointers are equal, we are done (e.g., String[] elements).
1056  // This self-check enables sharing of secondary supertype arrays among
1057  // non-primary types such as array-of-interface.  Otherwise, each such
1058  // type would need its own customized SSA.
1059  // We move this check to the front of the fast path because many
1060  // type checks are in fact trivially successful in this manner,
1061  // so we get a nicely predicted branch right at the start of the check.
1062  cmp(sub_klass, super_klass);
1063  br(Assembler::EQ, *L_success);
1064
1065  // Check the supertype display:
1066  if (must_load_sco) {
1067    ldrw(temp_reg, super_check_offset_addr);
1068    super_check_offset = RegisterOrConstant(temp_reg);
1069  }
1070  Address super_check_addr(sub_klass, super_check_offset);
1071  ldr(rscratch1, super_check_addr);
1072  cmp(super_klass, rscratch1); // load displayed supertype
1073
1074  // This check has worked decisively for primary supers.
1075  // Secondary supers are sought in the super_cache ('super_cache_addr').
1076  // (Secondary supers are interfaces and very deeply nested subtypes.)
1077  // This works in the same check above because of a tricky aliasing
1078  // between the super_cache and the primary super display elements.
1079  // (The 'super_check_addr' can address either, as the case requires.)
1080  // Note that the cache is updated below if it does not help us find
1081  // what we need immediately.
1082  // So if it was a primary super, we can just fail immediately.
1083  // Otherwise, it's the slow path for us (no success at this point).
1084
1085  if (super_check_offset.is_register()) {
1086    br(Assembler::EQ, *L_success);
1087    cmp(super_check_offset.as_register(), sc_offset);
1088    if (L_failure == &L_fallthrough) {
1089      br(Assembler::EQ, *L_slow_path);
1090    } else {
1091      br(Assembler::NE, *L_failure);
1092      final_jmp(*L_slow_path);
1093    }
1094  } else if (super_check_offset.as_constant() == sc_offset) {
1095    // Need a slow path; fast failure is impossible.
1096    if (L_slow_path == &L_fallthrough) {
1097      br(Assembler::EQ, *L_success);
1098    } else {
1099      br(Assembler::NE, *L_slow_path);
1100      final_jmp(*L_success);
1101    }
1102  } else {
1103    // No slow path; it's a fast decision.
1104    if (L_failure == &L_fallthrough) {
1105      br(Assembler::EQ, *L_success);
1106    } else {
1107      br(Assembler::NE, *L_failure);
1108      final_jmp(*L_success);
1109    }
1110  }
1111
1112  bind(L_fallthrough);
1113
1114#undef final_jmp
1115}
1116
1117// These two are taken from x86, but they look generally useful
1118
1119// scans count pointer sized words at [addr] for occurence of value,
1120// generic
1121void MacroAssembler::repne_scan(Register addr, Register value, Register count,
1122                                Register scratch) {
1123  Label Lloop, Lexit;
1124  cbz(count, Lexit);
1125  bind(Lloop);
1126  ldr(scratch, post(addr, wordSize));
1127  cmp(value, scratch);
1128  br(EQ, Lexit);
1129  sub(count, count, 1);
1130  cbnz(count, Lloop);
1131  bind(Lexit);
1132}
1133
1134// scans count 4 byte words at [addr] for occurence of value,
1135// generic
1136void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
1137                                Register scratch) {
1138  Label Lloop, Lexit;
1139  cbz(count, Lexit);
1140  bind(Lloop);
1141  ldrw(scratch, post(addr, wordSize));
1142  cmpw(value, scratch);
1143  br(EQ, Lexit);
1144  sub(count, count, 1);
1145  cbnz(count, Lloop);
1146  bind(Lexit);
1147}
1148
1149void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1150                                                   Register super_klass,
1151                                                   Register temp_reg,
1152                                                   Register temp2_reg,
1153                                                   Label* L_success,
1154                                                   Label* L_failure,
1155                                                   bool set_cond_codes) {
1156  assert_different_registers(sub_klass, super_klass, temp_reg);
1157  if (temp2_reg != noreg)
1158    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
1159#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
1160
1161  Label L_fallthrough;
1162  int label_nulls = 0;
1163  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1164  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1165  assert(label_nulls <= 1, "at most one NULL in the batch");
1166
1167  // a couple of useful fields in sub_klass:
1168  int ss_offset = in_bytes(Klass::secondary_supers_offset());
1169  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1170  Address secondary_supers_addr(sub_klass, ss_offset);
1171  Address super_cache_addr(     sub_klass, sc_offset);
1172
1173  BLOCK_COMMENT("check_klass_subtype_slow_path");
1174
1175  // Do a linear scan of the secondary super-klass chain.
1176  // This code is rarely used, so simplicity is a virtue here.
1177  // The repne_scan instruction uses fixed registers, which we must spill.
1178  // Don't worry too much about pre-existing connections with the input regs.
1179
1180  assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
1181  assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)
1182
1183  // Get super_klass value into r0 (even if it was in r5 or r2).
1184  RegSet pushed_registers;
1185  if (!IS_A_TEMP(r2))    pushed_registers += r2;
1186  if (!IS_A_TEMP(r5))    pushed_registers += r5;
1187
1188  if (super_klass != r0 || UseCompressedOops) {
1189    if (!IS_A_TEMP(r0))   pushed_registers += r0;
1190  }
1191
1192  push(pushed_registers, sp);
1193
1194#ifndef PRODUCT
1195  mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
1196  Address pst_counter_addr(rscratch2);
1197  ldr(rscratch1, pst_counter_addr);
1198  add(rscratch1, rscratch1, 1);
1199  str(rscratch1, pst_counter_addr);
1200#endif //PRODUCT
1201
1202  // We will consult the secondary-super array.
1203  ldr(r5, secondary_supers_addr);
1204  // Load the array length.
1205  ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
1206  // Skip to start of data.
1207  add(r5, r5, Array<Klass*>::base_offset_in_bytes());
1208
1209  cmp(sp, zr); // Clear Z flag; SP is never zero
1210  // Scan R2 words at [R5] for an occurrence of R0.
1211  // Set NZ/Z based on last compare.
1212  repne_scan(r5, r0, r2, rscratch1);
1213
1214  // Unspill the temp. registers:
1215  pop(pushed_registers, sp);
1216
1217  br(Assembler::NE, *L_failure);
1218
1219  // Success.  Cache the super we found and proceed in triumph.
1220  str(super_klass, super_cache_addr);
1221
1222  if (L_success != &L_fallthrough) {
1223    b(*L_success);
1224  }
1225
1226#undef IS_A_TEMP
1227
1228  bind(L_fallthrough);
1229}
1230
1231
1232void MacroAssembler::verify_oop(Register reg, const char* s) {
1233  if (!VerifyOops) return;
1234
1235  // Pass register number to verify_oop_subroutine
1236  const char* b = NULL;
1237  {
1238    ResourceMark rm;
1239    stringStream ss;
1240    ss.print("verify_oop: %s: %s", reg->name(), s);
1241    b = code_string(ss.as_string());
1242  }
1243  BLOCK_COMMENT("verify_oop {");
1244
1245  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1246  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1247
1248  mov(r0, reg);
1249  mov(rscratch1, (address)b);
1250
1251  // call indirectly to solve generation ordering problem
1252  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1253  ldr(rscratch2, Address(rscratch2));
1254  blr(rscratch2);
1255
1256  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1257  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1258
1259  BLOCK_COMMENT("} verify_oop");
1260}
1261
1262void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
1263  if (!VerifyOops) return;
1264
1265  const char* b = NULL;
1266  {
1267    ResourceMark rm;
1268    stringStream ss;
1269    ss.print("verify_oop_addr: %s", s);
1270    b = code_string(ss.as_string());
1271  }
1272  BLOCK_COMMENT("verify_oop_addr {");
1273
1274  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1275  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1276
1277  // addr may contain sp so we will have to adjust it based on the
1278  // pushes that we just did.
1279  if (addr.uses(sp)) {
1280    lea(r0, addr);
1281    ldr(r0, Address(r0, 4 * wordSize));
1282  } else {
1283    ldr(r0, addr);
1284  }
1285  mov(rscratch1, (address)b);
1286
1287  // call indirectly to solve generation ordering problem
1288  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1289  ldr(rscratch2, Address(rscratch2));
1290  blr(rscratch2);
1291
1292  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1293  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1294
1295  BLOCK_COMMENT("} verify_oop_addr");
1296}
1297
1298Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
1299                                         int extra_slot_offset) {
1300  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1301  int stackElementSize = Interpreter::stackElementSize;
1302  int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
1303#ifdef ASSERT
1304  int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
1305  assert(offset1 - offset == stackElementSize, "correct arithmetic");
1306#endif
1307  if (arg_slot.is_constant()) {
1308    return Address(esp, arg_slot.as_constant() * stackElementSize
1309                   + offset);
1310  } else {
1311    add(rscratch1, esp, arg_slot.as_register(),
1312        ext::uxtx, exact_log2(stackElementSize));
1313    return Address(rscratch1, offset);
1314  }
1315}
1316
1317void MacroAssembler::call_VM_leaf_base(address entry_point,
1318                                       int number_of_arguments,
1319                                       Label *retaddr) {
1320  call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr);
1321}
1322
1323void MacroAssembler::call_VM_leaf_base1(address entry_point,
1324                                        int number_of_gp_arguments,
1325                                        int number_of_fp_arguments,
1326                                        ret_type type,
1327                                        Label *retaddr) {
1328  Label E, L;
1329
1330  stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
1331
1332  // We add 1 to number_of_arguments because the thread in arg0 is
1333  // not counted
1334  mov(rscratch1, entry_point);
1335  blrt(rscratch1, number_of_gp_arguments + 1, number_of_fp_arguments, type);
1336  if (retaddr)
1337    bind(*retaddr);
1338
1339  ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
1340  maybe_isb();
1341}
1342
1343void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1344  call_VM_leaf_base(entry_point, number_of_arguments);
1345}
1346
1347void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1348  pass_arg0(this, arg_0);
1349  call_VM_leaf_base(entry_point, 1);
1350}
1351
1352void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1353  pass_arg0(this, arg_0);
1354  pass_arg1(this, arg_1);
1355  call_VM_leaf_base(entry_point, 2);
1356}
1357
1358void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
1359                                  Register arg_1, Register arg_2) {
1360  pass_arg0(this, arg_0);
1361  pass_arg1(this, arg_1);
1362  pass_arg2(this, arg_2);
1363  call_VM_leaf_base(entry_point, 3);
1364}
1365
1366void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1367  pass_arg0(this, arg_0);
1368  MacroAssembler::call_VM_leaf_base(entry_point, 1);
1369}
1370
1371void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1372
1373  assert(arg_0 != c_rarg1, "smashed arg");
1374  pass_arg1(this, arg_1);
1375  pass_arg0(this, arg_0);
1376  MacroAssembler::call_VM_leaf_base(entry_point, 2);
1377}
1378
1379void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1380  assert(arg_0 != c_rarg2, "smashed arg");
1381  assert(arg_1 != c_rarg2, "smashed arg");
1382  pass_arg2(this, arg_2);
1383  assert(arg_0 != c_rarg1, "smashed arg");
1384  pass_arg1(this, arg_1);
1385  pass_arg0(this, arg_0);
1386  MacroAssembler::call_VM_leaf_base(entry_point, 3);
1387}
1388
1389void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1390  assert(arg_0 != c_rarg3, "smashed arg");
1391  assert(arg_1 != c_rarg3, "smashed arg");
1392  assert(arg_2 != c_rarg3, "smashed arg");
1393  pass_arg3(this, arg_3);
1394  assert(arg_0 != c_rarg2, "smashed arg");
1395  assert(arg_1 != c_rarg2, "smashed arg");
1396  pass_arg2(this, arg_2);
1397  assert(arg_0 != c_rarg1, "smashed arg");
1398  pass_arg1(this, arg_1);
1399  pass_arg0(this, arg_0);
1400  MacroAssembler::call_VM_leaf_base(entry_point, 4);
1401}
1402
1403void MacroAssembler::null_check(Register reg, int offset) {
1404  if (needs_explicit_null_check(offset)) {
1405    // provoke OS NULL exception if reg = NULL by
1406    // accessing M[reg] w/o changing any registers
1407    // NOTE: this is plenty to provoke a segv
1408    ldr(zr, Address(reg));
1409  } else {
1410    // nothing to do, (later) access of M[reg + offset]
1411    // will provoke OS NULL exception if reg = NULL
1412  }
1413}
1414
1415// MacroAssembler protected routines needed to implement
1416// public methods
1417
1418void MacroAssembler::mov(Register r, Address dest) {
1419  code_section()->relocate(pc(), dest.rspec());
1420  u_int64_t imm64 = (u_int64_t)dest.target();
1421  movptr(r, imm64);
1422}
1423
1424// Move a constant pointer into r.  In AArch64 mode the virtual
1425// address space is 48 bits in size, so we only need three
1426// instructions to create a patchable instruction sequence that can
1427// reach anywhere.
1428void MacroAssembler::movptr(Register r, uintptr_t imm64) {
1429#ifndef PRODUCT
1430  {
1431    char buffer[64];
1432    snprintf(buffer, sizeof(buffer), "0x%"PRIX64, imm64);
1433    block_comment(buffer);
1434  }
1435#endif
1436  assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
1437  movz(r, imm64 & 0xffff);
1438  imm64 >>= 16;
1439  movk(r, imm64 & 0xffff, 16);
1440  imm64 >>= 16;
1441  movk(r, imm64 & 0xffff, 32);
1442}
1443
1444// Macro to mov replicated immediate to vector register.
1445//  Vd will get the following values for different arrangements in T
1446//   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
1447//   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
1448//   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
1449//   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
1450//   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
1451//   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
1452//   T1D/T2D: invalid
1453void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
1454  assert(T != T1D && T != T2D, "invalid arrangement");
1455  if (T == T8B || T == T16B) {
1456    assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
1457    movi(Vd, T, imm32 & 0xff, 0);
1458    return;
1459  }
1460  u_int32_t nimm32 = ~imm32;
1461  if (T == T4H || T == T8H) {
1462    assert((imm32  & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
1463    imm32 &= 0xffff;
1464    nimm32 &= 0xffff;
1465  }
1466  u_int32_t x = imm32;
1467  int movi_cnt = 0;
1468  int movn_cnt = 0;
1469  while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
1470  x = nimm32;
1471  while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
1472  if (movn_cnt < movi_cnt) imm32 = nimm32;
1473  unsigned lsl = 0;
1474  while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1475  if (movn_cnt < movi_cnt)
1476    mvni(Vd, T, imm32 & 0xff, lsl);
1477  else
1478    movi(Vd, T, imm32 & 0xff, lsl);
1479  imm32 >>= 8; lsl += 8;
1480  while (imm32) {
1481    while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1482    if (movn_cnt < movi_cnt)
1483      bici(Vd, T, imm32 & 0xff, lsl);
1484    else
1485      orri(Vd, T, imm32 & 0xff, lsl);
1486    lsl += 8; imm32 >>= 8;
1487  }
1488}
1489
1490void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
1491{
1492#ifndef PRODUCT
1493  {
1494    char buffer[64];
1495    snprintf(buffer, sizeof(buffer), "0x%"PRIX64, imm64);
1496    block_comment(buffer);
1497  }
1498#endif
1499  if (operand_valid_for_logical_immediate(false, imm64)) {
1500    orr(dst, zr, imm64);
1501  } else {
1502    // we can use a combination of MOVZ or MOVN with
1503    // MOVK to build up the constant
1504    u_int64_t imm_h[4];
1505    int zero_count = 0;
1506    int neg_count = 0;
1507    int i;
1508    for (i = 0; i < 4; i++) {
1509      imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
1510      if (imm_h[i] == 0) {
1511        zero_count++;
1512      } else if (imm_h[i] == 0xffffL) {
1513        neg_count++;
1514      }
1515    }
1516    if (zero_count == 4) {
1517      // one MOVZ will do
1518      movz(dst, 0);
1519    } else if (neg_count == 4) {
1520      // one MOVN will do
1521      movn(dst, 0);
1522    } else if (zero_count == 3) {
1523      for (i = 0; i < 4; i++) {
1524        if (imm_h[i] != 0L) {
1525          movz(dst, (u_int32_t)imm_h[i], (i << 4));
1526          break;
1527        }
1528      }
1529    } else if (neg_count == 3) {
1530      // one MOVN will do
1531      for (int i = 0; i < 4; i++) {
1532        if (imm_h[i] != 0xffffL) {
1533          movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1534          break;
1535        }
1536      }
1537    } else if (zero_count == 2) {
1538      // one MOVZ and one MOVK will do
1539      for (i = 0; i < 3; i++) {
1540        if (imm_h[i] != 0L) {
1541          movz(dst, (u_int32_t)imm_h[i], (i << 4));
1542          i++;
1543          break;
1544        }
1545      }
1546      for (;i < 4; i++) {
1547        if (imm_h[i] != 0L) {
1548          movk(dst, (u_int32_t)imm_h[i], (i << 4));
1549        }
1550      }
1551    } else if (neg_count == 2) {
1552      // one MOVN and one MOVK will do
1553      for (i = 0; i < 4; i++) {
1554        if (imm_h[i] != 0xffffL) {
1555          movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1556          i++;
1557          break;
1558        }
1559      }
1560      for (;i < 4; i++) {
1561        if (imm_h[i] != 0xffffL) {
1562          movk(dst, (u_int32_t)imm_h[i], (i << 4));
1563        }
1564      }
1565    } else if (zero_count == 1) {
1566      // one MOVZ and two MOVKs will do
1567      for (i = 0; i < 4; i++) {
1568        if (imm_h[i] != 0L) {
1569          movz(dst, (u_int32_t)imm_h[i], (i << 4));
1570          i++;
1571          break;
1572        }
1573      }
1574      for (;i < 4; i++) {
1575        if (imm_h[i] != 0x0L) {
1576          movk(dst, (u_int32_t)imm_h[i], (i << 4));
1577        }
1578      }
1579    } else if (neg_count == 1) {
1580      // one MOVN and two MOVKs will do
1581      for (i = 0; i < 4; i++) {
1582        if (imm_h[i] != 0xffffL) {
1583          movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1584          i++;
1585          break;
1586        }
1587      }
1588      for (;i < 4; i++) {
1589        if (imm_h[i] != 0xffffL) {
1590          movk(dst, (u_int32_t)imm_h[i], (i << 4));
1591        }
1592      }
1593    } else {
1594      // use a MOVZ and 3 MOVKs (makes it easier to debug)
1595      movz(dst, (u_int32_t)imm_h[0], 0);
1596      for (i = 1; i < 4; i++) {
1597        movk(dst, (u_int32_t)imm_h[i], (i << 4));
1598      }
1599    }
1600  }
1601}
1602
1603void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
1604{
1605#ifndef PRODUCT
1606    {
1607      char buffer[64];
1608      snprintf(buffer, sizeof(buffer), "0x%"PRIX32, imm32);
1609      block_comment(buffer);
1610    }
1611#endif
1612  if (operand_valid_for_logical_immediate(true, imm32)) {
1613    orrw(dst, zr, imm32);
1614  } else {
1615    // we can use MOVZ, MOVN or two calls to MOVK to build up the
1616    // constant
1617    u_int32_t imm_h[2];
1618    imm_h[0] = imm32 & 0xffff;
1619    imm_h[1] = ((imm32 >> 16) & 0xffff);
1620    if (imm_h[0] == 0) {
1621      movzw(dst, imm_h[1], 16);
1622    } else if (imm_h[0] == 0xffff) {
1623      movnw(dst, imm_h[1] ^ 0xffff, 16);
1624    } else if (imm_h[1] == 0) {
1625      movzw(dst, imm_h[0], 0);
1626    } else if (imm_h[1] == 0xffff) {
1627      movnw(dst, imm_h[0] ^ 0xffff, 0);
1628    } else {
1629      // use a MOVZ and MOVK (makes it easier to debug)
1630      movzw(dst, imm_h[0], 0);
1631      movkw(dst, imm_h[1], 16);
1632    }
1633  }
1634}
1635
1636// Form an address from base + offset in Rd.  Rd may or may
1637// not actually be used: you must use the Address that is returned.
1638// It is up to you to ensure that the shift provided matches the size
1639// of your data.
1640Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
1641  if (Address::offset_ok_for_immed(byte_offset, shift))
1642    // It fits; no need for any heroics
1643    return Address(base, byte_offset);
1644
1645  // Don't do anything clever with negative or misaligned offsets
1646  unsigned mask = (1 << shift) - 1;
1647  if (byte_offset < 0 || byte_offset & mask) {
1648    mov(Rd, byte_offset);
1649    add(Rd, base, Rd);
1650    return Address(Rd);
1651  }
1652
1653  // See if we can do this with two 12-bit offsets
1654  {
1655    unsigned long word_offset = byte_offset >> shift;
1656    unsigned long masked_offset = word_offset & 0xfff000;
1657    if (Address::offset_ok_for_immed(word_offset - masked_offset)
1658        && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
1659      add(Rd, base, masked_offset << shift);
1660      word_offset -= masked_offset;
1661      return Address(Rd, word_offset << shift);
1662    }
1663  }
1664
1665  // Do it the hard way
1666  mov(Rd, byte_offset);
1667  add(Rd, base, Rd);
1668  return Address(Rd);
1669}
1670
1671void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
1672  if (UseLSE) {
1673    mov(tmp, 1);
1674    ldadd(Assembler::word, tmp, zr, counter_addr);
1675    return;
1676  }
1677  Label retry_load;
1678  if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
1679    prfm(Address(counter_addr), PSTL1STRM);
1680  bind(retry_load);
1681  // flush and load exclusive from the memory location
1682  ldxrw(tmp, counter_addr);
1683  addw(tmp, tmp, 1);
1684  // if we store+flush with no intervening write tmp wil be zero
1685  stxrw(tmp2, tmp, counter_addr);
1686  cbnzw(tmp2, retry_load);
1687}
1688
1689
1690int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
1691                                    bool want_remainder, Register scratch)
1692{
1693  // Full implementation of Java idiv and irem.  The function
1694  // returns the (pc) offset of the div instruction - may be needed
1695  // for implicit exceptions.
1696  //
1697  // constraint : ra/rb =/= scratch
1698  //         normal case
1699  //
1700  // input : ra: dividend
1701  //         rb: divisor
1702  //
1703  // result: either
1704  //         quotient  (= ra idiv rb)
1705  //         remainder (= ra irem rb)
1706
1707  assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1708
1709  int idivl_offset = offset();
1710  if (! want_remainder) {
1711    sdivw(result, ra, rb);
1712  } else {
1713    sdivw(scratch, ra, rb);
1714    Assembler::msubw(result, scratch, rb, ra);
1715  }
1716
1717  return idivl_offset;
1718}
1719
1720int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
1721                                    bool want_remainder, Register scratch)
1722{
1723  // Full implementation of Java ldiv and lrem.  The function
1724  // returns the (pc) offset of the div instruction - may be needed
1725  // for implicit exceptions.
1726  //
1727  // constraint : ra/rb =/= scratch
1728  //         normal case
1729  //
1730  // input : ra: dividend
1731  //         rb: divisor
1732  //
1733  // result: either
1734  //         quotient  (= ra idiv rb)
1735  //         remainder (= ra irem rb)
1736
1737  assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1738
1739  int idivq_offset = offset();
1740  if (! want_remainder) {
1741    sdiv(result, ra, rb);
1742  } else {
1743    sdiv(scratch, ra, rb);
1744    Assembler::msub(result, scratch, rb, ra);
1745  }
1746
1747  return idivq_offset;
1748}
1749
1750void MacroAssembler::membar(Membar_mask_bits order_constraint) {
1751  address prev = pc() - NativeMembar::instruction_size;
1752  if (prev == code()->last_membar()) {
1753    NativeMembar *bar = NativeMembar_at(prev);
1754    // We are merging two memory barrier instructions.  On AArch64 we
1755    // can do this simply by ORing them together.
1756    bar->set_kind(bar->get_kind() | order_constraint);
1757    BLOCK_COMMENT("merged membar");
1758  } else {
1759    code()->set_last_membar(pc());
1760    dmb(Assembler::barrier(order_constraint));
1761  }
1762}
1763
1764// MacroAssembler routines found actually to be needed
1765
1766void MacroAssembler::push(Register src)
1767{
1768  str(src, Address(pre(esp, -1 * wordSize)));
1769}
1770
1771void MacroAssembler::pop(Register dst)
1772{
1773  ldr(dst, Address(post(esp, 1 * wordSize)));
1774}
1775
1776// Note: load_unsigned_short used to be called load_unsigned_word.
1777int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1778  int off = offset();
1779  ldrh(dst, src);
1780  return off;
1781}
1782
1783int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1784  int off = offset();
1785  ldrb(dst, src);
1786  return off;
1787}
1788
1789int MacroAssembler::load_signed_short(Register dst, Address src) {
1790  int off = offset();
1791  ldrsh(dst, src);
1792  return off;
1793}
1794
1795int MacroAssembler::load_signed_byte(Register dst, Address src) {
1796  int off = offset();
1797  ldrsb(dst, src);
1798  return off;
1799}
1800
1801int MacroAssembler::load_signed_short32(Register dst, Address src) {
1802  int off = offset();
1803  ldrshw(dst, src);
1804  return off;
1805}
1806
1807int MacroAssembler::load_signed_byte32(Register dst, Address src) {
1808  int off = offset();
1809  ldrsbw(dst, src);
1810  return off;
1811}
1812
1813void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1814  switch (size_in_bytes) {
1815  case  8:  ldr(dst, src); break;
1816  case  4:  ldrw(dst, src); break;
1817  case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1818  case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1819  default:  ShouldNotReachHere();
1820  }
1821}
1822
1823void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1824  switch (size_in_bytes) {
1825  case  8:  str(src, dst); break;
1826  case  4:  strw(src, dst); break;
1827  case  2:  strh(src, dst); break;
1828  case  1:  strb(src, dst); break;
1829  default:  ShouldNotReachHere();
1830  }
1831}
1832
1833void MacroAssembler::decrementw(Register reg, int value)
1834{
1835  if (value < 0)  { incrementw(reg, -value);      return; }
1836  if (value == 0) {                               return; }
1837  if (value < (1 << 12)) { subw(reg, reg, value); return; }
1838  /* else */ {
1839    guarantee(reg != rscratch2, "invalid dst for register decrement");
1840    movw(rscratch2, (unsigned)value);
1841    subw(reg, reg, rscratch2);
1842  }
1843}
1844
1845void MacroAssembler::decrement(Register reg, int value)
1846{
1847  if (value < 0)  { increment(reg, -value);      return; }
1848  if (value == 0) {                              return; }
1849  if (value < (1 << 12)) { sub(reg, reg, value); return; }
1850  /* else */ {
1851    assert(reg != rscratch2, "invalid dst for register decrement");
1852    mov(rscratch2, (unsigned long)value);
1853    sub(reg, reg, rscratch2);
1854  }
1855}
1856
1857void MacroAssembler::decrementw(Address dst, int value)
1858{
1859  assert(!dst.uses(rscratch1), "invalid dst for address decrement");
1860  ldrw(rscratch1, dst);
1861  decrementw(rscratch1, value);
1862  strw(rscratch1, dst);
1863}
1864
1865void MacroAssembler::decrement(Address dst, int value)
1866{
1867  assert(!dst.uses(rscratch1), "invalid address for decrement");
1868  ldr(rscratch1, dst);
1869  decrement(rscratch1, value);
1870  str(rscratch1, dst);
1871}
1872
1873void MacroAssembler::incrementw(Register reg, int value)
1874{
1875  if (value < 0)  { decrementw(reg, -value);      return; }
1876  if (value == 0) {                               return; }
1877  if (value < (1 << 12)) { addw(reg, reg, value); return; }
1878  /* else */ {
1879    assert(reg != rscratch2, "invalid dst for register increment");
1880    movw(rscratch2, (unsigned)value);
1881    addw(reg, reg, rscratch2);
1882  }
1883}
1884
1885void MacroAssembler::increment(Register reg, int value)
1886{
1887  if (value < 0)  { decrement(reg, -value);      return; }
1888  if (value == 0) {                              return; }
1889  if (value < (1 << 12)) { add(reg, reg, value); return; }
1890  /* else */ {
1891    assert(reg != rscratch2, "invalid dst for register increment");
1892    movw(rscratch2, (unsigned)value);
1893    add(reg, reg, rscratch2);
1894  }
1895}
1896
1897void MacroAssembler::incrementw(Address dst, int value)
1898{
1899  assert(!dst.uses(rscratch1), "invalid dst for address increment");
1900  ldrw(rscratch1, dst);
1901  incrementw(rscratch1, value);
1902  strw(rscratch1, dst);
1903}
1904
1905void MacroAssembler::increment(Address dst, int value)
1906{
1907  assert(!dst.uses(rscratch1), "invalid dst for address increment");
1908  ldr(rscratch1, dst);
1909  increment(rscratch1, value);
1910  str(rscratch1, dst);
1911}
1912
1913
1914void MacroAssembler::pusha() {
1915  push(0x7fffffff, sp);
1916}
1917
1918void MacroAssembler::popa() {
1919  pop(0x7fffffff, sp);
1920}
1921
1922// Push lots of registers in the bit set supplied.  Don't push sp.
1923// Return the number of words pushed
1924int MacroAssembler::push(unsigned int bitset, Register stack) {
1925  int words_pushed = 0;
1926
1927  // Scan bitset to accumulate register pairs
1928  unsigned char regs[32];
1929  int count = 0;
1930  for (int reg = 0; reg <= 30; reg++) {
1931    if (1 & bitset)
1932      regs[count++] = reg;
1933    bitset >>= 1;
1934  }
1935  regs[count++] = zr->encoding_nocheck();
1936  count &= ~1;  // Only push an even nuber of regs
1937
1938  if (count) {
1939    stp(as_Register(regs[0]), as_Register(regs[1]),
1940       Address(pre(stack, -count * wordSize)));
1941    words_pushed += 2;
1942  }
1943  for (int i = 2; i < count; i += 2) {
1944    stp(as_Register(regs[i]), as_Register(regs[i+1]),
1945       Address(stack, i * wordSize));
1946    words_pushed += 2;
1947  }
1948
1949  assert(words_pushed == count, "oops, pushed != count");
1950
1951  return count;
1952}
1953
1954int MacroAssembler::pop(unsigned int bitset, Register stack) {
1955  int words_pushed = 0;
1956
1957  // Scan bitset to accumulate register pairs
1958  unsigned char regs[32];
1959  int count = 0;
1960  for (int reg = 0; reg <= 30; reg++) {
1961    if (1 & bitset)
1962      regs[count++] = reg;
1963    bitset >>= 1;
1964  }
1965  regs[count++] = zr->encoding_nocheck();
1966  count &= ~1;
1967
1968  for (int i = 2; i < count; i += 2) {
1969    ldp(as_Register(regs[i]), as_Register(regs[i+1]),
1970       Address(stack, i * wordSize));
1971    words_pushed += 2;
1972  }
1973  if (count) {
1974    ldp(as_Register(regs[0]), as_Register(regs[1]),
1975       Address(post(stack, count * wordSize)));
1976    words_pushed += 2;
1977  }
1978
1979  assert(words_pushed == count, "oops, pushed != count");
1980
1981  return count;
1982}
1983#ifdef ASSERT
1984void MacroAssembler::verify_heapbase(const char* msg) {
1985#if 0
1986  assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
1987  assert (Universe::heap() != NULL, "java heap should be initialized");
1988  if (CheckCompressedOops) {
1989    Label ok;
1990    push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
1991    cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
1992    br(Assembler::EQ, ok);
1993    stop(msg);
1994    bind(ok);
1995    pop(1 << rscratch1->encoding(), sp);
1996  }
1997#endif
1998}
1999#endif
2000
2001void MacroAssembler::stop(const char* msg) {
2002  address ip = pc();
2003  pusha();
2004  mov(c_rarg0, (address)msg);
2005  mov(c_rarg1, (address)ip);
2006  mov(c_rarg2, sp);
2007  mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
2008  // call(c_rarg3);
2009  blrt(c_rarg3, 3, 0, 1);
2010  hlt(0);
2011}
2012
2013// If a constant does not fit in an immediate field, generate some
2014// number of MOV instructions and then perform the operation.
2015void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
2016                                           add_sub_imm_insn insn1,
2017                                           add_sub_reg_insn insn2) {
2018  assert(Rd != zr, "Rd = zr and not setting flags?");
2019  if (operand_valid_for_add_sub_immediate((int)imm)) {
2020    (this->*insn1)(Rd, Rn, imm);
2021  } else {
2022    if (uabs(imm) < (1 << 24)) {
2023       (this->*insn1)(Rd, Rn, imm & -(1 << 12));
2024       (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
2025    } else {
2026       assert_different_registers(Rd, Rn);
2027       mov(Rd, (uint64_t)imm);
2028       (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2029    }
2030  }
2031}
2032
2033// Seperate vsn which sets the flags. Optimisations are more restricted
2034// because we must set the flags correctly.
2035void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
2036                                           add_sub_imm_insn insn1,
2037                                           add_sub_reg_insn insn2) {
2038  if (operand_valid_for_add_sub_immediate((int)imm)) {
2039    (this->*insn1)(Rd, Rn, imm);
2040  } else {
2041    assert_different_registers(Rd, Rn);
2042    assert(Rd != zr, "overflow in immediate operand");
2043    mov(Rd, (uint64_t)imm);
2044    (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2045  }
2046}
2047
2048
2049void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
2050  if (increment.is_register()) {
2051    add(Rd, Rn, increment.as_register());
2052  } else {
2053    add(Rd, Rn, increment.as_constant());
2054  }
2055}
2056
2057void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
2058  if (increment.is_register()) {
2059    addw(Rd, Rn, increment.as_register());
2060  } else {
2061    addw(Rd, Rn, increment.as_constant());
2062  }
2063}
2064
2065void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
2066  if (decrement.is_register()) {
2067    sub(Rd, Rn, decrement.as_register());
2068  } else {
2069    sub(Rd, Rn, decrement.as_constant());
2070  }
2071}
2072
2073void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
2074  if (decrement.is_register()) {
2075    subw(Rd, Rn, decrement.as_register());
2076  } else {
2077    subw(Rd, Rn, decrement.as_constant());
2078  }
2079}
2080
2081void MacroAssembler::reinit_heapbase()
2082{
2083  if (UseCompressedOops) {
2084    if (Universe::is_fully_initialized()) {
2085      mov(rheapbase, Universe::narrow_ptrs_base());
2086    } else {
2087      lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2088      ldr(rheapbase, Address(rheapbase));
2089    }
2090  }
2091}
2092
2093// this simulates the behaviour of the x86 cmpxchg instruction using a
2094// load linked/store conditional pair. we use the acquire/release
2095// versions of these instructions so that we flush pending writes as
2096// per Java semantics.
2097
2098// n.b the x86 version assumes the old value to be compared against is
2099// in rax and updates rax with the value located in memory if the
2100// cmpxchg fails. we supply a register for the old value explicitly
2101
2102// the aarch64 load linked/store conditional instructions do not
2103// accept an offset. so, unlike x86, we must provide a plain register
2104// to identify the memory word to be compared/exchanged rather than a
2105// register+offset Address.
2106
2107void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2108                                Label &succeed, Label *fail) {
2109  // oldv holds comparison value
2110  // newv holds value to write in exchange
2111  // addr identifies memory word to compare against/update
2112  if (UseLSE) {
2113    mov(tmp, oldv);
2114    casal(Assembler::xword, oldv, newv, addr);
2115    cmp(tmp, oldv);
2116    br(Assembler::EQ, succeed);
2117    membar(AnyAny);
2118  } else {
2119    Label retry_load, nope;
2120    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2121      prfm(Address(addr), PSTL1STRM);
2122    bind(retry_load);
2123    // flush and load exclusive from the memory location
2124    // and fail if it is not what we expect
2125    ldaxr(tmp, addr);
2126    cmp(tmp, oldv);
2127    br(Assembler::NE, nope);
2128    // if we store+flush with no intervening write tmp wil be zero
2129    stlxr(tmp, newv, addr);
2130    cbzw(tmp, succeed);
2131    // retry so we only ever return after a load fails to compare
2132    // ensures we don't return a stale value after a failed write.
2133    b(retry_load);
2134    // if the memory word differs we return it in oldv and signal a fail
2135    bind(nope);
2136    membar(AnyAny);
2137    mov(oldv, tmp);
2138  }
2139  if (fail)
2140    b(*fail);
2141}
2142
2143void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
2144                                Label &succeed, Label *fail) {
2145  // oldv holds comparison value
2146  // newv holds value to write in exchange
2147  // addr identifies memory word to compare against/update
2148  // tmp returns 0/1 for success/failure
2149  if (UseLSE) {
2150    mov(tmp, oldv);
2151    casal(Assembler::word, oldv, newv, addr);
2152    cmp(tmp, oldv);
2153    br(Assembler::EQ, succeed);
2154    membar(AnyAny);
2155  } else {
2156    Label retry_load, nope;
2157    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2158      prfm(Address(addr), PSTL1STRM);
2159    bind(retry_load);
2160    // flush and load exclusive from the memory location
2161    // and fail if it is not what we expect
2162    ldaxrw(tmp, addr);
2163    cmp(tmp, oldv);
2164    br(Assembler::NE, nope);
2165    // if we store+flush with no intervening write tmp wil be zero
2166    stlxrw(tmp, newv, addr);
2167    cbzw(tmp, succeed);
2168    // retry so we only ever return after a load fails to compare
2169    // ensures we don't return a stale value after a failed write.
2170    b(retry_load);
2171    // if the memory word differs we return it in oldv and signal a fail
2172    bind(nope);
2173    membar(AnyAny);
2174    mov(oldv, tmp);
2175  }
2176  if (fail)
2177    b(*fail);
2178}
2179
2180// A generic CAS; success or failure is in the EQ flag.  A weak CAS
2181// doesn't retry and may fail spuriously.  If the oldval is wanted,
2182// Pass a register for the result, otherwise pass noreg.
2183
2184// Clobbers rscratch1
2185void MacroAssembler::cmpxchg(Register addr, Register expected,
2186                             Register new_val,
2187                             enum operand_size size,
2188                             bool acquire, bool release,
2189                             bool weak,
2190                             Register result) {
2191  if (result == noreg)  result = rscratch1;
2192  if (UseLSE) {
2193    mov(result, expected);
2194    lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
2195    cmp(result, expected);
2196  } else {
2197    BLOCK_COMMENT("cmpxchg {");
2198    Label retry_load, done;
2199    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2200      prfm(Address(addr), PSTL1STRM);
2201    bind(retry_load);
2202    load_exclusive(result, addr, size, acquire);
2203    if (size == xword)
2204      cmp(result, expected);
2205    else
2206      cmpw(result, expected);
2207    br(Assembler::NE, done);
2208    store_exclusive(rscratch1, new_val, addr, size, release);
2209    if (weak) {
2210      cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
2211    } else {
2212      cbnzw(rscratch1, retry_load);
2213    }
2214    bind(done);
2215    BLOCK_COMMENT("} cmpxchg");
2216  }
2217}
2218
2219static bool different(Register a, RegisterOrConstant b, Register c) {
2220  if (b.is_constant())
2221    return a != c;
2222  else
2223    return a != b.as_register() && a != c && b.as_register() != c;
2224}
2225
2226#define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
2227void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
2228  if (UseLSE) {                                                         \
2229    prev = prev->is_valid() ? prev : zr;                                \
2230    if (incr.is_register()) {                                           \
2231      AOP(sz, incr.as_register(), prev, addr);                          \
2232    } else {                                                            \
2233      mov(rscratch2, incr.as_constant());                               \
2234      AOP(sz, rscratch2, prev, addr);                                   \
2235    }                                                                   \
2236    return;                                                             \
2237  }                                                                     \
2238  Register result = rscratch2;                                          \
2239  if (prev->is_valid())                                                 \
2240    result = different(prev, incr, addr) ? prev : rscratch2;            \
2241                                                                        \
2242  Label retry_load;                                                     \
2243  if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2244    prfm(Address(addr), PSTL1STRM);                                     \
2245  bind(retry_load);                                                     \
2246  LDXR(result, addr);                                                   \
2247  OP(rscratch1, result, incr);                                          \
2248  STXR(rscratch2, rscratch1, addr);                                     \
2249  cbnzw(rscratch2, retry_load);                                         \
2250  if (prev->is_valid() && prev != result) {                             \
2251    IOP(prev, rscratch1, incr);                                         \
2252  }                                                                     \
2253}
2254
2255ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
2256ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
2257ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
2258ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)
2259
2260#undef ATOMIC_OP
2261
2262#define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
2263void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
2264  if (UseLSE) {                                                         \
2265    prev = prev->is_valid() ? prev : zr;                                \
2266    AOP(sz, newv, prev, addr);                                          \
2267    return;                                                             \
2268  }                                                                     \
2269  Register result = rscratch2;                                          \
2270  if (prev->is_valid())                                                 \
2271    result = different(prev, newv, addr) ? prev : rscratch2;            \
2272                                                                        \
2273  Label retry_load;                                                     \
2274  if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2275    prfm(Address(addr), PSTL1STRM);                                     \
2276  bind(retry_load);                                                     \
2277  LDXR(result, addr);                                                   \
2278  STXR(rscratch1, newv, addr);                                          \
2279  cbnzw(rscratch1, retry_load);                                         \
2280  if (prev->is_valid() && prev != result)                               \
2281    mov(prev, result);                                                  \
2282}
2283
2284ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
2285ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
2286ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
2287ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)
2288
2289#undef ATOMIC_XCHG
2290
2291void MacroAssembler::incr_allocated_bytes(Register thread,
2292                                          Register var_size_in_bytes,
2293                                          int con_size_in_bytes,
2294                                          Register t1) {
2295  if (!thread->is_valid()) {
2296    thread = rthread;
2297  }
2298  assert(t1->is_valid(), "need temp reg");
2299
2300  ldr(t1, Address(thread, in_bytes(JavaThread::allocated_bytes_offset())));
2301  if (var_size_in_bytes->is_valid()) {
2302    add(t1, t1, var_size_in_bytes);
2303  } else {
2304    add(t1, t1, con_size_in_bytes);
2305  }
2306  str(t1, Address(thread, in_bytes(JavaThread::allocated_bytes_offset())));
2307}
2308
2309#ifndef PRODUCT
2310extern "C" void findpc(intptr_t x);
2311#endif
2312
2313void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
2314{
2315  // In order to get locks to work, we need to fake a in_VM state
2316  if (ShowMessageBoxOnError ) {
2317    JavaThread* thread = JavaThread::current();
2318    JavaThreadState saved_state = thread->thread_state();
2319    thread->set_thread_state(_thread_in_vm);
2320#ifndef PRODUCT
2321    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
2322      ttyLocker ttyl;
2323      BytecodeCounter::print();
2324    }
2325#endif
2326    if (os::message_box(msg, "Execution stopped, print registers?")) {
2327      ttyLocker ttyl;
2328      tty->print_cr(" pc = 0x%016lx", pc);
2329#ifndef PRODUCT
2330      tty->cr();
2331      findpc(pc);
2332      tty->cr();
2333#endif
2334      tty->print_cr(" r0 = 0x%016lx", regs[0]);
2335      tty->print_cr(" r1 = 0x%016lx", regs[1]);
2336      tty->print_cr(" r2 = 0x%016lx", regs[2]);
2337      tty->print_cr(" r3 = 0x%016lx", regs[3]);
2338      tty->print_cr(" r4 = 0x%016lx", regs[4]);
2339      tty->print_cr(" r5 = 0x%016lx", regs[5]);
2340      tty->print_cr(" r6 = 0x%016lx", regs[6]);
2341      tty->print_cr(" r7 = 0x%016lx", regs[7]);
2342      tty->print_cr(" r8 = 0x%016lx", regs[8]);
2343      tty->print_cr(" r9 = 0x%016lx", regs[9]);
2344      tty->print_cr("r10 = 0x%016lx", regs[10]);
2345      tty->print_cr("r11 = 0x%016lx", regs[11]);
2346      tty->print_cr("r12 = 0x%016lx", regs[12]);
2347      tty->print_cr("r13 = 0x%016lx", regs[13]);
2348      tty->print_cr("r14 = 0x%016lx", regs[14]);
2349      tty->print_cr("r15 = 0x%016lx", regs[15]);
2350      tty->print_cr("r16 = 0x%016lx", regs[16]);
2351      tty->print_cr("r17 = 0x%016lx", regs[17]);
2352      tty->print_cr("r18 = 0x%016lx", regs[18]);
2353      tty->print_cr("r19 = 0x%016lx", regs[19]);
2354      tty->print_cr("r20 = 0x%016lx", regs[20]);
2355      tty->print_cr("r21 = 0x%016lx", regs[21]);
2356      tty->print_cr("r22 = 0x%016lx", regs[22]);
2357      tty->print_cr("r23 = 0x%016lx", regs[23]);
2358      tty->print_cr("r24 = 0x%016lx", regs[24]);
2359      tty->print_cr("r25 = 0x%016lx", regs[25]);
2360      tty->print_cr("r26 = 0x%016lx", regs[26]);
2361      tty->print_cr("r27 = 0x%016lx", regs[27]);
2362      tty->print_cr("r28 = 0x%016lx", regs[28]);
2363      tty->print_cr("r30 = 0x%016lx", regs[30]);
2364      tty->print_cr("r31 = 0x%016lx", regs[31]);
2365      BREAKPOINT;
2366    }
2367    ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
2368  } else {
2369    ttyLocker ttyl;
2370    ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
2371                    msg);
2372    assert(false, "DEBUG MESSAGE: %s", msg);
2373  }
2374}
2375
2376#ifdef BUILTIN_SIM
2377// routine to generate an x86 prolog for a stub function which
2378// bootstraps into the generated ARM code which directly follows the
2379// stub
2380//
2381// the argument encodes the number of general and fp registers
2382// passed by the caller and the callng convention (currently just
2383// the number of general registers and assumes C argument passing)
2384
2385extern "C" {
2386int aarch64_stub_prolog_size();
2387void aarch64_stub_prolog();
2388void aarch64_prolog();
2389}
2390
2391void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type,
2392                                   address *prolog_ptr)
2393{
2394  int calltype = (((ret_type & 0x3) << 8) |
2395                  ((fp_arg_count & 0xf) << 4) |
2396                  (gp_arg_count & 0xf));
2397
2398  // the addresses for the x86 to ARM entry code we need to use
2399  address start = pc();
2400  // printf("start = %lx\n", start);
2401  int byteCount =  aarch64_stub_prolog_size();
2402  // printf("byteCount = %x\n", byteCount);
2403  int instructionCount = (byteCount + 3)/ 4;
2404  // printf("instructionCount = %x\n", instructionCount);
2405  for (int i = 0; i < instructionCount; i++) {
2406    nop();
2407  }
2408
2409  memcpy(start, (void*)aarch64_stub_prolog, byteCount);
2410
2411  // write the address of the setup routine and the call format at the
2412  // end of into the copied code
2413  u_int64_t *patch_end = (u_int64_t *)(start + byteCount);
2414  if (prolog_ptr)
2415    patch_end[-2] = (u_int64_t)prolog_ptr;
2416  patch_end[-1] = calltype;
2417}
2418#endif
2419
2420void MacroAssembler::push_call_clobbered_registers() {
2421  push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2422
2423  // Push v0-v7, v16-v31.
2424  for (int i = 30; i >= 0; i -= 2) {
2425    if (i <= v7->encoding() || i >= v16->encoding()) {
2426        stpd(as_FloatRegister(i), as_FloatRegister(i+1),
2427             Address(pre(sp, -2 * wordSize)));
2428    }
2429  }
2430}
2431
2432void MacroAssembler::pop_call_clobbered_registers() {
2433
2434  for (int i = 0; i < 32; i += 2) {
2435    if (i <= v7->encoding() || i >= v16->encoding()) {
2436      ldpd(as_FloatRegister(i), as_FloatRegister(i+1),
2437           Address(post(sp, 2 * wordSize)));
2438    }
2439  }
2440
2441  pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2442}
2443
2444void MacroAssembler::push_CPU_state(bool save_vectors) {
2445  push(0x3fffffff, sp);         // integer registers except lr & sp
2446
2447  if (!save_vectors) {
2448    for (int i = 30; i >= 0; i -= 2)
2449      stpd(as_FloatRegister(i), as_FloatRegister(i+1),
2450           Address(pre(sp, -2 * wordSize)));
2451  } else {
2452    for (int i = 30; i >= 0; i -= 2)
2453      stpq(as_FloatRegister(i), as_FloatRegister(i+1),
2454           Address(pre(sp, -4 * wordSize)));
2455  }
2456}
2457
2458void MacroAssembler::pop_CPU_state(bool restore_vectors) {
2459  if (!restore_vectors) {
2460    for (int i = 0; i < 32; i += 2)
2461      ldpd(as_FloatRegister(i), as_FloatRegister(i+1),
2462           Address(post(sp, 2 * wordSize)));
2463  } else {
2464    for (int i = 0; i < 32; i += 2)
2465      ldpq(as_FloatRegister(i), as_FloatRegister(i+1),
2466           Address(post(sp, 4 * wordSize)));
2467  }
2468
2469  pop(0x3fffffff, sp);         // integer registers except lr & sp
2470}
2471
2472/**
2473 * Helpers for multiply_to_len().
2474 */
2475void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
2476                                     Register src1, Register src2) {
2477  adds(dest_lo, dest_lo, src1);
2478  adc(dest_hi, dest_hi, zr);
2479  adds(dest_lo, dest_lo, src2);
2480  adc(final_dest_hi, dest_hi, zr);
2481}
2482
2483// Generate an address from (r + r1 extend offset).  "size" is the
2484// size of the operand.  The result may be in rscratch2.
2485Address MacroAssembler::offsetted_address(Register r, Register r1,
2486                                          Address::extend ext, int offset, int size) {
2487  if (offset || (ext.shift() % size != 0)) {
2488    lea(rscratch2, Address(r, r1, ext));
2489    return Address(rscratch2, offset);
2490  } else {
2491    return Address(r, r1, ext);
2492  }
2493}
2494
2495Address MacroAssembler::spill_address(int size, int offset, Register tmp)
2496{
2497  assert(offset >= 0, "spill to negative address?");
2498  // Offset reachable ?
2499  //   Not aligned - 9 bits signed offset
2500  //   Aligned - 12 bits unsigned offset shifted
2501  Register base = sp;
2502  if ((offset & (size-1)) && offset >= (1<<8)) {
2503    add(tmp, base, offset & ((1<<12)-1));
2504    base = tmp;
2505    offset &= -1<<12;
2506  }
2507
2508  if (offset >= (1<<12) * size) {
2509    add(tmp, base, offset & (((1<<12)-1)<<12));
2510    base = tmp;
2511    offset &= ~(((1<<12)-1)<<12);
2512  }
2513
2514  return Address(base, offset);
2515}
2516
2517/**
2518 * Multiply 64 bit by 64 bit first loop.
2519 */
2520void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
2521                                           Register y, Register y_idx, Register z,
2522                                           Register carry, Register product,
2523                                           Register idx, Register kdx) {
2524  //
2525  //  jlong carry, x[], y[], z[];
2526  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) {
2527  //    huge_128 product = y[idx] * x[xstart] + carry;
2528  //    z[kdx] = (jlong)product;
2529  //    carry  = (jlong)(product >>> 64);
2530  //  }
2531  //  z[xstart] = carry;
2532  //
2533
2534  Label L_first_loop, L_first_loop_exit;
2535  Label L_one_x, L_one_y, L_multiply;
2536
2537  subsw(xstart, xstart, 1);
2538  br(Assembler::MI, L_one_x);
2539
2540  lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
2541  ldr(x_xstart, Address(rscratch1));
2542  ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
2543
2544  bind(L_first_loop);
2545  subsw(idx, idx, 1);
2546  br(Assembler::MI, L_first_loop_exit);
2547  subsw(idx, idx, 1);
2548  br(Assembler::MI, L_one_y);
2549  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2550  ldr(y_idx, Address(rscratch1));
2551  ror(y_idx, y_idx, 32); // convert big-endian to little-endian
2552  bind(L_multiply);
2553
2554  // AArch64 has a multiply-accumulate instruction that we can't use
2555  // here because it has no way to process carries, so we have to use
2556  // separate add and adc instructions.  Bah.
2557  umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
2558  mul(product, x_xstart, y_idx);
2559  adds(product, product, carry);
2560  adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product
2561
2562  subw(kdx, kdx, 2);
2563  ror(product, product, 32); // back to big-endian
2564  str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));
2565
2566  b(L_first_loop);
2567
2568  bind(L_one_y);
2569  ldrw(y_idx, Address(y,  0));
2570  b(L_multiply);
2571
2572  bind(L_one_x);
2573  ldrw(x_xstart, Address(x,  0));
2574  b(L_first_loop);
2575
2576  bind(L_first_loop_exit);
2577}
2578
2579/**
2580 * Multiply 128 bit by 128. Unrolled inner loop.
2581 *
2582 */
2583void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
2584                                             Register carry, Register carry2,
2585                                             Register idx, Register jdx,
2586                                             Register yz_idx1, Register yz_idx2,
2587                                             Register tmp, Register tmp3, Register tmp4,
2588                                             Register tmp6, Register product_hi) {
2589
2590  //   jlong carry, x[], y[], z[];
2591  //   int kdx = ystart+1;
2592  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
2593  //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
2594  //     jlong carry2  = (jlong)(tmp3 >>> 64);
2595  //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
2596  //     carry  = (jlong)(tmp4 >>> 64);
2597  //     z[kdx+idx+1] = (jlong)tmp3;
2598  //     z[kdx+idx] = (jlong)tmp4;
2599  //   }
2600  //   idx += 2;
2601  //   if (idx > 0) {
2602  //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
2603  //     z[kdx+idx] = (jlong)yz_idx1;
2604  //     carry  = (jlong)(yz_idx1 >>> 64);
2605  //   }
2606  //
2607
2608  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
2609
2610  lsrw(jdx, idx, 2);
2611
2612  bind(L_third_loop);
2613
2614  subsw(jdx, jdx, 1);
2615  br(Assembler::MI, L_third_loop_exit);
2616  subw(idx, idx, 4);
2617
2618  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2619
2620  ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));
2621
2622  lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2623
2624  ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
2625  ror(yz_idx2, yz_idx2, 32);
2626
2627  ldp(rscratch2, rscratch1, Address(tmp6, 0));
2628
2629  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2630  umulh(tmp4, product_hi, yz_idx1);
2631
2632  ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
2633  ror(rscratch2, rscratch2, 32);
2634
2635  mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
2636  umulh(carry2, product_hi, yz_idx2);
2637
2638  // propagate sum of both multiplications into carry:tmp4:tmp3
2639  adds(tmp3, tmp3, carry);
2640  adc(tmp4, tmp4, zr);
2641  adds(tmp3, tmp3, rscratch1);
2642  adcs(tmp4, tmp4, tmp);
2643  adc(carry, carry2, zr);
2644  adds(tmp4, tmp4, rscratch2);
2645  adc(carry, carry, zr);
2646
2647  ror(tmp3, tmp3, 32); // convert little-endian to big-endian
2648  ror(tmp4, tmp4, 32);
2649  stp(tmp4, tmp3, Address(tmp6, 0));
2650
2651  b(L_third_loop);
2652  bind (L_third_loop_exit);
2653
2654  andw (idx, idx, 0x3);
2655  cbz(idx, L_post_third_loop_done);
2656
2657  Label L_check_1;
2658  subsw(idx, idx, 2);
2659  br(Assembler::MI, L_check_1);
2660
2661  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2662  ldr(yz_idx1, Address(rscratch1, 0));
2663  ror(yz_idx1, yz_idx1, 32);
2664  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2665  umulh(tmp4, product_hi, yz_idx1);
2666  lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2667  ldr(yz_idx2, Address(rscratch1, 0));
2668  ror(yz_idx2, yz_idx2, 32);
2669
2670  add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);
2671
2672  ror(tmp3, tmp3, 32);
2673  str(tmp3, Address(rscratch1, 0));
2674
2675  bind (L_check_1);
2676
2677  andw (idx, idx, 0x1);
2678  subsw(idx, idx, 1);
2679  br(Assembler::MI, L_post_third_loop_done);
2680  ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2681  mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
2682  umulh(carry2, tmp4, product_hi);
2683  ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2684
2685  add2_with_carry(carry2, tmp3, tmp4, carry);
2686
2687  strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2688  extr(carry, carry2, tmp3, 32);
2689
2690  bind(L_post_third_loop_done);
2691}
2692
2693/**
2694 * Code for BigInteger::multiplyToLen() instrinsic.
2695 *
2696 * r0: x
2697 * r1: xlen
2698 * r2: y
2699 * r3: ylen
2700 * r4:  z
2701 * r5: zlen
2702 * r10: tmp1
2703 * r11: tmp2
2704 * r12: tmp3
2705 * r13: tmp4
2706 * r14: tmp5
2707 * r15: tmp6
2708 * r16: tmp7
2709 *
2710 */
2711void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
2712                                     Register z, Register zlen,
2713                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4,
2714                                     Register tmp5, Register tmp6, Register product_hi) {
2715
2716  assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
2717
2718  const Register idx = tmp1;
2719  const Register kdx = tmp2;
2720  const Register xstart = tmp3;
2721
2722  const Register y_idx = tmp4;
2723  const Register carry = tmp5;
2724  const Register product  = xlen;
2725  const Register x_xstart = zlen;  // reuse register
2726
2727  // First Loop.
2728  //
2729  //  final static long LONG_MASK = 0xffffffffL;
2730  //  int xstart = xlen - 1;
2731  //  int ystart = ylen - 1;
2732  //  long carry = 0;
2733  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) {
2734  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
2735  //    z[kdx] = (int)product;
2736  //    carry = product >>> 32;
2737  //  }
2738  //  z[xstart] = (int)carry;
2739  //
2740
2741  movw(idx, ylen);      // idx = ylen;
2742  movw(kdx, zlen);      // kdx = xlen+ylen;
2743  mov(carry, zr);       // carry = 0;
2744
2745  Label L_done;
2746
2747  movw(xstart, xlen);
2748  subsw(xstart, xstart, 1);
2749  br(Assembler::MI, L_done);
2750
2751  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
2752
2753  Label L_second_loop;
2754  cbzw(kdx, L_second_loop);
2755
2756  Label L_carry;
2757  subw(kdx, kdx, 1);
2758  cbzw(kdx, L_carry);
2759
2760  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
2761  lsr(carry, carry, 32);
2762  subw(kdx, kdx, 1);
2763
2764  bind(L_carry);
2765  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
2766
2767  // Second and third (nested) loops.
2768  //
2769  // for (int i = xstart-1; i >= 0; i--) { // Second loop
2770  //   carry = 0;
2771  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
2772  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
2773  //                    (z[k] & LONG_MASK) + carry;
2774  //     z[k] = (int)product;
2775  //     carry = product >>> 32;
2776  //   }
2777  //   z[i] = (int)carry;
2778  // }
2779  //
2780  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
2781
2782  const Register jdx = tmp1;
2783
2784  bind(L_second_loop);
2785  mov(carry, zr);                // carry = 0;
2786  movw(jdx, ylen);               // j = ystart+1
2787
2788  subsw(xstart, xstart, 1);      // i = xstart-1;
2789  br(Assembler::MI, L_done);
2790
2791  str(z, Address(pre(sp, -4 * wordSize)));
2792
2793  Label L_last_x;
2794  lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
2795  subsw(xstart, xstart, 1);       // i = xstart-1;
2796  br(Assembler::MI, L_last_x);
2797
2798  lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
2799  ldr(product_hi, Address(rscratch1));
2800  ror(product_hi, product_hi, 32);  // convert big-endian to little-endian
2801
2802  Label L_third_loop_prologue;
2803  bind(L_third_loop_prologue);
2804
2805  str(ylen, Address(sp, wordSize));
2806  stp(x, xstart, Address(sp, 2 * wordSize));
2807  multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
2808                          tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
2809  ldp(z, ylen, Address(post(sp, 2 * wordSize)));
2810  ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen
2811
2812  addw(tmp3, xlen, 1);
2813  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
2814  subsw(tmp3, tmp3, 1);
2815  br(Assembler::MI, L_done);
2816
2817  lsr(carry, carry, 32);
2818  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
2819  b(L_second_loop);
2820
2821  // Next infrequent code is moved outside loops.
2822  bind(L_last_x);
2823  ldrw(product_hi, Address(x,  0));
2824  b(L_third_loop_prologue);
2825
2826  bind(L_done);
2827}
2828
2829/**
2830 * Emits code to update CRC-32 with a byte value according to constants in table
2831 *
2832 * @param [in,out]crc   Register containing the crc.
2833 * @param [in]val       Register containing the byte to fold into the CRC.
2834 * @param [in]table     Register containing the table of crc constants.
2835 *
2836 * uint32_t crc;
2837 * val = crc_table[(val ^ crc) & 0xFF];
2838 * crc = val ^ (crc >> 8);
2839 *
2840 */
2841void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
2842  eor(val, val, crc);
2843  andr(val, val, 0xff);
2844  ldrw(val, Address(table, val, Address::lsl(2)));
2845  eor(crc, val, crc, Assembler::LSR, 8);
2846}
2847
2848/**
2849 * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
2850 *
2851 * @param [in,out]crc   Register containing the crc.
2852 * @param [in]v         Register containing the 32-bit to fold into the CRC.
2853 * @param [in]table0    Register containing table 0 of crc constants.
2854 * @param [in]table1    Register containing table 1 of crc constants.
2855 * @param [in]table2    Register containing table 2 of crc constants.
2856 * @param [in]table3    Register containing table 3 of crc constants.
2857 *
2858 * uint32_t crc;
2859 *   v = crc ^ v
2860 *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
2861 *
2862 */
2863void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
2864        Register table0, Register table1, Register table2, Register table3,
2865        bool upper) {
2866  eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
2867  uxtb(tmp, v);
2868  ldrw(crc, Address(table3, tmp, Address::lsl(2)));
2869  ubfx(tmp, v, 8, 8);
2870  ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
2871  eor(crc, crc, tmp);
2872  ubfx(tmp, v, 16, 8);
2873  ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
2874  eor(crc, crc, tmp);
2875  ubfx(tmp, v, 24, 8);
2876  ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
2877  eor(crc, crc, tmp);
2878}
2879
2880/**
2881 * @param crc   register containing existing CRC (32-bit)
2882 * @param buf   register pointing to input byte buffer (byte*)
2883 * @param len   register containing number of bytes
2884 * @param table register that will contain address of CRC table
2885 * @param tmp   scratch register
2886 */
2887void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
2888        Register table0, Register table1, Register table2, Register table3,
2889        Register tmp, Register tmp2, Register tmp3) {
2890  Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
2891  unsigned long offset;
2892
2893    ornw(crc, zr, crc);
2894
2895  if (UseCRC32) {
2896    Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop;
2897
2898      subs(len, len, 64);
2899      br(Assembler::GE, CRC_by64_loop);
2900      adds(len, len, 64-4);
2901      br(Assembler::GE, CRC_by4_loop);
2902      adds(len, len, 4);
2903      br(Assembler::GT, CRC_by1_loop);
2904      b(L_exit);
2905
2906    BIND(CRC_by4_loop);
2907      ldrw(tmp, Address(post(buf, 4)));
2908      subs(len, len, 4);
2909      crc32w(crc, crc, tmp);
2910      br(Assembler::GE, CRC_by4_loop);
2911      adds(len, len, 4);
2912      br(Assembler::LE, L_exit);
2913    BIND(CRC_by1_loop);
2914      ldrb(tmp, Address(post(buf, 1)));
2915      subs(len, len, 1);
2916      crc32b(crc, crc, tmp);
2917      br(Assembler::GT, CRC_by1_loop);
2918      b(L_exit);
2919
2920      align(CodeEntryAlignment);
2921    BIND(CRC_by64_loop);
2922      subs(len, len, 64);
2923      ldp(tmp, tmp3, Address(post(buf, 16)));
2924      crc32x(crc, crc, tmp);
2925      crc32x(crc, crc, tmp3);
2926      ldp(tmp, tmp3, Address(post(buf, 16)));
2927      crc32x(crc, crc, tmp);
2928      crc32x(crc, crc, tmp3);
2929      ldp(tmp, tmp3, Address(post(buf, 16)));
2930      crc32x(crc, crc, tmp);
2931      crc32x(crc, crc, tmp3);
2932      ldp(tmp, tmp3, Address(post(buf, 16)));
2933      crc32x(crc, crc, tmp);
2934      crc32x(crc, crc, tmp3);
2935      br(Assembler::GE, CRC_by64_loop);
2936      adds(len, len, 64-4);
2937      br(Assembler::GE, CRC_by4_loop);
2938      adds(len, len, 4);
2939      br(Assembler::GT, CRC_by1_loop);
2940    BIND(L_exit);
2941      ornw(crc, zr, crc);
2942      return;
2943  }
2944
2945    adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
2946    if (offset) add(table0, table0, offset);
2947    add(table1, table0, 1*256*sizeof(juint));
2948    add(table2, table0, 2*256*sizeof(juint));
2949    add(table3, table0, 3*256*sizeof(juint));
2950
2951  if (UseNeon) {
2952      cmp(len, 64);
2953      br(Assembler::LT, L_by16);
2954      eor(v16, T16B, v16, v16);
2955
2956    Label L_fold;
2957
2958      add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
2959
2960      ld1(v0, v1, T2D, post(buf, 32));
2961      ld1r(v4, T2D, post(tmp, 8));
2962      ld1r(v5, T2D, post(tmp, 8));
2963      ld1r(v6, T2D, post(tmp, 8));
2964      ld1r(v7, T2D, post(tmp, 8));
2965      mov(v16, T4S, 0, crc);
2966
2967      eor(v0, T16B, v0, v16);
2968      sub(len, len, 64);
2969
2970    BIND(L_fold);
2971      pmull(v22, T8H, v0, v5, T8B);
2972      pmull(v20, T8H, v0, v7, T8B);
2973      pmull(v23, T8H, v0, v4, T8B);
2974      pmull(v21, T8H, v0, v6, T8B);
2975
2976      pmull2(v18, T8H, v0, v5, T16B);
2977      pmull2(v16, T8H, v0, v7, T16B);
2978      pmull2(v19, T8H, v0, v4, T16B);
2979      pmull2(v17, T8H, v0, v6, T16B);
2980
2981      uzp1(v24, v20, v22, T8H);
2982      uzp2(v25, v20, v22, T8H);
2983      eor(v20, T16B, v24, v25);
2984
2985      uzp1(v26, v16, v18, T8H);
2986      uzp2(v27, v16, v18, T8H);
2987      eor(v16, T16B, v26, v27);
2988
2989      ushll2(v22, T4S, v20, T8H, 8);
2990      ushll(v20, T4S, v20, T4H, 8);
2991
2992      ushll2(v18, T4S, v16, T8H, 8);
2993      ushll(v16, T4S, v16, T4H, 8);
2994
2995      eor(v22, T16B, v23, v22);
2996      eor(v18, T16B, v19, v18);
2997      eor(v20, T16B, v21, v20);
2998      eor(v16, T16B, v17, v16);
2999
3000      uzp1(v17, v16, v20, T2D);
3001      uzp2(v21, v16, v20, T2D);
3002      eor(v17, T16B, v17, v21);
3003
3004      ushll2(v20, T2D, v17, T4S, 16);
3005      ushll(v16, T2D, v17, T2S, 16);
3006
3007      eor(v20, T16B, v20, v22);
3008      eor(v16, T16B, v16, v18);
3009
3010      uzp1(v17, v20, v16, T2D);
3011      uzp2(v21, v20, v16, T2D);
3012      eor(v28, T16B, v17, v21);
3013
3014      pmull(v22, T8H, v1, v5, T8B);
3015      pmull(v20, T8H, v1, v7, T8B);
3016      pmull(v23, T8H, v1, v4, T8B);
3017      pmull(v21, T8H, v1, v6, T8B);
3018
3019      pmull2(v18, T8H, v1, v5, T16B);
3020      pmull2(v16, T8H, v1, v7, T16B);
3021      pmull2(v19, T8H, v1, v4, T16B);
3022      pmull2(v17, T8H, v1, v6, T16B);
3023
3024      ld1(v0, v1, T2D, post(buf, 32));
3025
3026      uzp1(v24, v20, v22, T8H);
3027      uzp2(v25, v20, v22, T8H);
3028      eor(v20, T16B, v24, v25);
3029
3030      uzp1(v26, v16, v18, T8H);
3031      uzp2(v27, v16, v18, T8H);
3032      eor(v16, T16B, v26, v27);
3033
3034      ushll2(v22, T4S, v20, T8H, 8);
3035      ushll(v20, T4S, v20, T4H, 8);
3036
3037      ushll2(v18, T4S, v16, T8H, 8);
3038      ushll(v16, T4S, v16, T4H, 8);
3039
3040      eor(v22, T16B, v23, v22);
3041      eor(v18, T16B, v19, v18);
3042      eor(v20, T16B, v21, v20);
3043      eor(v16, T16B, v17, v16);
3044
3045      uzp1(v17, v16, v20, T2D);
3046      uzp2(v21, v16, v20, T2D);
3047      eor(v16, T16B, v17, v21);
3048
3049      ushll2(v20, T2D, v16, T4S, 16);
3050      ushll(v16, T2D, v16, T2S, 16);
3051
3052      eor(v20, T16B, v22, v20);
3053      eor(v16, T16B, v16, v18);
3054
3055      uzp1(v17, v20, v16, T2D);
3056      uzp2(v21, v20, v16, T2D);
3057      eor(v20, T16B, v17, v21);
3058
3059      shl(v16, T2D, v28, 1);
3060      shl(v17, T2D, v20, 1);
3061
3062      eor(v0, T16B, v0, v16);
3063      eor(v1, T16B, v1, v17);
3064
3065      subs(len, len, 32);
3066      br(Assembler::GE, L_fold);
3067
3068      mov(crc, 0);
3069      mov(tmp, v0, T1D, 0);
3070      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3071      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3072      mov(tmp, v0, T1D, 1);
3073      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3074      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3075      mov(tmp, v1, T1D, 0);
3076      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3077      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3078      mov(tmp, v1, T1D, 1);
3079      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3080      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3081
3082      add(len, len, 32);
3083  }
3084
3085  BIND(L_by16);
3086    subs(len, len, 16);
3087    br(Assembler::GE, L_by16_loop);
3088    adds(len, len, 16-4);
3089    br(Assembler::GE, L_by4_loop);
3090    adds(len, len, 4);
3091    br(Assembler::GT, L_by1_loop);
3092    b(L_exit);
3093
3094  BIND(L_by4_loop);
3095    ldrw(tmp, Address(post(buf, 4)));
3096    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
3097    subs(len, len, 4);
3098    br(Assembler::GE, L_by4_loop);
3099    adds(len, len, 4);
3100    br(Assembler::LE, L_exit);
3101  BIND(L_by1_loop);
3102    subs(len, len, 1);
3103    ldrb(tmp, Address(post(buf, 1)));
3104    update_byte_crc32(crc, tmp, table0);
3105    br(Assembler::GT, L_by1_loop);
3106    b(L_exit);
3107
3108    align(CodeEntryAlignment);
3109  BIND(L_by16_loop);
3110    subs(len, len, 16);
3111    ldp(tmp, tmp3, Address(post(buf, 16)));
3112    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3113    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3114    update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
3115    update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
3116    br(Assembler::GE, L_by16_loop);
3117    adds(len, len, 16-4);
3118    br(Assembler::GE, L_by4_loop);
3119    adds(len, len, 4);
3120    br(Assembler::GT, L_by1_loop);
3121  BIND(L_exit);
3122    ornw(crc, zr, crc);
3123}
3124
3125/**
3126 * @param crc   register containing existing CRC (32-bit)
3127 * @param buf   register pointing to input byte buffer (byte*)
3128 * @param len   register containing number of bytes
3129 * @param table register that will contain address of CRC table
3130 * @param tmp   scratch register
3131 */
3132void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
3133        Register table0, Register table1, Register table2, Register table3,
3134        Register tmp, Register tmp2, Register tmp3) {
3135  Label L_exit;
3136  Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop;
3137
3138    subs(len, len, 64);
3139    br(Assembler::GE, CRC_by64_loop);
3140    adds(len, len, 64-4);
3141    br(Assembler::GE, CRC_by4_loop);
3142    adds(len, len, 4);
3143    br(Assembler::GT, CRC_by1_loop);
3144    b(L_exit);
3145
3146  BIND(CRC_by4_loop);
3147    ldrw(tmp, Address(post(buf, 4)));
3148    subs(len, len, 4);
3149    crc32cw(crc, crc, tmp);
3150    br(Assembler::GE, CRC_by4_loop);
3151    adds(len, len, 4);
3152    br(Assembler::LE, L_exit);
3153  BIND(CRC_by1_loop);
3154    ldrb(tmp, Address(post(buf, 1)));
3155    subs(len, len, 1);
3156    crc32cb(crc, crc, tmp);
3157    br(Assembler::GT, CRC_by1_loop);
3158    b(L_exit);
3159
3160    align(CodeEntryAlignment);
3161  BIND(CRC_by64_loop);
3162    subs(len, len, 64);
3163    ldp(tmp, tmp3, Address(post(buf, 16)));
3164    crc32cx(crc, crc, tmp);
3165    crc32cx(crc, crc, tmp3);
3166    ldp(tmp, tmp3, Address(post(buf, 16)));
3167    crc32cx(crc, crc, tmp);
3168    crc32cx(crc, crc, tmp3);
3169    ldp(tmp, tmp3, Address(post(buf, 16)));
3170    crc32cx(crc, crc, tmp);
3171    crc32cx(crc, crc, tmp3);
3172    ldp(tmp, tmp3, Address(post(buf, 16)));
3173    crc32cx(crc, crc, tmp);
3174    crc32cx(crc, crc, tmp3);
3175    br(Assembler::GE, CRC_by64_loop);
3176    adds(len, len, 64-4);
3177    br(Assembler::GE, CRC_by4_loop);
3178    adds(len, len, 4);
3179    br(Assembler::GT, CRC_by1_loop);
3180  BIND(L_exit);
3181    return;
3182}
3183
3184SkipIfEqual::SkipIfEqual(
3185    MacroAssembler* masm, const bool* flag_addr, bool value) {
3186  _masm = masm;
3187  unsigned long offset;
3188  _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
3189  _masm->ldrb(rscratch1, Address(rscratch1, offset));
3190  _masm->cbzw(rscratch1, _label);
3191}
3192
3193SkipIfEqual::~SkipIfEqual() {
3194  _masm->bind(_label);
3195}
3196
3197void MacroAssembler::addptr(const Address &dst, int32_t src) {
3198  Address adr;
3199  switch(dst.getMode()) {
3200  case Address::base_plus_offset:
3201    // This is the expected mode, although we allow all the other
3202    // forms below.
3203    adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
3204    break;
3205  default:
3206    lea(rscratch2, dst);
3207    adr = Address(rscratch2);
3208    break;
3209  }
3210  ldr(rscratch1, adr);
3211  add(rscratch1, rscratch1, src);
3212  str(rscratch1, adr);
3213}
3214
3215void MacroAssembler::cmpptr(Register src1, Address src2) {
3216  unsigned long offset;
3217  adrp(rscratch1, src2, offset);
3218  ldr(rscratch1, Address(rscratch1, offset));
3219  cmp(src1, rscratch1);
3220}
3221
3222void MacroAssembler::store_check(Register obj, Address dst) {
3223  store_check(obj);
3224}
3225
3226void MacroAssembler::store_check(Register obj) {
3227  // Does a store check for the oop in register obj. The content of
3228  // register obj is destroyed afterwards.
3229
3230  BarrierSet* bs = Universe::heap()->barrier_set();
3231  assert(bs->kind() == BarrierSet::CardTableForRS ||
3232         bs->kind() == BarrierSet::CardTableExtension,
3233         "Wrong barrier set kind");
3234
3235  CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs);
3236  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
3237
3238  lsr(obj, obj, CardTableModRefBS::card_shift);
3239
3240  assert(CardTableModRefBS::dirty_card_val() == 0, "must be");
3241
3242  load_byte_map_base(rscratch1);
3243
3244  if (UseCondCardMark) {
3245    Label L_already_dirty;
3246    membar(StoreLoad);
3247    ldrb(rscratch2,  Address(obj, rscratch1));
3248    cbz(rscratch2, L_already_dirty);
3249    strb(zr, Address(obj, rscratch1));
3250    bind(L_already_dirty);
3251  } else {
3252    if (UseConcMarkSweepGC && CMSPrecleaningEnabled) {
3253      membar(StoreStore);
3254    }
3255    strb(zr, Address(obj, rscratch1));
3256  }
3257}
3258
3259void MacroAssembler::load_klass(Register dst, Register src) {
3260  if (UseCompressedClassPointers) {
3261    ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3262    decode_klass_not_null(dst);
3263  } else {
3264    ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3265  }
3266}
3267
3268void MacroAssembler::load_mirror(Register dst, Register method) {
3269  const int mirror_offset = in_bytes(Klass::java_mirror_offset());
3270  ldr(dst, Address(rmethod, Method::const_offset()));
3271  ldr(dst, Address(dst, ConstMethod::constants_offset()));
3272  ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
3273  ldr(dst, Address(dst, mirror_offset));
3274}
3275
3276void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
3277  if (UseCompressedClassPointers) {
3278    ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3279    if (Universe::narrow_klass_base() == NULL) {
3280      cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift());
3281      return;
3282    } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3283               && Universe::narrow_klass_shift() == 0) {
3284      // Only the bottom 32 bits matter
3285      cmpw(trial_klass, tmp);
3286      return;
3287    }
3288    decode_klass_not_null(tmp);
3289  } else {
3290    ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3291  }
3292  cmp(trial_klass, tmp);
3293}
3294
3295void MacroAssembler::load_prototype_header(Register dst, Register src) {
3296  load_klass(dst, src);
3297  ldr(dst, Address(dst, Klass::prototype_header_offset()));
3298}
3299
3300void MacroAssembler::store_klass(Register dst, Register src) {
3301  // FIXME: Should this be a store release?  concurrent gcs assumes
3302  // klass length is valid if klass field is not null.
3303  if (UseCompressedClassPointers) {
3304    encode_klass_not_null(src);
3305    strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3306  } else {
3307    str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3308  }
3309}
3310
3311void MacroAssembler::store_klass_gap(Register dst, Register src) {
3312  if (UseCompressedClassPointers) {
3313    // Store to klass gap in destination
3314    strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3315  }
3316}
3317
3318// Algorithm must match oop.inline.hpp encode_heap_oop.
3319void MacroAssembler::encode_heap_oop(Register d, Register s) {
3320#ifdef ASSERT
3321  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
3322#endif
3323  verify_oop(s, "broken oop in encode_heap_oop");
3324  if (Universe::narrow_oop_base() == NULL) {
3325    if (Universe::narrow_oop_shift() != 0) {
3326      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3327      lsr(d, s, LogMinObjAlignmentInBytes);
3328    } else {
3329      mov(d, s);
3330    }
3331  } else {
3332    subs(d, s, rheapbase);
3333    csel(d, d, zr, Assembler::HS);
3334    lsr(d, d, LogMinObjAlignmentInBytes);
3335
3336    /*  Old algorithm: is this any worse?
3337    Label nonnull;
3338    cbnz(r, nonnull);
3339    sub(r, r, rheapbase);
3340    bind(nonnull);
3341    lsr(r, r, LogMinObjAlignmentInBytes);
3342    */
3343  }
3344}
3345
3346void MacroAssembler::encode_heap_oop_not_null(Register r) {
3347#ifdef ASSERT
3348  verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
3349  if (CheckCompressedOops) {
3350    Label ok;
3351    cbnz(r, ok);
3352    stop("null oop passed to encode_heap_oop_not_null");
3353    bind(ok);
3354  }
3355#endif
3356  verify_oop(r, "broken oop in encode_heap_oop_not_null");
3357  if (Universe::narrow_oop_base() != NULL) {
3358    sub(r, r, rheapbase);
3359  }
3360  if (Universe::narrow_oop_shift() != 0) {
3361    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3362    lsr(r, r, LogMinObjAlignmentInBytes);
3363  }
3364}
3365
3366void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3367#ifdef ASSERT
3368  verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
3369  if (CheckCompressedOops) {
3370    Label ok;
3371    cbnz(src, ok);
3372    stop("null oop passed to encode_heap_oop_not_null2");
3373    bind(ok);
3374  }
3375#endif
3376  verify_oop(src, "broken oop in encode_heap_oop_not_null2");
3377
3378  Register data = src;
3379  if (Universe::narrow_oop_base() != NULL) {
3380    sub(dst, src, rheapbase);
3381    data = dst;
3382  }
3383  if (Universe::narrow_oop_shift() != 0) {
3384    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3385    lsr(dst, data, LogMinObjAlignmentInBytes);
3386    data = dst;
3387  }
3388  if (data == src)
3389    mov(dst, src);
3390}
3391
3392void  MacroAssembler::decode_heap_oop(Register d, Register s) {
3393#ifdef ASSERT
3394  verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
3395#endif
3396  if (Universe::narrow_oop_base() == NULL) {
3397    if (Universe::narrow_oop_shift() != 0 || d != s) {
3398      lsl(d, s, Universe::narrow_oop_shift());
3399    }
3400  } else {
3401    Label done;
3402    if (d != s)
3403      mov(d, s);
3404    cbz(s, done);
3405    add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
3406    bind(done);
3407  }
3408  verify_oop(d, "broken oop in decode_heap_oop");
3409}
3410
3411void  MacroAssembler::decode_heap_oop_not_null(Register r) {
3412  assert (UseCompressedOops, "should only be used for compressed headers");
3413  assert (Universe::heap() != NULL, "java heap should be initialized");
3414  // Cannot assert, unverified entry point counts instructions (see .ad file)
3415  // vtableStubs also counts instructions in pd_code_size_limit.
3416  // Also do not verify_oop as this is called by verify_oop.
3417  if (Universe::narrow_oop_shift() != 0) {
3418    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3419    if (Universe::narrow_oop_base() != NULL) {
3420      add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3421    } else {
3422      add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3423    }
3424  } else {
3425    assert (Universe::narrow_oop_base() == NULL, "sanity");
3426  }
3427}
3428
3429void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
3430  assert (UseCompressedOops, "should only be used for compressed headers");
3431  assert (Universe::heap() != NULL, "java heap should be initialized");
3432  // Cannot assert, unverified entry point counts instructions (see .ad file)
3433  // vtableStubs also counts instructions in pd_code_size_limit.
3434  // Also do not verify_oop as this is called by verify_oop.
3435  if (Universe::narrow_oop_shift() != 0) {
3436    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3437    if (Universe::narrow_oop_base() != NULL) {
3438      add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3439    } else {
3440      add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3441    }
3442  } else {
3443    assert (Universe::narrow_oop_base() == NULL, "sanity");
3444    if (dst != src) {
3445      mov(dst, src);
3446    }
3447  }
3448}
3449
3450void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3451  if (Universe::narrow_klass_base() == NULL) {
3452    if (Universe::narrow_klass_shift() != 0) {
3453      assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3454      lsr(dst, src, LogKlassAlignmentInBytes);
3455    } else {
3456      if (dst != src) mov(dst, src);
3457    }
3458    return;
3459  }
3460
3461  if (use_XOR_for_compressed_class_base) {
3462    if (Universe::narrow_klass_shift() != 0) {
3463      eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3464      lsr(dst, dst, LogKlassAlignmentInBytes);
3465    } else {
3466      eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3467    }
3468    return;
3469  }
3470
3471  if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3472      && Universe::narrow_klass_shift() == 0) {
3473    movw(dst, src);
3474    return;
3475  }
3476
3477#ifdef ASSERT
3478  verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
3479#endif
3480
3481  Register rbase = dst;
3482  if (dst == src) rbase = rheapbase;
3483  mov(rbase, (uint64_t)Universe::narrow_klass_base());
3484  sub(dst, src, rbase);
3485  if (Universe::narrow_klass_shift() != 0) {
3486    assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3487    lsr(dst, dst, LogKlassAlignmentInBytes);
3488  }
3489  if (dst == src) reinit_heapbase();
3490}
3491
3492void MacroAssembler::encode_klass_not_null(Register r) {
3493  encode_klass_not_null(r, r);
3494}
3495
3496void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3497  Register rbase = dst;
3498  assert (UseCompressedClassPointers, "should only be used for compressed headers");
3499
3500  if (Universe::narrow_klass_base() == NULL) {
3501    if (Universe::narrow_klass_shift() != 0) {
3502      assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3503      lsl(dst, src, LogKlassAlignmentInBytes);
3504    } else {
3505      if (dst != src) mov(dst, src);
3506    }
3507    return;
3508  }
3509
3510  if (use_XOR_for_compressed_class_base) {
3511    if (Universe::narrow_klass_shift() != 0) {
3512      lsl(dst, src, LogKlassAlignmentInBytes);
3513      eor(dst, dst, (uint64_t)Universe::narrow_klass_base());
3514    } else {
3515      eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3516    }
3517    return;
3518  }
3519
3520  if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3521      && Universe::narrow_klass_shift() == 0) {
3522    if (dst != src)
3523      movw(dst, src);
3524    movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32);
3525    return;
3526  }
3527
3528  // Cannot assert, unverified entry point counts instructions (see .ad file)
3529  // vtableStubs also counts instructions in pd_code_size_limit.
3530  // Also do not verify_oop as this is called by verify_oop.
3531  if (dst == src) rbase = rheapbase;
3532  mov(rbase, (uint64_t)Universe::narrow_klass_base());
3533  if (Universe::narrow_klass_shift() != 0) {
3534    assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3535    add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
3536  } else {
3537    add(dst, rbase, src);
3538  }
3539  if (dst == src) reinit_heapbase();
3540}
3541
3542void  MacroAssembler::decode_klass_not_null(Register r) {
3543  decode_klass_not_null(r, r);
3544}
3545
3546void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
3547  assert (UseCompressedOops, "should only be used for compressed oops");
3548  assert (Universe::heap() != NULL, "java heap should be initialized");
3549  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3550
3551  int oop_index = oop_recorder()->find_index(obj);
3552  assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
3553
3554  InstructionMark im(this);
3555  RelocationHolder rspec = oop_Relocation::spec(oop_index);
3556  code_section()->relocate(inst_mark(), rspec);
3557  movz(dst, 0xDEAD, 16);
3558  movk(dst, 0xBEEF);
3559}
3560
3561void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
3562  assert (UseCompressedClassPointers, "should only be used for compressed headers");
3563  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3564  int index = oop_recorder()->find_index(k);
3565  assert(! Universe::heap()->is_in_reserved(k), "should not be an oop");
3566
3567  InstructionMark im(this);
3568  RelocationHolder rspec = metadata_Relocation::spec(index);
3569  code_section()->relocate(inst_mark(), rspec);
3570  narrowKlass nk = Klass::encode_klass(k);
3571  movz(dst, (nk >> 16), 16);
3572  movk(dst, nk & 0xffff);
3573}
3574
3575void MacroAssembler::load_heap_oop(Register dst, Address src)
3576{
3577  if (UseCompressedOops) {
3578    ldrw(dst, src);
3579    decode_heap_oop(dst);
3580  } else {
3581    ldr(dst, src);
3582  }
3583}
3584
3585void MacroAssembler::load_heap_oop_not_null(Register dst, Address src)
3586{
3587  if (UseCompressedOops) {
3588    ldrw(dst, src);
3589    decode_heap_oop_not_null(dst);
3590  } else {
3591    ldr(dst, src);
3592  }
3593}
3594
3595void MacroAssembler::store_heap_oop(Address dst, Register src) {
3596  if (UseCompressedOops) {
3597    assert(!dst.uses(src), "not enough registers");
3598    encode_heap_oop(src);
3599    strw(src, dst);
3600  } else
3601    str(src, dst);
3602}
3603
3604// Used for storing NULLs.
3605void MacroAssembler::store_heap_oop_null(Address dst) {
3606  if (UseCompressedOops) {
3607    strw(zr, dst);
3608  } else
3609    str(zr, dst);
3610}
3611
3612#if INCLUDE_ALL_GCS
3613void MacroAssembler::g1_write_barrier_pre(Register obj,
3614                                          Register pre_val,
3615                                          Register thread,
3616                                          Register tmp,
3617                                          bool tosca_live,
3618                                          bool expand_call) {
3619  // If expand_call is true then we expand the call_VM_leaf macro
3620  // directly to skip generating the check by
3621  // InterpreterMacroAssembler::call_VM_leaf_base that checks _last_sp.
3622
3623  assert(thread == rthread, "must be");
3624
3625  Label done;
3626  Label runtime;
3627
3628  assert(pre_val != noreg, "check this code");
3629
3630  if (obj != noreg)
3631    assert_different_registers(obj, pre_val, tmp);
3632
3633  Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
3634                                       SATBMarkQueue::byte_offset_of_active()));
3635  Address index(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
3636                                       SATBMarkQueue::byte_offset_of_index()));
3637  Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
3638                                       SATBMarkQueue::byte_offset_of_buf()));
3639
3640
3641  // Is marking active?
3642  if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) {
3643    ldrw(tmp, in_progress);
3644  } else {
3645    assert(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption");
3646    ldrb(tmp, in_progress);
3647  }
3648  cbzw(tmp, done);
3649
3650  // Do we need to load the previous value?
3651  if (obj != noreg) {
3652    load_heap_oop(pre_val, Address(obj, 0));
3653  }
3654
3655  // Is the previous value null?
3656  cbz(pre_val, done);
3657
3658  // Can we store original value in the thread's buffer?
3659  // Is index == 0?
3660  // (The index field is typed as size_t.)
3661
3662  ldr(tmp, index);                      // tmp := *index_adr
3663  cbz(tmp, runtime);                    // tmp == 0?
3664                                        // If yes, goto runtime
3665
3666  sub(tmp, tmp, wordSize);              // tmp := tmp - wordSize
3667  str(tmp, index);                      // *index_adr := tmp
3668  ldr(rscratch1, buffer);
3669  add(tmp, tmp, rscratch1);             // tmp := tmp + *buffer_adr
3670
3671  // Record the previous value
3672  str(pre_val, Address(tmp, 0));
3673  b(done);
3674
3675  bind(runtime);
3676  // save the live input values
3677  push(r0->bit(tosca_live) | obj->bit(obj != noreg) | pre_val->bit(true), sp);
3678
3679  // Calling the runtime using the regular call_VM_leaf mechanism generates
3680  // code (generated by InterpreterMacroAssember::call_VM_leaf_base)
3681  // that checks that the *(rfp+frame::interpreter_frame_last_sp) == NULL.
3682  //
3683  // If we care generating the pre-barrier without a frame (e.g. in the
3684  // intrinsified Reference.get() routine) then ebp might be pointing to
3685  // the caller frame and so this check will most likely fail at runtime.
3686  //
3687  // Expanding the call directly bypasses the generation of the check.
3688  // So when we do not have have a full interpreter frame on the stack
3689  // expand_call should be passed true.
3690
3691  if (expand_call) {
3692    assert(pre_val != c_rarg1, "smashed arg");
3693    pass_arg1(this, thread);
3694    pass_arg0(this, pre_val);
3695    MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), 2);
3696  } else {
3697    call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), pre_val, thread);
3698  }
3699
3700  pop(r0->bit(tosca_live) | obj->bit(obj != noreg) | pre_val->bit(true), sp);
3701
3702  bind(done);
3703}
3704
3705void MacroAssembler::g1_write_barrier_post(Register store_addr,
3706                                           Register new_val,
3707                                           Register thread,
3708                                           Register tmp,
3709                                           Register tmp2) {
3710  assert(thread == rthread, "must be");
3711
3712  Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
3713                                       DirtyCardQueue::byte_offset_of_index()));
3714  Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
3715                                       DirtyCardQueue::byte_offset_of_buf()));
3716
3717  BarrierSet* bs = Universe::heap()->barrier_set();
3718  CardTableModRefBS* ct = (CardTableModRefBS*)bs;
3719  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
3720
3721  Label done;
3722  Label runtime;
3723
3724  // Does store cross heap regions?
3725
3726  eor(tmp, store_addr, new_val);
3727  lsr(tmp, tmp, HeapRegion::LogOfHRGrainBytes);
3728  cbz(tmp, done);
3729
3730  // crosses regions, storing NULL?
3731
3732  cbz(new_val, done);
3733
3734  // storing region crossing non-NULL, is card already dirty?
3735
3736  ExternalAddress cardtable((address) ct->byte_map_base);
3737  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
3738  const Register card_addr = tmp;
3739
3740  lsr(card_addr, store_addr, CardTableModRefBS::card_shift);
3741
3742  // get the address of the card
3743  load_byte_map_base(tmp2);
3744  add(card_addr, card_addr, tmp2);
3745  ldrb(tmp2, Address(card_addr));
3746  cmpw(tmp2, (int)G1SATBCardTableModRefBS::g1_young_card_val());
3747  br(Assembler::EQ, done);
3748
3749  assert((int)CardTableModRefBS::dirty_card_val() == 0, "must be 0");
3750
3751  membar(Assembler::StoreLoad);
3752
3753  ldrb(tmp2, Address(card_addr));
3754  cbzw(tmp2, done);
3755
3756  // storing a region crossing, non-NULL oop, card is clean.
3757  // dirty card and log.
3758
3759  strb(zr, Address(card_addr));
3760
3761  ldr(rscratch1, queue_index);
3762  cbz(rscratch1, runtime);
3763  sub(rscratch1, rscratch1, wordSize);
3764  str(rscratch1, queue_index);
3765
3766  ldr(tmp2, buffer);
3767  str(card_addr, Address(tmp2, rscratch1));
3768  b(done);
3769
3770  bind(runtime);
3771  // save the live input values
3772  push(store_addr->bit(true) | new_val->bit(true), sp);
3773  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread);
3774  pop(store_addr->bit(true) | new_val->bit(true), sp);
3775
3776  bind(done);
3777}
3778
3779#endif // INCLUDE_ALL_GCS
3780
3781Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
3782  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
3783  int index = oop_recorder()->allocate_metadata_index(obj);
3784  RelocationHolder rspec = metadata_Relocation::spec(index);
3785  return Address((address)obj, rspec);
3786}
3787
3788// Move an oop into a register.  immediate is true if we want
3789// immediate instrcutions, i.e. we are not going to patch this
3790// instruction while the code is being executed by another thread.  In
3791// that case we can use move immediates rather than the constant pool.
3792void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
3793  int oop_index;
3794  if (obj == NULL) {
3795    oop_index = oop_recorder()->allocate_oop_index(obj);
3796  } else {
3797    oop_index = oop_recorder()->find_index(obj);
3798    assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
3799  }
3800  RelocationHolder rspec = oop_Relocation::spec(oop_index);
3801  if (! immediate) {
3802    address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
3803    ldr_constant(dst, Address(dummy, rspec));
3804  } else
3805    mov(dst, Address((address)obj, rspec));
3806}
3807
3808// Move a metadata address into a register.
3809void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
3810  int oop_index;
3811  if (obj == NULL) {
3812    oop_index = oop_recorder()->allocate_metadata_index(obj);
3813  } else {
3814    oop_index = oop_recorder()->find_index(obj);
3815  }
3816  RelocationHolder rspec = metadata_Relocation::spec(oop_index);
3817  mov(dst, Address((address)obj, rspec));
3818}
3819
3820Address MacroAssembler::constant_oop_address(jobject obj) {
3821  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
3822  assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop");
3823  int oop_index = oop_recorder()->find_index(obj);
3824  return Address((address)obj, oop_Relocation::spec(oop_index));
3825}
3826
3827// Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
3828void MacroAssembler::tlab_allocate(Register obj,
3829                                   Register var_size_in_bytes,
3830                                   int con_size_in_bytes,
3831                                   Register t1,
3832                                   Register t2,
3833                                   Label& slow_case) {
3834  assert_different_registers(obj, t2);
3835  assert_different_registers(obj, var_size_in_bytes);
3836  Register end = t2;
3837
3838  // verify_tlab();
3839
3840  ldr(obj, Address(rthread, JavaThread::tlab_top_offset()));
3841  if (var_size_in_bytes == noreg) {
3842    lea(end, Address(obj, con_size_in_bytes));
3843  } else {
3844    lea(end, Address(obj, var_size_in_bytes));
3845  }
3846  ldr(rscratch1, Address(rthread, JavaThread::tlab_end_offset()));
3847  cmp(end, rscratch1);
3848  br(Assembler::HI, slow_case);
3849
3850  // update the tlab top pointer
3851  str(end, Address(rthread, JavaThread::tlab_top_offset()));
3852
3853  // recover var_size_in_bytes if necessary
3854  if (var_size_in_bytes == end) {
3855    sub(var_size_in_bytes, var_size_in_bytes, obj);
3856  }
3857  // verify_tlab();
3858}
3859
3860// Preserves r19, and r3.
3861Register MacroAssembler::tlab_refill(Label& retry,
3862                                     Label& try_eden,
3863                                     Label& slow_case) {
3864  Register top = r0;
3865  Register t1  = r2;
3866  Register t2  = r4;
3867  assert_different_registers(top, rthread, t1, t2, /* preserve: */ r19, r3);
3868  Label do_refill, discard_tlab;
3869
3870  if (!Universe::heap()->supports_inline_contig_alloc()) {
3871    // No allocation in the shared eden.
3872    b(slow_case);
3873  }
3874
3875  ldr(top, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
3876  ldr(t1,  Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
3877
3878  // calculate amount of free space
3879  sub(t1, t1, top);
3880  lsr(t1, t1, LogHeapWordSize);
3881
3882  // Retain tlab and allocate object in shared space if
3883  // the amount free in the tlab is too large to discard.
3884
3885  ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
3886  cmp(t1, rscratch1);
3887  br(Assembler::LE, discard_tlab);
3888
3889  // Retain
3890  // ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
3891  mov(t2, (int32_t) ThreadLocalAllocBuffer::refill_waste_limit_increment());
3892  add(rscratch1, rscratch1, t2);
3893  str(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
3894
3895  if (TLABStats) {
3896    // increment number of slow_allocations
3897    addmw(Address(rthread, in_bytes(JavaThread::tlab_slow_allocations_offset())),
3898         1, rscratch1);
3899  }
3900  b(try_eden);
3901
3902  bind(discard_tlab);
3903  if (TLABStats) {
3904    // increment number of refills
3905    addmw(Address(rthread, in_bytes(JavaThread::tlab_number_of_refills_offset())), 1,
3906         rscratch1);
3907    // accumulate wastage -- t1 is amount free in tlab
3908    addmw(Address(rthread, in_bytes(JavaThread::tlab_fast_refill_waste_offset())), t1,
3909         rscratch1);
3910  }
3911
3912  // if tlab is currently allocated (top or end != null) then
3913  // fill [top, end + alignment_reserve) with array object
3914  cbz(top, do_refill);
3915
3916  // set up the mark word
3917  mov(rscratch1, (intptr_t)markOopDesc::prototype()->copy_set_hash(0x2));
3918  str(rscratch1, Address(top, oopDesc::mark_offset_in_bytes()));
3919  // set the length to the remaining space
3920  sub(t1, t1, typeArrayOopDesc::header_size(T_INT));
3921  add(t1, t1, (int32_t)ThreadLocalAllocBuffer::alignment_reserve());
3922  lsl(t1, t1, log2_intptr(HeapWordSize/sizeof(jint)));
3923  strw(t1, Address(top, arrayOopDesc::length_offset_in_bytes()));
3924  // set klass to intArrayKlass
3925  {
3926    unsigned long offset;
3927    // dubious reloc why not an oop reloc?
3928    adrp(rscratch1, ExternalAddress((address)Universe::intArrayKlassObj_addr()),
3929         offset);
3930    ldr(t1, Address(rscratch1, offset));
3931  }
3932  // store klass last.  concurrent gcs assumes klass length is valid if
3933  // klass field is not null.
3934  store_klass(top, t1);
3935
3936  mov(t1, top);
3937  ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
3938  sub(t1, t1, rscratch1);
3939  incr_allocated_bytes(rthread, t1, 0, rscratch1);
3940
3941  // refill the tlab with an eden allocation
3942  bind(do_refill);
3943  ldr(t1, Address(rthread, in_bytes(JavaThread::tlab_size_offset())));
3944  lsl(t1, t1, LogHeapWordSize);
3945  // allocate new tlab, address returned in top
3946  eden_allocate(top, t1, 0, t2, slow_case);
3947
3948  // Check that t1 was preserved in eden_allocate.
3949#ifdef ASSERT
3950  if (UseTLAB) {
3951    Label ok;
3952    Register tsize = r4;
3953    assert_different_registers(tsize, rthread, t1);
3954    str(tsize, Address(pre(sp, -16)));
3955    ldr(tsize, Address(rthread, in_bytes(JavaThread::tlab_size_offset())));
3956    lsl(tsize, tsize, LogHeapWordSize);
3957    cmp(t1, tsize);
3958    br(Assembler::EQ, ok);
3959    STOP("assert(t1 != tlab size)");
3960    should_not_reach_here();
3961
3962    bind(ok);
3963    ldr(tsize, Address(post(sp, 16)));
3964  }
3965#endif
3966  str(top, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
3967  str(top, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
3968  add(top, top, t1);
3969  sub(top, top, (int32_t)ThreadLocalAllocBuffer::alignment_reserve_in_bytes());
3970  str(top, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
3971
3972  if (ZeroTLAB) {
3973    // This is a fast TLAB refill, therefore the GC is not notified of it.
3974    // So compiled code must fill the new TLAB with zeroes.
3975    ldr(top, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
3976    zero_memory(top,t1,t2);
3977  }
3978
3979  verify_tlab();
3980  b(retry);
3981
3982  return rthread; // for use by caller
3983}
3984
3985// Zero words; len is in bytes
3986// Destroys all registers except addr
3987// len must be a nonzero multiple of wordSize
3988void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
3989  assert_different_registers(addr, len, t1, rscratch1, rscratch2);
3990
3991#ifdef ASSERT
3992  { Label L;
3993    tst(len, BytesPerWord - 1);
3994    br(Assembler::EQ, L);
3995    stop("len is not a multiple of BytesPerWord");
3996    bind(L);
3997  }
3998#endif
3999
4000#ifndef PRODUCT
4001  block_comment("zero memory");
4002#endif
4003
4004  Label loop;
4005  Label entry;
4006
4007//  Algorithm:
4008//
4009//    scratch1 = cnt & 7;
4010//    cnt -= scratch1;
4011//    p += scratch1;
4012//    switch (scratch1) {
4013//      do {
4014//        cnt -= 8;
4015//          p[-8] = 0;
4016//        case 7:
4017//          p[-7] = 0;
4018//        case 6:
4019//          p[-6] = 0;
4020//          // ...
4021//        case 1:
4022//          p[-1] = 0;
4023//        case 0:
4024//          p += 8;
4025//      } while (cnt);
4026//    }
4027
4028  const int unroll = 8; // Number of str(zr) instructions we'll unroll
4029
4030  lsr(len, len, LogBytesPerWord);
4031  andr(rscratch1, len, unroll - 1);  // tmp1 = cnt % unroll
4032  sub(len, len, rscratch1);      // cnt -= unroll
4033  // t1 always points to the end of the region we're about to zero
4034  add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
4035  adr(rscratch2, entry);
4036  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
4037  br(rscratch2);
4038  bind(loop);
4039  sub(len, len, unroll);
4040  for (int i = -unroll; i < 0; i++)
4041    str(zr, Address(t1, i * wordSize));
4042  bind(entry);
4043  add(t1, t1, unroll * wordSize);
4044  cbnz(len, loop);
4045}
4046
4047// Defines obj, preserves var_size_in_bytes
4048void MacroAssembler::eden_allocate(Register obj,
4049                                   Register var_size_in_bytes,
4050                                   int con_size_in_bytes,
4051                                   Register t1,
4052                                   Label& slow_case) {
4053  assert_different_registers(obj, var_size_in_bytes, t1);
4054  if (!Universe::heap()->supports_inline_contig_alloc()) {
4055    b(slow_case);
4056  } else {
4057    Register end = t1;
4058    Register heap_end = rscratch2;
4059    Label retry;
4060    bind(retry);
4061    {
4062      unsigned long offset;
4063      adrp(rscratch1, ExternalAddress((address) Universe::heap()->end_addr()), offset);
4064      ldr(heap_end, Address(rscratch1, offset));
4065    }
4066
4067    ExternalAddress heap_top((address) Universe::heap()->top_addr());
4068
4069    // Get the current top of the heap
4070    {
4071      unsigned long offset;
4072      adrp(rscratch1, heap_top, offset);
4073      // Use add() here after ARDP, rather than lea().
4074      // lea() does not generate anything if its offset is zero.
4075      // However, relocs expect to find either an ADD or a load/store
4076      // insn after an ADRP.  add() always generates an ADD insn, even
4077      // for add(Rn, Rn, 0).
4078      add(rscratch1, rscratch1, offset);
4079      ldaxr(obj, rscratch1);
4080    }
4081
4082    // Adjust it my the size of our new object
4083    if (var_size_in_bytes == noreg) {
4084      lea(end, Address(obj, con_size_in_bytes));
4085    } else {
4086      lea(end, Address(obj, var_size_in_bytes));
4087    }
4088
4089    // if end < obj then we wrapped around high memory
4090    cmp(end, obj);
4091    br(Assembler::LO, slow_case);
4092
4093    cmp(end, heap_end);
4094    br(Assembler::HI, slow_case);
4095
4096    // If heap_top hasn't been changed by some other thread, update it.
4097    stlxr(rscratch2, end, rscratch1);
4098    cbnzw(rscratch2, retry);
4099  }
4100}
4101
4102void MacroAssembler::verify_tlab() {
4103#ifdef ASSERT
4104  if (UseTLAB && VerifyOops) {
4105    Label next, ok;
4106
4107    stp(rscratch2, rscratch1, Address(pre(sp, -16)));
4108
4109    ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4110    ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4111    cmp(rscratch2, rscratch1);
4112    br(Assembler::HS, next);
4113    STOP("assert(top >= start)");
4114    should_not_reach_here();
4115
4116    bind(next);
4117    ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
4118    ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4119    cmp(rscratch2, rscratch1);
4120    br(Assembler::HS, ok);
4121    STOP("assert(top <= end)");
4122    should_not_reach_here();
4123
4124    bind(ok);
4125    ldp(rscratch2, rscratch1, Address(post(sp, 16)));
4126  }
4127#endif
4128}
4129
4130// Writes to stack successive pages until offset reached to check for
4131// stack overflow + shadow pages.  This clobbers tmp.
4132void MacroAssembler::bang_stack_size(Register size, Register tmp) {
4133  assert_different_registers(tmp, size, rscratch1);
4134  mov(tmp, sp);
4135  // Bang stack for total size given plus shadow page size.
4136  // Bang one page at a time because large size can bang beyond yellow and
4137  // red zones.
4138  Label loop;
4139  mov(rscratch1, os::vm_page_size());
4140  bind(loop);
4141  lea(tmp, Address(tmp, -os::vm_page_size()));
4142  subsw(size, size, rscratch1);
4143  str(size, Address(tmp));
4144  br(Assembler::GT, loop);
4145
4146  // Bang down shadow pages too.
4147  // At this point, (tmp-0) is the last address touched, so don't
4148  // touch it again.  (It was touched as (tmp-pagesize) but then tmp
4149  // was post-decremented.)  Skip this address by starting at i=1, and
4150  // touch a few more pages below.  N.B.  It is important to touch all
4151  // the way down to and including i=StackShadowPages.
4152  for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
4153    // this could be any sized move but this is can be a debugging crumb
4154    // so the bigger the better.
4155    lea(tmp, Address(tmp, -os::vm_page_size()));
4156    str(size, Address(tmp));
4157  }
4158}
4159
4160
4161address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
4162  unsigned long off;
4163  adrp(r, Address(page, rtype), off);
4164  InstructionMark im(this);
4165  code_section()->relocate(inst_mark(), rtype);
4166  ldrw(zr, Address(r, off));
4167  return inst_mark();
4168}
4169
4170address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
4171  InstructionMark im(this);
4172  code_section()->relocate(inst_mark(), rtype);
4173  ldrw(zr, Address(r, 0));
4174  return inst_mark();
4175}
4176
4177void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
4178  relocInfo::relocType rtype = dest.rspec().reloc()->type();
4179  unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
4180  unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
4181  unsigned long dest_page = (unsigned long)dest.target() >> 12;
4182  long offset_low = dest_page - low_page;
4183  long offset_high = dest_page - high_page;
4184
4185  assert(is_valid_AArch64_address(dest.target()), "bad address");
4186  assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");
4187
4188  InstructionMark im(this);
4189  code_section()->relocate(inst_mark(), dest.rspec());
4190  // 8143067: Ensure that the adrp can reach the dest from anywhere within
4191  // the code cache so that if it is relocated we know it will still reach
4192  if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
4193    _adrp(reg1, dest.target());
4194  } else {
4195    unsigned long target = (unsigned long)dest.target();
4196    unsigned long adrp_target
4197      = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);
4198
4199    _adrp(reg1, (address)adrp_target);
4200    movk(reg1, target >> 32, 32);
4201  }
4202  byte_offset = (unsigned long)dest.target() & 0xfff;
4203}
4204
4205void MacroAssembler::load_byte_map_base(Register reg) {
4206  jbyte *byte_map_base =
4207    ((CardTableModRefBS*)(Universe::heap()->barrier_set()))->byte_map_base;
4208
4209  if (is_valid_AArch64_address((address)byte_map_base)) {
4210    // Strictly speaking the byte_map_base isn't an address at all,
4211    // and it might even be negative.
4212    unsigned long offset;
4213    adrp(reg, ExternalAddress((address)byte_map_base), offset);
4214    // We expect offset to be zero with most collectors.
4215    if (offset != 0) {
4216      add(reg, reg, offset);
4217    }
4218  } else {
4219    mov(reg, (uint64_t)byte_map_base);
4220  }
4221}
4222
4223void MacroAssembler::build_frame(int framesize) {
4224  assert(framesize > 0, "framesize must be > 0");
4225  if (framesize < ((1 << 9) + 2 * wordSize)) {
4226    sub(sp, sp, framesize);
4227    stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4228    if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
4229  } else {
4230    stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
4231    if (PreserveFramePointer) mov(rfp, sp);
4232    if (framesize < ((1 << 12) + 2 * wordSize))
4233      sub(sp, sp, framesize - 2 * wordSize);
4234    else {
4235      mov(rscratch1, framesize - 2 * wordSize);
4236      sub(sp, sp, rscratch1);
4237    }
4238  }
4239}
4240
4241void MacroAssembler::remove_frame(int framesize) {
4242  assert(framesize > 0, "framesize must be > 0");
4243  if (framesize < ((1 << 9) + 2 * wordSize)) {
4244    ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4245    add(sp, sp, framesize);
4246  } else {
4247    if (framesize < ((1 << 12) + 2 * wordSize))
4248      add(sp, sp, framesize - 2 * wordSize);
4249    else {
4250      mov(rscratch1, framesize - 2 * wordSize);
4251      add(sp, sp, rscratch1);
4252    }
4253    ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
4254  }
4255}
4256
4257typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4258
4259// Search for str1 in str2 and return index or -1
4260void MacroAssembler::string_indexof(Register str2, Register str1,
4261                                    Register cnt2, Register cnt1,
4262                                    Register tmp1, Register tmp2,
4263                                    Register tmp3, Register tmp4,
4264                                    int icnt1, Register result, int ae) {
4265  Label BM, LINEARSEARCH, DONE, NOMATCH, MATCH;
4266
4267  Register ch1 = rscratch1;
4268  Register ch2 = rscratch2;
4269  Register cnt1tmp = tmp1;
4270  Register cnt2tmp = tmp2;
4271  Register cnt1_neg = cnt1;
4272  Register cnt2_neg = cnt2;
4273  Register result_tmp = tmp4;
4274
4275  bool isL = ae == StrIntrinsicNode::LL;
4276
4277  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
4278  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
4279  int str1_chr_shift = str1_isL ? 0:1;
4280  int str2_chr_shift = str2_isL ? 0:1;
4281  int str1_chr_size = str1_isL ? 1:2;
4282  int str2_chr_size = str2_isL ? 1:2;
4283  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4284                                      (chr_insn)&MacroAssembler::ldrh;
4285  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4286                                      (chr_insn)&MacroAssembler::ldrh;
4287  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
4288  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
4289
4290  // Note, inline_string_indexOf() generates checks:
4291  // if (substr.count > string.count) return -1;
4292  // if (substr.count == 0) return 0;
4293
4294// We have two strings, a source string in str2, cnt2 and a pattern string
4295// in str1, cnt1. Find the 1st occurence of pattern in source or return -1.
4296
4297// For larger pattern and source we use a simplified Boyer Moore algorithm.
4298// With a small pattern and source we use linear scan.
4299
4300  if (icnt1 == -1) {
4301    cmp(cnt1, 256);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
4302    ccmp(cnt1, 8, 0b0000, LO);  // Can't handle skip >= 256 because we use
4303    br(LO, LINEARSEARCH);       // a byte array.
4304    cmp(cnt1, cnt2, LSR, 2);    // Source must be 4 * pattern for BM
4305    br(HS, LINEARSEARCH);
4306  }
4307
4308// The Boyer Moore alogorithm is based on the description here:-
4309//
4310// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
4311//
4312// This describes and algorithm with 2 shift rules. The 'Bad Character' rule
4313// and the 'Good Suffix' rule.
4314//
4315// These rules are essentially heuristics for how far we can shift the
4316// pattern along the search string.
4317//
4318// The implementation here uses the 'Bad Character' rule only because of the
4319// complexity of initialisation for the 'Good Suffix' rule.
4320//
4321// This is also known as the Boyer-Moore-Horspool algorithm:-
4322//
4323// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
4324//
4325// #define ASIZE 128
4326//
4327//    int bm(unsigned char *x, int m, unsigned char *y, int n) {
4328//       int i, j;
4329//       unsigned c;
4330//       unsigned char bc[ASIZE];
4331//
4332//       /* Preprocessing */
4333//       for (i = 0; i < ASIZE; ++i)
4334//          bc[i] = 0;
4335//       for (i = 0; i < m - 1; ) {
4336//          c = x[i];
4337//          ++i;
4338//          if (c < ASIZE) bc[c] = i;
4339//       }
4340//
4341//       /* Searching */
4342//       j = 0;
4343//       while (j <= n - m) {
4344//          c = y[i+j];
4345//          if (x[m-1] == c)
4346//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
4347//          if (i < 0) return j;
4348//          if (c < ASIZE)
4349//            j = j - bc[y[j+m-1]] + m;
4350//          else
4351//            j += 1; // Advance by 1 only if char >= ASIZE
4352//       }
4353//    }
4354
4355  if (icnt1 == -1) {
4356    BIND(BM);
4357
4358    Label ZLOOP, BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP;
4359    Label BMADV, BMMATCH, BMCHECKEND;
4360
4361    Register cnt1end = tmp2;
4362    Register str2end = cnt2;
4363    Register skipch = tmp2;
4364
4365    // Restrict ASIZE to 128 to reduce stack space/initialisation.
4366    // The presence of chars >= ASIZE in the target string does not affect
4367    // performance, but we must be careful not to initialise them in the stack
4368    // array.
4369    // The presence of chars >= ASIZE in the source string may adversely affect
4370    // performance since we can only advance by one when we encounter one.
4371
4372      stp(zr, zr, pre(sp, -128));
4373      for (int i = 1; i < 8; i++)
4374          stp(zr, zr, Address(sp, i*16));
4375
4376      mov(cnt1tmp, 0);
4377      sub(cnt1end, cnt1, 1);
4378    BIND(BCLOOP);
4379      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
4380      cmp(ch1, 128);
4381      add(cnt1tmp, cnt1tmp, 1);
4382      br(HS, BCSKIP);
4383      strb(cnt1tmp, Address(sp, ch1));
4384    BIND(BCSKIP);
4385      cmp(cnt1tmp, cnt1end);
4386      br(LT, BCLOOP);
4387
4388      mov(result_tmp, str2);
4389
4390      sub(cnt2, cnt2, cnt1);
4391      add(str2end, str2, cnt2, LSL, str2_chr_shift);
4392    BIND(BMLOOPSTR2);
4393      sub(cnt1tmp, cnt1, 1);
4394      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
4395      (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4396      cmp(ch1, skipch);
4397      br(NE, BMSKIP);
4398      subs(cnt1tmp, cnt1tmp, 1);
4399      br(LT, BMMATCH);
4400    BIND(BMLOOPSTR1);
4401      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
4402      (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4403      cmp(ch1, ch2);
4404      br(NE, BMSKIP);
4405      subs(cnt1tmp, cnt1tmp, 1);
4406      br(GE, BMLOOPSTR1);
4407    BIND(BMMATCH);
4408      sub(result, str2, result_tmp);
4409      if (!str2_isL) lsr(result, result, 1);
4410      add(sp, sp, 128);
4411      b(DONE);
4412    BIND(BMADV);
4413      add(str2, str2, str2_chr_size);
4414      b(BMCHECKEND);
4415    BIND(BMSKIP);
4416      cmp(skipch, 128);
4417      br(HS, BMADV);
4418      ldrb(ch2, Address(sp, skipch));
4419      add(str2, str2, cnt1, LSL, str2_chr_shift);
4420      sub(str2, str2, ch2, LSL, str2_chr_shift);
4421    BIND(BMCHECKEND);
4422      cmp(str2, str2end);
4423      br(LE, BMLOOPSTR2);
4424      add(sp, sp, 128);
4425      b(NOMATCH);
4426  }
4427
4428  BIND(LINEARSEARCH);
4429  {
4430    Label DO1, DO2, DO3;
4431
4432    Register str2tmp = tmp2;
4433    Register first = tmp3;
4434
4435    if (icnt1 == -1)
4436    {
4437        Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
4438
4439        cmp(cnt1, str1_isL == str2_isL ? 4 : 2);
4440        br(LT, DOSHORT);
4441
4442        sub(cnt2, cnt2, cnt1);
4443        mov(result_tmp, cnt2);
4444
4445        lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
4446        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
4447        sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
4448        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
4449        (this->*str1_load_1chr)(first, Address(str1, cnt1_neg));
4450
4451      BIND(FIRST_LOOP);
4452        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4453        cmp(first, ch2);
4454        br(EQ, STR1_LOOP);
4455      BIND(STR2_NEXT);
4456        adds(cnt2_neg, cnt2_neg, str2_chr_size);
4457        br(LE, FIRST_LOOP);
4458        b(NOMATCH);
4459
4460      BIND(STR1_LOOP);
4461        adds(cnt1tmp, cnt1_neg, str1_chr_size);
4462        add(cnt2tmp, cnt2_neg, str2_chr_size);
4463        br(GE, MATCH);
4464
4465      BIND(STR1_NEXT);
4466        (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
4467        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4468        cmp(ch1, ch2);
4469        br(NE, STR2_NEXT);
4470        adds(cnt1tmp, cnt1tmp, str1_chr_size);
4471        add(cnt2tmp, cnt2tmp, str2_chr_size);
4472        br(LT, STR1_NEXT);
4473        b(MATCH);
4474
4475      BIND(DOSHORT);
4476      if (str1_isL == str2_isL) {
4477        cmp(cnt1, 2);
4478        br(LT, DO1);
4479        br(GT, DO3);
4480      }
4481    }
4482
4483    if (icnt1 == 4) {
4484      Label CH1_LOOP;
4485
4486        (this->*load_4chr)(ch1, str1);
4487        sub(cnt2, cnt2, 4);
4488        mov(result_tmp, cnt2);
4489        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
4490        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
4491
4492      BIND(CH1_LOOP);
4493        (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
4494        cmp(ch1, ch2);
4495        br(EQ, MATCH);
4496        adds(cnt2_neg, cnt2_neg, str2_chr_size);
4497        br(LE, CH1_LOOP);
4498        b(NOMATCH);
4499    }
4500
4501    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
4502      Label CH1_LOOP;
4503
4504      BIND(DO2);
4505        (this->*load_2chr)(ch1, str1);
4506        sub(cnt2, cnt2, 2);
4507        mov(result_tmp, cnt2);
4508        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
4509        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
4510
4511      BIND(CH1_LOOP);
4512        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4513        cmp(ch1, ch2);
4514        br(EQ, MATCH);
4515        adds(cnt2_neg, cnt2_neg, str2_chr_size);
4516        br(LE, CH1_LOOP);
4517        b(NOMATCH);
4518    }
4519
4520    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
4521      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
4522
4523      BIND(DO3);
4524        (this->*load_2chr)(first, str1);
4525        (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
4526
4527        sub(cnt2, cnt2, 3);
4528        mov(result_tmp, cnt2);
4529        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
4530        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
4531
4532      BIND(FIRST_LOOP);
4533        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4534        cmpw(first, ch2);
4535        br(EQ, STR1_LOOP);
4536      BIND(STR2_NEXT);
4537        adds(cnt2_neg, cnt2_neg, str2_chr_size);
4538        br(LE, FIRST_LOOP);
4539        b(NOMATCH);
4540
4541      BIND(STR1_LOOP);
4542        add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
4543        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4544        cmp(ch1, ch2);
4545        br(NE, STR2_NEXT);
4546        b(MATCH);
4547    }
4548
4549    if (icnt1 == -1 || icnt1 == 1) {
4550      Label CH1_LOOP, HAS_ZERO;
4551      Label DO1_SHORT, DO1_LOOP;
4552
4553      BIND(DO1);
4554        (this->*str1_load_1chr)(ch1, str1);
4555        cmp(cnt2, 8);
4556        br(LT, DO1_SHORT);
4557
4558        if (str2_isL) {
4559          if (!str1_isL) {
4560            tst(ch1, 0xff00);
4561            br(NE, NOMATCH);
4562          }
4563          orr(ch1, ch1, ch1, LSL, 8);
4564        }
4565        orr(ch1, ch1, ch1, LSL, 16);
4566        orr(ch1, ch1, ch1, LSL, 32);
4567
4568        sub(cnt2, cnt2, 8/str2_chr_size);
4569        mov(result_tmp, cnt2);
4570        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
4571        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
4572
4573        mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4574      BIND(CH1_LOOP);
4575        ldr(ch2, Address(str2, cnt2_neg));
4576        eor(ch2, ch1, ch2);
4577        sub(tmp1, ch2, tmp3);
4578        orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4579        bics(tmp1, tmp1, tmp2);
4580        br(NE, HAS_ZERO);
4581        adds(cnt2_neg, cnt2_neg, 8);
4582        br(LT, CH1_LOOP);
4583
4584        cmp(cnt2_neg, 8);
4585        mov(cnt2_neg, 0);
4586        br(LT, CH1_LOOP);
4587        b(NOMATCH);
4588
4589      BIND(HAS_ZERO);
4590        rev(tmp1, tmp1);
4591        clz(tmp1, tmp1);
4592        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
4593        b(MATCH);
4594
4595      BIND(DO1_SHORT);
4596        mov(result_tmp, cnt2);
4597        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
4598        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
4599      BIND(DO1_LOOP);
4600        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4601        cmpw(ch1, ch2);
4602        br(EQ, MATCH);
4603        adds(cnt2_neg, cnt2_neg, str2_chr_size);
4604        br(LT, DO1_LOOP);
4605    }
4606  }
4607  BIND(NOMATCH);
4608    mov(result, -1);
4609    b(DONE);
4610  BIND(MATCH);
4611    add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
4612  BIND(DONE);
4613}
4614
4615typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4616typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
4617
4618void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
4619                                         Register ch, Register result,
4620                                         Register tmp1, Register tmp2, Register tmp3)
4621{
4622  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
4623  Register cnt1_neg = cnt1;
4624  Register ch1 = rscratch1;
4625  Register result_tmp = rscratch2;
4626
4627  cmp(cnt1, 4);
4628  br(LT, DO1_SHORT);
4629
4630  orr(ch, ch, ch, LSL, 16);
4631  orr(ch, ch, ch, LSL, 32);
4632
4633  sub(cnt1, cnt1, 4);
4634  mov(result_tmp, cnt1);
4635  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4636  sub(cnt1_neg, zr, cnt1, LSL, 1);
4637
4638  mov(tmp3, 0x0001000100010001);
4639
4640  BIND(CH1_LOOP);
4641    ldr(ch1, Address(str1, cnt1_neg));
4642    eor(ch1, ch, ch1);
4643    sub(tmp1, ch1, tmp3);
4644    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
4645    bics(tmp1, tmp1, tmp2);
4646    br(NE, HAS_ZERO);
4647    adds(cnt1_neg, cnt1_neg, 8);
4648    br(LT, CH1_LOOP);
4649
4650    cmp(cnt1_neg, 8);
4651    mov(cnt1_neg, 0);
4652    br(LT, CH1_LOOP);
4653    b(NOMATCH);
4654
4655  BIND(HAS_ZERO);
4656    rev(tmp1, tmp1);
4657    clz(tmp1, tmp1);
4658    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
4659    b(MATCH);
4660
4661  BIND(DO1_SHORT);
4662    mov(result_tmp, cnt1);
4663    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4664    sub(cnt1_neg, zr, cnt1, LSL, 1);
4665  BIND(DO1_LOOP);
4666    ldrh(ch1, Address(str1, cnt1_neg));
4667    cmpw(ch, ch1);
4668    br(EQ, MATCH);
4669    adds(cnt1_neg, cnt1_neg, 2);
4670    br(LT, DO1_LOOP);
4671  BIND(NOMATCH);
4672    mov(result, -1);
4673    b(DONE);
4674  BIND(MATCH);
4675    add(result, result_tmp, cnt1_neg, ASR, 1);
4676  BIND(DONE);
4677}
4678
4679// Compare strings.
4680void MacroAssembler::string_compare(Register str1, Register str2,
4681                                    Register cnt1, Register cnt2, Register result,
4682                                    Register tmp1,
4683                                    FloatRegister vtmp, FloatRegister vtmpZ, int ae) {
4684  Label LENGTH_DIFF, DONE, SHORT_LOOP, SHORT_STRING,
4685    NEXT_WORD, DIFFERENCE;
4686
4687  bool isLL = ae == StrIntrinsicNode::LL;
4688  bool isLU = ae == StrIntrinsicNode::LU;
4689  bool isUL = ae == StrIntrinsicNode::UL;
4690
4691  bool str1_isL = isLL || isLU;
4692  bool str2_isL = isLL || isUL;
4693
4694  int str1_chr_shift = str1_isL ? 0 : 1;
4695  int str2_chr_shift = str2_isL ? 0 : 1;
4696  int str1_chr_size = str1_isL ? 1 : 2;
4697  int str2_chr_size = str2_isL ? 1 : 2;
4698
4699  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4700                                      (chr_insn)&MacroAssembler::ldrh;
4701  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4702                                      (chr_insn)&MacroAssembler::ldrh;
4703  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
4704                            (uxt_insn)&MacroAssembler::uxthw;
4705
4706  BLOCK_COMMENT("string_compare {");
4707
4708  // Bizzarely, the counts are passed in bytes, regardless of whether they
4709  // are L or U strings, however the result is always in characters.
4710  if (!str1_isL) asrw(cnt1, cnt1, 1);
4711  if (!str2_isL) asrw(cnt2, cnt2, 1);
4712
4713  // Compute the minimum of the string lengths and save the difference.
4714  subsw(tmp1, cnt1, cnt2);
4715  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
4716
4717  // A very short string
4718  cmpw(cnt2, isLL ? 8:4);
4719  br(Assembler::LT, SHORT_STRING);
4720
4721  // Check if the strings start at the same location.
4722  cmp(str1, str2);
4723  br(Assembler::EQ, LENGTH_DIFF);
4724
4725  // Compare longwords
4726  {
4727    subw(cnt2, cnt2, isLL ? 8:4); // The last longword is a special case
4728
4729    // Move both string pointers to the last longword of their
4730    // strings, negate the remaining count, and convert it to bytes.
4731    lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4732    lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4733    if (isLU || isUL) {
4734      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4735      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4736    }
4737    sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4738
4739    // Loop, loading longwords and comparing them into rscratch2.
4740    bind(NEXT_WORD);
4741    if (isLU) {
4742      ldrs(vtmp, Address(str1, cnt1));
4743      zip1(vtmp, T8B, vtmp, vtmpZ);
4744      umov(result, vtmp, D, 0);
4745    } else {
4746      ldr(result, Address(str1, isUL ? cnt1:cnt2));
4747    }
4748    if (isUL) {
4749      ldrs(vtmp, Address(str2, cnt2));
4750      zip1(vtmp, T8B, vtmp, vtmpZ);
4751      umov(rscratch1, vtmp, D, 0);
4752    } else {
4753      ldr(rscratch1, Address(str2, cnt2));
4754    }
4755    adds(cnt2, cnt2, isUL ? 4:8);
4756    if (isLU || isUL) add(cnt1, cnt1, isLU ? 4:8);
4757    eor(rscratch2, result, rscratch1);
4758    cbnz(rscratch2, DIFFERENCE);
4759    br(Assembler::LT, NEXT_WORD);
4760
4761    // Last longword.  In the case where length == 4 we compare the
4762    // same longword twice, but that's still faster than another
4763    // conditional branch.
4764
4765    if (isLU) {
4766      ldrs(vtmp, Address(str1));
4767      zip1(vtmp, T8B, vtmp, vtmpZ);
4768      umov(result, vtmp, D, 0);
4769    } else {
4770      ldr(result, Address(str1));
4771    }
4772    if (isUL) {
4773      ldrs(vtmp, Address(str2));
4774      zip1(vtmp, T8B, vtmp, vtmpZ);
4775      umov(rscratch1, vtmp, D, 0);
4776    } else {
4777      ldr(rscratch1, Address(str2));
4778    }
4779    eor(rscratch2, result, rscratch1);
4780    cbz(rscratch2, LENGTH_DIFF);
4781
4782    // Find the first different characters in the longwords and
4783    // compute their difference.
4784    bind(DIFFERENCE);
4785    rev(rscratch2, rscratch2);
4786    clz(rscratch2, rscratch2);
4787    andr(rscratch2, rscratch2, isLL ? -8 : -16);
4788    lsrv(result, result, rscratch2);
4789    (this->*ext_chr)(result, result);
4790    lsrv(rscratch1, rscratch1, rscratch2);
4791    (this->*ext_chr)(rscratch1, rscratch1);
4792    subw(result, result, rscratch1);
4793    b(DONE);
4794  }
4795
4796  bind(SHORT_STRING);
4797  // Is the minimum length zero?
4798  cbz(cnt2, LENGTH_DIFF);
4799
4800  bind(SHORT_LOOP);
4801  (this->*str1_load_chr)(result, Address(post(str1, str1_chr_size)));
4802  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
4803  subw(result, result, cnt1);
4804  cbnz(result, DONE);
4805  sub(cnt2, cnt2, 1);
4806  cbnz(cnt2, SHORT_LOOP);
4807
4808  // Strings are equal up to min length.  Return the length difference.
4809  bind(LENGTH_DIFF);
4810  mov(result, tmp1);
4811
4812  // That's it
4813  bind(DONE);
4814
4815  BLOCK_COMMENT("} string_compare");
4816}
4817
4818// Compare Strings or char/byte arrays.
4819
4820// is_string is true iff this is a string comparison.
4821
4822// For Strings we're passed the address of the first characters in a1
4823// and a2 and the length in cnt1.
4824
4825// For byte and char arrays we're passed the arrays themselves and we
4826// have to extract length fields and do null checks here.
4827
4828// elem_size is the element size in bytes: either 1 or 2.
4829
4830// There are two implementations.  For arrays >= 8 bytes, all
4831// comparisons (including the final one, which may overlap) are
4832// performed 8 bytes at a time.  For arrays < 8 bytes, we compare a
4833// halfword, then a short, and then a byte.
4834
4835void MacroAssembler::arrays_equals(Register a1, Register a2,
4836                                   Register result, Register cnt1,
4837                                   int elem_size, bool is_string)
4838{
4839  Label SAME, DONE, SHORT, NEXT_WORD, ONE;
4840  Register tmp1 = rscratch1;
4841  Register tmp2 = rscratch2;
4842  Register cnt2 = tmp2;  // cnt2 only used in array length compare
4843  int elem_per_word = wordSize/elem_size;
4844  int log_elem_size = exact_log2(elem_size);
4845  int length_offset = arrayOopDesc::length_offset_in_bytes();
4846  int base_offset
4847    = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
4848
4849  assert(elem_size == 1 || elem_size == 2, "must be char or byte");
4850  assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
4851
4852#ifndef PRODUCT
4853  {
4854    const char kind = (elem_size == 2) ? 'U' : 'L';
4855    char comment[64];
4856    snprintf(comment, sizeof comment, "%s%c%s {",
4857             is_string ? "string_equals" : "array_equals",
4858             kind, "{");
4859    BLOCK_COMMENT(comment);
4860  }
4861#endif
4862
4863  mov(result, false);
4864
4865  if (!is_string) {
4866    // if (a==a2)
4867    //     return true;
4868    eor(rscratch1, a1, a2);
4869    cbz(rscratch1, SAME);
4870    // if (a==null || a2==null)
4871    //     return false;
4872    cbz(a1, DONE);
4873    cbz(a2, DONE);
4874    // if (a1.length != a2.length)
4875    //      return false;
4876    ldrw(cnt1, Address(a1, length_offset));
4877    ldrw(cnt2, Address(a2, length_offset));
4878    eorw(tmp1, cnt1, cnt2);
4879    cbnzw(tmp1, DONE);
4880
4881    lea(a1, Address(a1, base_offset));
4882    lea(a2, Address(a2, base_offset));
4883  }
4884
4885  // Check for short strings, i.e. smaller than wordSize.
4886  subs(cnt1, cnt1, elem_per_word);
4887  br(Assembler::LT, SHORT);
4888  // Main 8 byte comparison loop.
4889  bind(NEXT_WORD); {
4890    ldr(tmp1, Address(post(a1, wordSize)));
4891    ldr(tmp2, Address(post(a2, wordSize)));
4892    subs(cnt1, cnt1, elem_per_word);
4893    eor(tmp1, tmp1, tmp2);
4894    cbnz(tmp1, DONE);
4895  } br(GT, NEXT_WORD);
4896  // Last longword.  In the case where length == 4 we compare the
4897  // same longword twice, but that's still faster than another
4898  // conditional branch.
4899  // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
4900  // length == 4.
4901  if (log_elem_size > 0)
4902    lsl(cnt1, cnt1, log_elem_size);
4903  ldr(tmp1, Address(a1, cnt1));
4904  ldr(tmp2, Address(a2, cnt1));
4905  eor(tmp1, tmp1, tmp2);
4906  cbnz(tmp1, DONE);
4907  b(SAME);
4908
4909  bind(SHORT);
4910  Label TAIL03, TAIL01;
4911
4912  tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
4913  {
4914    ldrw(tmp1, Address(post(a1, 4)));
4915    ldrw(tmp2, Address(post(a2, 4)));
4916    eorw(tmp1, tmp1, tmp2);
4917    cbnzw(tmp1, DONE);
4918  }
4919  bind(TAIL03);
4920  tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
4921  {
4922    ldrh(tmp1, Address(post(a1, 2)));
4923    ldrh(tmp2, Address(post(a2, 2)));
4924    eorw(tmp1, tmp1, tmp2);
4925    cbnzw(tmp1, DONE);
4926  }
4927  bind(TAIL01);
4928  if (elem_size == 1) { // Only needed when comparing byte arrays.
4929    tbz(cnt1, 0, SAME); // 0-1 bytes left.
4930    {
4931      ldrb(tmp1, a1);
4932      ldrb(tmp2, a2);
4933      eorw(tmp1, tmp1, tmp2);
4934      cbnzw(tmp1, DONE);
4935    }
4936  }
4937  // Arrays are equal.
4938  bind(SAME);
4939  mov(result, true);
4940
4941  // That's it.
4942  bind(DONE);
4943  BLOCK_COMMENT(is_string ? "} string_equals" : "} array_equals");
4944}
4945
4946
4947// base:     Address of a buffer to be zeroed, 8 bytes aligned.
4948// cnt:      Count in HeapWords.
4949// is_large: True when 'cnt' is known to be >= BlockZeroingLowLimit.
4950void MacroAssembler::zero_words(Register base, Register cnt)
4951{
4952  if (UseBlockZeroing) {
4953    block_zero(base, cnt);
4954  } else {
4955    fill_words(base, cnt, zr);
4956  }
4957}
4958
4959// r10 = base:   Address of a buffer to be zeroed, 8 bytes aligned.
4960// cnt:          Immediate count in HeapWords.
4961// r11 = tmp:    For use as cnt if we need to call out
4962#define ShortArraySize (18 * BytesPerLong)
4963void MacroAssembler::zero_words(Register base, u_int64_t cnt)
4964{
4965  Register tmp = r11;
4966  int i = cnt & 1;  // store any odd word to start
4967  if (i) str(zr, Address(base));
4968
4969  if (cnt <= ShortArraySize / BytesPerLong) {
4970    for (; i < (int)cnt; i += 2)
4971      stp(zr, zr, Address(base, i * wordSize));
4972  } else if (UseBlockZeroing && cnt >= (u_int64_t)(BlockZeroingLowLimit >> LogBytesPerWord)) {
4973    mov(tmp, cnt);
4974    block_zero(base, tmp, true);
4975  } else {
4976    const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
4977    int remainder = cnt % (2 * unroll);
4978    for (; i < remainder; i += 2)
4979      stp(zr, zr, Address(base, i * wordSize));
4980
4981    Label loop;
4982    Register cnt_reg = rscratch1;
4983    Register loop_base = rscratch2;
4984    cnt = cnt - remainder;
4985    mov(cnt_reg, cnt);
4986    // adjust base and prebias by -2 * wordSize so we can pre-increment
4987    add(loop_base, base, (remainder - 2) * wordSize);
4988    bind(loop);
4989    sub(cnt_reg, cnt_reg, 2 * unroll);
4990    for (i = 1; i < unroll; i++)
4991      stp(zr, zr, Address(loop_base, 2 * i * wordSize));
4992    stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
4993    cbnz(cnt_reg, loop);
4994  }
4995}
4996
4997// base:   Address of a buffer to be filled, 8 bytes aligned.
4998// cnt:    Count in 8-byte unit.
4999// value:  Value to be filled with.
5000// base will point to the end of the buffer after filling.
5001void MacroAssembler::fill_words(Register base, Register cnt, Register value)
5002{
5003//  Algorithm:
5004//
5005//    scratch1 = cnt & 7;
5006//    cnt -= scratch1;
5007//    p += scratch1;
5008//    switch (scratch1) {
5009//      do {
5010//        cnt -= 8;
5011//          p[-8] = v;
5012//        case 7:
5013//          p[-7] = v;
5014//        case 6:
5015//          p[-6] = v;
5016//          // ...
5017//        case 1:
5018//          p[-1] = v;
5019//        case 0:
5020//          p += 8;
5021//      } while (cnt);
5022//    }
5023
5024  assert_different_registers(base, cnt, value, rscratch1, rscratch2);
5025
5026  Label fini, skip, entry, loop;
5027  const int unroll = 8; // Number of stp instructions we'll unroll
5028
5029  cbz(cnt, fini);
5030  tbz(base, 3, skip);
5031  str(value, Address(post(base, 8)));
5032  sub(cnt, cnt, 1);
5033  bind(skip);
5034
5035  andr(rscratch1, cnt, (unroll-1) * 2);
5036  sub(cnt, cnt, rscratch1);
5037  add(base, base, rscratch1, Assembler::LSL, 3);
5038  adr(rscratch2, entry);
5039  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
5040  br(rscratch2);
5041
5042  bind(loop);
5043  add(base, base, unroll * 16);
5044  for (int i = -unroll; i < 0; i++)
5045    stp(value, value, Address(base, i * 16));
5046  bind(entry);
5047  subs(cnt, cnt, unroll * 2);
5048  br(Assembler::GE, loop);
5049
5050  tbz(cnt, 0, fini);
5051  str(value, Address(post(base, 8)));
5052  bind(fini);
5053}
5054
5055// Use DC ZVA to do fast zeroing.
5056// base:   Address of a buffer to be zeroed, 8 bytes aligned.
5057// cnt:    Count in HeapWords.
5058// is_large: True when 'cnt' is known to be >= BlockZeroingLowLimit.
5059void MacroAssembler::block_zero(Register base, Register cnt, bool is_large)
5060{
5061  Label small;
5062  Label store_pair, loop_store_pair, done;
5063  Label base_aligned;
5064
5065  assert_different_registers(base, cnt, rscratch1);
5066  guarantee(base == r10 && cnt == r11, "fix register usage");
5067
5068  Register tmp = rscratch1;
5069  Register tmp2 = rscratch2;
5070  int zva_length = VM_Version::zva_length();
5071
5072  // Ensure ZVA length can be divided by 16. This is required by
5073  // the subsequent operations.
5074  assert (zva_length % 16 == 0, "Unexpected ZVA Length");
5075
5076  if (!is_large) cbz(cnt, done);
5077  tbz(base, 3, base_aligned);
5078  str(zr, Address(post(base, 8)));
5079  sub(cnt, cnt, 1);
5080  bind(base_aligned);
5081
5082  // Ensure count >= zva_length * 2 so that it still deserves a zva after
5083  // alignment.
5084  if (!is_large || !(BlockZeroingLowLimit >= zva_length * 2)) {
5085    int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
5086    subs(tmp, cnt, low_limit >> 3);
5087    br(Assembler::LT, small);
5088  }
5089
5090  far_call(StubRoutines::aarch64::get_zero_longs());
5091
5092  bind(small);
5093
5094  const int unroll = 8; // Number of stp instructions we'll unroll
5095  Label small_loop, small_table_end;
5096
5097  andr(tmp, cnt, (unroll-1) * 2);
5098  sub(cnt, cnt, tmp);
5099  add(base, base, tmp, Assembler::LSL, 3);
5100  adr(tmp2, small_table_end);
5101  sub(tmp2, tmp2, tmp, Assembler::LSL, 1);
5102  br(tmp2);
5103
5104  bind(small_loop);
5105  add(base, base, unroll * 16);
5106  for (int i = -unroll; i < 0; i++)
5107    stp(zr, zr, Address(base, i * 16));
5108  bind(small_table_end);
5109  subs(cnt, cnt, unroll * 2);
5110  br(Assembler::GE, small_loop);
5111
5112  tbz(cnt, 0, done);
5113  str(zr, Address(post(base, 8)));
5114
5115  bind(done);
5116}
5117
5118// Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
5119// java/lang/StringUTF16.compress.
5120void MacroAssembler::encode_iso_array(Register src, Register dst,
5121                      Register len, Register result,
5122                      FloatRegister Vtmp1, FloatRegister Vtmp2,
5123                      FloatRegister Vtmp3, FloatRegister Vtmp4)
5124{
5125    Label DONE, NEXT_32, LOOP_8, NEXT_8, LOOP_1, NEXT_1;
5126    Register tmp1 = rscratch1;
5127
5128      mov(result, len); // Save initial len
5129
5130#ifndef BUILTIN_SIM
5131      subs(len, len, 32);
5132      br(LT, LOOP_8);
5133
5134// The following code uses the SIMD 'uqxtn' and 'uqxtn2' instructions
5135// to convert chars to bytes. These set the 'QC' bit in the FPSR if
5136// any char could not fit in a byte, so clear the FPSR so we can test it.
5137      clear_fpsr();
5138
5139    BIND(NEXT_32);
5140      ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5141      uqxtn(Vtmp1, T8B, Vtmp1, T8H);  // uqxtn  - write bottom half
5142      uqxtn(Vtmp1, T16B, Vtmp2, T8H); // uqxtn2 - write top half
5143      uqxtn(Vtmp2, T8B, Vtmp3, T8H);
5144      uqxtn(Vtmp2, T16B, Vtmp4, T8H); // uqxtn2
5145      get_fpsr(tmp1);
5146      cbnzw(tmp1, LOOP_8);
5147      st1(Vtmp1, Vtmp2, T16B, post(dst, 32));
5148      subs(len, len, 32);
5149      add(src, src, 64);
5150      br(GE, NEXT_32);
5151
5152    BIND(LOOP_8);
5153      adds(len, len, 32-8);
5154      br(LT, LOOP_1);
5155      clear_fpsr(); // QC may be set from loop above, clear again
5156    BIND(NEXT_8);
5157      ld1(Vtmp1, T8H, src);
5158      uqxtn(Vtmp1, T8B, Vtmp1, T8H);
5159      get_fpsr(tmp1);
5160      cbnzw(tmp1, LOOP_1);
5161      st1(Vtmp1, T8B, post(dst, 8));
5162      subs(len, len, 8);
5163      add(src, src, 16);
5164      br(GE, NEXT_8);
5165
5166    BIND(LOOP_1);
5167      adds(len, len, 8);
5168      br(LE, DONE);
5169#else
5170      cbz(len, DONE);
5171#endif
5172    BIND(NEXT_1);
5173      ldrh(tmp1, Address(post(src, 2)));
5174      tst(tmp1, 0xff00);
5175      br(NE, DONE);
5176      strb(tmp1, Address(post(dst, 1)));
5177      subs(len, len, 1);
5178      br(GT, NEXT_1);
5179
5180    BIND(DONE);
5181      sub(result, result, len); // Return index where we stopped
5182                                // Return len == 0 if we processed all
5183                                // characters
5184}
5185
5186
5187// Inflate byte[] array to char[].
5188void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
5189                                        FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
5190                                        Register tmp4) {
5191  Label big, done;
5192
5193  assert_different_registers(src, dst, len, tmp4, rscratch1);
5194
5195  fmovd(vtmp1 , zr);
5196  lsrw(rscratch1, len, 3);
5197
5198  cbnzw(rscratch1, big);
5199
5200  // Short string: less than 8 bytes.
5201  {
5202    Label loop, around, tiny;
5203
5204    subsw(len, len, 4);
5205    andw(len, len, 3);
5206    br(LO, tiny);
5207
5208    // Use SIMD to do 4 bytes.
5209    ldrs(vtmp2, post(src, 4));
5210    zip1(vtmp3, T8B, vtmp2, vtmp1);
5211    strd(vtmp3, post(dst, 8));
5212
5213    cbzw(len, done);
5214
5215    // Do the remaining bytes by steam.
5216    bind(loop);
5217    ldrb(tmp4, post(src, 1));
5218    strh(tmp4, post(dst, 2));
5219    subw(len, len, 1);
5220
5221    bind(tiny);
5222    cbnz(len, loop);
5223
5224    bind(around);
5225    b(done);
5226  }
5227
5228  // Unpack the bytes 8 at a time.
5229  bind(big);
5230  andw(len, len, 7);
5231
5232  {
5233    Label loop, around;
5234
5235    bind(loop);
5236    ldrd(vtmp2, post(src, 8));
5237    sub(rscratch1, rscratch1, 1);
5238    zip1(vtmp3, T16B, vtmp2, vtmp1);
5239    st1(vtmp3, T8H, post(dst, 16));
5240    cbnz(rscratch1, loop);
5241
5242    bind(around);
5243  }
5244
5245  // Do the tail of up to 8 bytes.
5246  sub(src, src, 8);
5247  add(src, src, len, ext::uxtw, 0);
5248  ldrd(vtmp2, Address(src));
5249  sub(dst, dst, 16);
5250  add(dst, dst, len, ext::uxtw, 1);
5251  zip1(vtmp3, T16B, vtmp2, vtmp1);
5252  st1(vtmp3, T8H, Address(dst));
5253
5254  bind(done);
5255}
5256
5257// Compress char[] array to byte[].
5258void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
5259                                         FloatRegister tmp1Reg, FloatRegister tmp2Reg,
5260                                         FloatRegister tmp3Reg, FloatRegister tmp4Reg,
5261                                         Register result) {
5262  encode_iso_array(src, dst, len, result,
5263                   tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
5264  cmp(len, zr);
5265  csel(result, result, zr, EQ);
5266}
5267
5268// get_thread() can be called anywhere inside generated code so we
5269// need to save whatever non-callee save context might get clobbered
5270// by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
5271// the call setup code.
5272//
5273// aarch64_get_thread_helper() clobbers only r0, r1, and flags.
5274//
5275void MacroAssembler::get_thread(Register dst) {
5276  RegSet saved_regs = RegSet::range(r0, r1) + lr - dst;
5277  push(saved_regs, sp);
5278
5279  mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
5280  blrt(lr, 1, 0, 1);
5281  if (dst != c_rarg0) {
5282    mov(dst, c_rarg0);
5283  }
5284
5285  pop(saved_regs, sp);
5286}
5287