macroAssembler_aarch64.cpp revision 9801:80f8be586fae
1/*
2 * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26#include <sys/types.h>
27
28#include "precompiled.hpp"
29#include "asm/assembler.hpp"
30#include "asm/assembler.inline.hpp"
31#include "interpreter/interpreter.hpp"
32
33#include "compiler/disassembler.hpp"
34#include "memory/resourceArea.hpp"
35#include "nativeInst_aarch64.hpp"
36#include "oops/klass.inline.hpp"
37#include "oops/oop.inline.hpp"
38#include "opto/compile.hpp"
39#include "opto/node.hpp"
40#include "runtime/biasedLocking.hpp"
41#include "runtime/icache.hpp"
42#include "runtime/interfaceSupport.hpp"
43#include "runtime/sharedRuntime.hpp"
44#include "runtime/thread.hpp"
45
46#if INCLUDE_ALL_GCS
47#include "gc/g1/g1CollectedHeap.inline.hpp"
48#include "gc/g1/g1SATBCardTableModRefBS.hpp"
49#include "gc/g1/heapRegion.hpp"
50#endif
51
// Helpers for annotating generated code.
//   BLOCK_COMMENT(str) - expands to nothing in PRODUCT builds, otherwise to
//                        block_comment(str).
//   STOP(error)        - always calls stop(error); in non-PRODUCT builds it
//                        first calls block_comment(error).
//
// NOTE(review): in non-PRODUCT builds STOP expands to two statements that are
// not wrapped in a block, so using it as the sole body of an unbraced
// if/else/loop would only guard the block_comment() call.
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// Bind a label and attach a block comment of the form "<label>:" (the comment
// part disappears in PRODUCT builds).  Like STOP, this is two statements in
// non-PRODUCT builds.
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
61
// Patch any kind of instruction; there may be several instructions.
// Return the total length (in bytes) of the instructions.
//
// branch - address of the first instruction to patch; the 32-bit word found
//          there selects which of the encodings below is rewritten.
// target - the address the instruction (sequence) should refer to afterwards.
//
// The result is the number of instructions considered part of the patched
// sequence (1, 2 or 3) multiplied by NativeInstruction::instruction_size.
// An instruction word matching none of the recognised patterns ends up in
// ShouldNotReachHere().
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  // Default displacement: distance from branch to target in 4-byte units.
  long offset = (target - branch) >> 2;
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    // Start from the plain byte distance; replaced by a page distance below
    // when bit 31 of the instruction is set (the adrp form).
    offset = target-branch;
    int shift = Instruction_aarch64::extract(insn, 31, 31);
    if (shift) {
      u_int64_t dest = (u_int64_t)target;
      // 4K page numbers of the instruction and of the target.
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      // Offset of the target within its page; goes into the follow-on
      // ldr/str or add when one is recognised.
      unsigned offset_lo = dest & 0xfff;
      offset = adr_page - pc_page;

      // We handle 4 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 3 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
      // to be followed by a random unrelated ldr/str, add or movk instruction.
      //
      // insn2 is the instruction word immediately following the adrp.
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        // Bits 31:30 of the load/store give the scale applied to the
        // immediate, so the in-page offset is stored shifted right by it.
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                    21, 10, offset_lo >> size);
        // The target must be a multiple of (1 << size), otherwise the shift
        // above dropped non-zero bits.
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        // Note: the register check compares bits 4:0 of both instructions.
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                   Instruction_aarch64::extract(insn, 4, 0) ==
                     Instruction_aarch64::extract(insn2, 4, 0)) {
        // movk #imm16<<32
        // The movk immediate receives target >> 32, and only the low 20 bits
        // of the page distance are kept for the adrp.
        Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
        offset &= (1<<20)-1;
        instructions = 2;
      }
    }
    // The displacement is split: its low two bits go to bits 30:29 and the
    // remainder to the signed field in bits 23:5.
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    u_int64_t dest = (u_int64_t)target;
    // Move wide constant
    // Three consecutive instructions each receive one 16-bit slice of the
    // target (bits 15:0, 31:16 and 47:32) in their bits 20:5.
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(target_addr_for_insn(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}
157
158int MacroAssembler::patch_oop(address insn_addr, address o) {
159  int instructions;
160  unsigned insn = *(unsigned*)insn_addr;
161  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");
162
163  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
164  // narrow OOPs by setting the upper 16 bits in the first
165  // instruction.
166  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
167    // Move narrow OOP
168    narrowOop n = oopDesc::encode_heap_oop((oop)o);
169    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
170    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
171    instructions = 2;
172  } else {
173    // Move wide OOP
174    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
175    uintptr_t dest = (uintptr_t)o;
176    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
177    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
178    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
179    instructions = 3;
180  }
181  return instructions * NativeInstruction::instruction_size;
182}
183
184address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
185  long offset = 0;
186  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
187    // Load register (literal)
188    offset = Instruction_aarch64::sextract(insn, 23, 5);
189    return address(((uint64_t)insn_addr + (offset << 2)));
190  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
191    // Unconditional branch (immediate)
192    offset = Instruction_aarch64::sextract(insn, 25, 0);
193  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
194    // Conditional branch (immediate)
195    offset = Instruction_aarch64::sextract(insn, 23, 5);
196  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
197    // Compare & branch (immediate)
198    offset = Instruction_aarch64::sextract(insn, 23, 5);
199   } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
200    // Test & branch (immediate)
201    offset = Instruction_aarch64::sextract(insn, 18, 5);
202  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
203    // PC-rel. addressing
204    offset = Instruction_aarch64::extract(insn, 30, 29);
205    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
206    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
207    if (shift) {
208      offset <<= shift;
209      uint64_t target_page = ((uint64_t)insn_addr) + offset;
210      target_page &= ((uint64_t)-1) << shift;
211      // Return the target address for the following sequences
212      //   1 - adrp    Rx, target_page
213      //       ldr/str Ry, [Rx, #offset_in_page]
214      //   2 - adrp    Rx, target_page
215      //       add     Ry, Rx, #offset_in_page
216      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
217      //       movk    Rx, #imm12<<32
218      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
219      //
220      // In the first two cases  we check that the register is the same and
221      // return the target_page + the offset within the page.
222      // Otherwise we assume it is a page aligned relocation and return
223      // the target page only.
224      //
225      unsigned insn2 = ((unsigned*)insn_addr)[1];
226      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
227                Instruction_aarch64::extract(insn, 4, 0) ==
228                        Instruction_aarch64::extract(insn2, 9, 5)) {
229        // Load/store register (unsigned immediate)
230        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
231        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
232        return address(target_page + (byte_offset << size));
233      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
234                Instruction_aarch64::extract(insn, 4, 0) ==
235                        Instruction_aarch64::extract(insn2, 4, 0)) {
236        // add (immediate)
237        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
238        return address(target_page + byte_offset);
239      } else {
240        if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110  &&
241               Instruction_aarch64::extract(insn, 4, 0) ==
242                 Instruction_aarch64::extract(insn2, 4, 0)) {
243          target_page = (target_page & 0xffffffff) |
244                         ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
245        }
246        return (address)target_page;
247      }
248    } else {
249      ShouldNotReachHere();
250    }
251  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
252    u_int32_t *insns = (u_int32_t *)insn_addr;
253    // Move wide constant: movz, movk, movk.  See movptr().
254    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
255    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
256    return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
257                   + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
258                   + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
259  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
260             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
261    return 0;
262  } else {
263    ShouldNotReachHere();
264  }
265  return address(((uint64_t)insn_addr + (offset << 2)));
266}
267
// Emit a dsb instruction with the SY option.  Neither 'thread' nor 'tmp' is
// used by this implementation.
void MacroAssembler::serialize_memory(Register thread, Register tmp) {
  dsb(Assembler::SY);
}
271
272
273void MacroAssembler::reset_last_Java_frame(bool clear_fp,
274                                           bool clear_pc) {
275  // we must set sp to zero to clear frame
276  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));
277  // must clear fp, so that compiled frames are not confused; it is
278  // possible that we need it only for debugging
279  if (clear_fp) {
280    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
281  }
282
283  if (clear_pc) {
284    str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
285  }
286}
287
288// Calls to C land
289//
290// When entering C land, the rfp, & resp of the last Java frame have to be recorded
291// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
292// has to be reset to 0. This is required to allow proper stack traversal.
293void MacroAssembler::set_last_Java_frame(Register last_java_sp,
294                                         Register last_java_fp,
295                                         Register last_java_pc,
296                                         Register scratch) {
297
298  if (last_java_pc->is_valid()) {
299      str(last_java_pc, Address(rthread,
300                                JavaThread::frame_anchor_offset()
301                                + JavaFrameAnchor::last_Java_pc_offset()));
302    }
303
304  // determine last_java_sp register
305  if (last_java_sp == sp) {
306    mov(scratch, sp);
307    last_java_sp = scratch;
308  } else if (!last_java_sp->is_valid()) {
309    last_java_sp = esp;
310  }
311
312  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));
313
314  // last_java_fp is optional
315  if (last_java_fp->is_valid()) {
316    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
317  }
318}
319
320void MacroAssembler::set_last_Java_frame(Register last_java_sp,
321                                         Register last_java_fp,
322                                         address  last_java_pc,
323                                         Register scratch) {
324  if (last_java_pc != NULL) {
325    adr(scratch, last_java_pc);
326  } else {
327    // FIXME: This is almost never correct.  We should delete all
328    // cases of set_last_Java_frame with last_java_pc=NULL and use the
329    // correct return address instead.
330    adr(scratch, pc());
331  }
332
333  str(scratch, Address(rthread,
334                       JavaThread::frame_anchor_offset()
335                       + JavaFrameAnchor::last_Java_pc_offset()));
336
337  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
338}
339
340void MacroAssembler::set_last_Java_frame(Register last_java_sp,
341                                         Register last_java_fp,
342                                         Label &L,
343                                         Register scratch) {
344  if (L.is_bound()) {
345    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
346  } else {
347    InstructionMark im(this);
348    L.add_patch_at(code(), locator());
349    set_last_Java_frame(last_java_sp, last_java_fp, (address)NULL, scratch);
350  }
351}
352
353void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
354  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
355  assert(CodeCache::find_blob(entry.target()) != NULL,
356         "destination of far call not found in code cache");
357  if (far_branches()) {
358    unsigned long offset;
359    // We can use ADRP here because we know that the total size of
360    // the code cache cannot exceed 2Gb.
361    adrp(tmp, entry, offset);
362    add(tmp, tmp, offset);
363    if (cbuf) cbuf->set_insts_mark();
364    blr(tmp);
365  } else {
366    if (cbuf) cbuf->set_insts_mark();
367    bl(entry);
368  }
369}
370
371void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
372  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
373  assert(CodeCache::find_blob(entry.target()) != NULL,
374         "destination of far call not found in code cache");
375  if (far_branches()) {
376    unsigned long offset;
377    // We can use ADRP here because we know that the total size of
378    // the code cache cannot exceed 2Gb.
379    adrp(tmp, entry, offset);
380    add(tmp, tmp, offset);
381    if (cbuf) cbuf->set_insts_mark();
382    br(tmp);
383  } else {
384    if (cbuf) cbuf->set_insts_mark();
385    b(entry);
386  }
387}
388
389int MacroAssembler::biased_locking_enter(Register lock_reg,
390                                         Register obj_reg,
391                                         Register swap_reg,
392                                         Register tmp_reg,
393                                         bool swap_reg_contains_mark,
394                                         Label& done,
395                                         Label* slow_case,
396                                         BiasedLockingCounters* counters) {
397  assert(UseBiasedLocking, "why call this otherwise?");
398  assert_different_registers(lock_reg, obj_reg, swap_reg);
399
400  if (PrintBiasedLockingStatistics && counters == NULL)
401    counters = BiasedLocking::counters();
402
403  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
404  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
405  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
406  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
407  Address saved_mark_addr(lock_reg, 0);
408
409  // Biased locking
410  // See whether the lock is currently biased toward our thread and
411  // whether the epoch is still valid
412  // Note that the runtime guarantees sufficient alignment of JavaThread
413  // pointers to allow age to be placed into low bits
414  // First check to see whether biasing is even enabled for this object
415  Label cas_label;
416  int null_check_offset = -1;
417  if (!swap_reg_contains_mark) {
418    null_check_offset = offset();
419    ldr(swap_reg, mark_addr);
420  }
421  andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
422  cmp(tmp_reg, markOopDesc::biased_lock_pattern);
423  br(Assembler::NE, cas_label);
424  // The bias pattern is present in the object's header. Need to check
425  // whether the bias owner and the epoch are both still current.
426  load_prototype_header(tmp_reg, obj_reg);
427  orr(tmp_reg, tmp_reg, rthread);
428  eor(tmp_reg, swap_reg, tmp_reg);
429  andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
430  if (counters != NULL) {
431    Label around;
432    cbnz(tmp_reg, around);
433    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
434    b(done);
435    bind(around);
436  } else {
437    cbz(tmp_reg, done);
438  }
439
440  Label try_revoke_bias;
441  Label try_rebias;
442
443  // At this point we know that the header has the bias pattern and
444  // that we are not the bias owner in the current epoch. We need to
445  // figure out more details about the state of the header in order to
446  // know what operations can be legally performed on the object's
447  // header.
448
449  // If the low three bits in the xor result aren't clear, that means
450  // the prototype header is no longer biased and we have to revoke
451  // the bias on this object.
452  andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
453  cbnz(rscratch1, try_revoke_bias);
454
455  // Biasing is still enabled for this data type. See whether the
456  // epoch of the current bias is still valid, meaning that the epoch
457  // bits of the mark word are equal to the epoch bits of the
458  // prototype header. (Note that the prototype header's epoch bits
459  // only change at a safepoint.) If not, attempt to rebias the object
460  // toward the current thread. Note that we must be absolutely sure
461  // that the current epoch is invalid in order to do this because
462  // otherwise the manipulations it performs on the mark word are
463  // illegal.
464  andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
465  cbnz(rscratch1, try_rebias);
466
467  // The epoch of the current bias is still valid but we know nothing
468  // about the owner; it might be set or it might be clear. Try to
469  // acquire the bias of the object using an atomic operation. If this
470  // fails we will go in to the runtime to revoke the object's bias.
471  // Note that we first construct the presumed unbiased header so we
472  // don't accidentally blow away another thread's valid bias.
473  {
474    Label here;
475    mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
476    andr(swap_reg, swap_reg, rscratch1);
477    orr(tmp_reg, swap_reg, rthread);
478    cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
479    // If the biasing toward our thread failed, this means that
480    // another thread succeeded in biasing it toward itself and we
481    // need to revoke that bias. The revocation will occur in the
482    // interpreter runtime in the slow case.
483    bind(here);
484    if (counters != NULL) {
485      atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
486                  tmp_reg, rscratch1, rscratch2);
487    }
488  }
489  b(done);
490
491  bind(try_rebias);
492  // At this point we know the epoch has expired, meaning that the
493  // current "bias owner", if any, is actually invalid. Under these
494  // circumstances _only_, we are allowed to use the current header's
495  // value as the comparison value when doing the cas to acquire the
496  // bias in the current epoch. In other words, we allow transfer of
497  // the bias from one thread to another directly in this situation.
498  //
499  // FIXME: due to a lack of registers we currently blow away the age
500  // bits in this situation. Should attempt to preserve them.
501  {
502    Label here;
503    load_prototype_header(tmp_reg, obj_reg);
504    orr(tmp_reg, rthread, tmp_reg);
505    cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
506    // If the biasing toward our thread failed, then another thread
507    // succeeded in biasing it toward itself and we need to revoke that
508    // bias. The revocation will occur in the runtime in the slow case.
509    bind(here);
510    if (counters != NULL) {
511      atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
512                  tmp_reg, rscratch1, rscratch2);
513    }
514  }
515  b(done);
516
517  bind(try_revoke_bias);
518  // The prototype mark in the klass doesn't have the bias bit set any
519  // more, indicating that objects of this data type are not supposed
520  // to be biased any more. We are going to try to reset the mark of
521  // this object to the prototype value and fall through to the
522  // CAS-based locking scheme. Note that if our CAS fails, it means
523  // that another thread raced us for the privilege of revoking the
524  // bias of this particular object, so it's okay to continue in the
525  // normal locking code.
526  //
527  // FIXME: due to a lack of registers we currently blow away the age
528  // bits in this situation. Should attempt to preserve them.
529  {
530    Label here, nope;
531    load_prototype_header(tmp_reg, obj_reg);
532    cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
533    bind(here);
534
535    // Fall through to the normal CAS-based lock, because no matter what
536    // the result of the above CAS, some thread must have succeeded in
537    // removing the bias bit from the object's header.
538    if (counters != NULL) {
539      atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
540                  rscratch1, rscratch2);
541    }
542    bind(nope);
543  }
544
545  bind(cas_label);
546
547  return null_check_offset;
548}
549
550void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
551  assert(UseBiasedLocking, "why call this otherwise?");
552
553  // Check for biased locking unlock case, which is a no-op
554  // Note: we do not have to check the thread ID for two reasons.
555  // First, the interpreter checks for IllegalMonitorStateException at
556  // a higher level. Second, if the bias was revoked while we held the
557  // lock, the object could not be rebiased toward another thread, so
558  // the bias bit would be clear.
559  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
560  andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
561  cmp(temp_reg, markOopDesc::biased_lock_pattern);
562  br(Assembler::EQ, done);
563}
564
565
// added to make this compile
// NOTE(review): REGISTER_DEFINITION is defined outside this file; presumably
// this supplies the definition of the 'noreg' Register constant -- confirm.

REGISTER_DEFINITION(Register, noreg);
569
570static void pass_arg0(MacroAssembler* masm, Register arg) {
571  if (c_rarg0 != arg ) {
572    masm->mov(c_rarg0, arg);
573  }
574}
575
576static void pass_arg1(MacroAssembler* masm, Register arg) {
577  if (c_rarg1 != arg ) {
578    masm->mov(c_rarg1, arg);
579  }
580}
581
582static void pass_arg2(MacroAssembler* masm, Register arg) {
583  if (c_rarg2 != arg ) {
584    masm->mov(c_rarg2, arg);
585  }
586}
587
588static void pass_arg3(MacroAssembler* masm, Register arg) {
589  if (c_rarg3 != arg ) {
590    masm->mov(c_rarg3, arg);
591  }
592}
593
594void MacroAssembler::call_VM_base(Register oop_result,
595                                  Register java_thread,
596                                  Register last_java_sp,
597                                  address  entry_point,
598                                  int      number_of_arguments,
599                                  bool     check_exceptions) {
600   // determine java_thread register
601  if (!java_thread->is_valid()) {
602    java_thread = rthread;
603  }
604
605  // determine last_java_sp register
606  if (!last_java_sp->is_valid()) {
607    last_java_sp = esp;
608  }
609
610  // debugging support
611  assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
612  assert(java_thread == rthread, "unexpected register");
613#ifdef ASSERT
614  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
615  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
616#endif // ASSERT
617
618  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
619  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
620
621  // push java thread (becomes first argument of C function)
622
623  mov(c_rarg0, java_thread);
624
625  // set last Java frame before call
626  assert(last_java_sp != rfp, "can't use rfp");
627
628  Label l;
629  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);
630
631  // do the call, remove parameters
632  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);
633
634  // reset last Java frame
635  // Only interpreter should have to clear fp
636  reset_last_Java_frame(true, true);
637
638   // C++ interp handles this in the interpreter
639  check_and_handle_popframe(java_thread);
640  check_and_handle_earlyret(java_thread);
641
642  if (check_exceptions) {
643    // check for pending exceptions (java_thread is set upon return)
644    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
645    Label ok;
646    cbz(rscratch1, ok);
647    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
648    br(rscratch1);
649    bind(ok);
650  }
651
652  // get oop result if there is one and reset the value in the thread
653  if (oop_result->is_valid()) {
654    get_vm_result(oop_result, java_thread);
655  }
656}
657
658void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
659  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
660}
661
662// Maybe emit a call via a trampoline.  If the code cache is small
663// trampolines won't be emitted.
664
665address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
666  assert(entry.rspec().type() == relocInfo::runtime_call_type
667         || entry.rspec().type() == relocInfo::opt_virtual_call_type
668         || entry.rspec().type() == relocInfo::static_call_type
669         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");
670
671  unsigned int start_offset = offset();
672  if (far_branches() && !Compile::current()->in_scratch_emit_size()) {
673    address stub = emit_trampoline_stub(start_offset, entry.target());
674    if (stub == NULL) {
675      return NULL; // CodeCache is full
676    }
677  }
678
679  if (cbuf) cbuf->set_insts_mark();
680  relocate(entry.rspec());
681  if (Assembler::reachable_from_branch_at(pc(), entry.target())) {
682    bl(entry.target());
683  } else {
684    bl(pc());
685  }
686  // just need to return a non-null address
687  return pc();
688}
689
690
691// Emit a trampoline stub for a call to a target which is too far away.
692//
693// code sequences:
694//
695// call-site:
696//   branch-and-link to <destination> or <trampoline stub>
697//
698// Related trampoline stub for this call site in the stub section:
699//   load the call target from the constant pool
700//   branch (LR still points to the call site above)
701
702address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
703                                             address dest) {
704  address stub = start_a_stub(Compile::MAX_stubs_size/2);
705  if (stub == NULL) {
706    return NULL;  // CodeBuffer::expand failed
707  }
708
709  // Create a trampoline stub relocation which relates this trampoline stub
710  // with the call instruction at insts_call_instruction_offset in the
711  // instructions code-section.
712  align(wordSize);
713  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
714                                            + insts_call_instruction_offset));
715  const int stub_start_offset = offset();
716
717  // Now, create the trampoline stub's code:
718  // - load the call
719  // - call
720  Label target;
721  ldr(rscratch1, target);
722  br(rscratch1);
723  bind(target);
724  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
725         "should be");
726  emit_int64((int64_t)dest);
727
728  const address stub_start_addr = addr_at(stub_start_offset);
729
730  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
731
732  end_a_stub();
733  return stub;
734}
735
736address MacroAssembler::ic_call(address entry, jint method_index) {
737  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
738  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
739  // unsigned long offset;
740  // ldr_constant(rscratch2, const_ptr);
741  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
742  return trampoline_call(Address(entry, rh));
743}
744
745// Implementation of call_VM versions
746
747void MacroAssembler::call_VM(Register oop_result,
748                             address entry_point,
749                             bool check_exceptions) {
750  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
751}
752
753void MacroAssembler::call_VM(Register oop_result,
754                             address entry_point,
755                             Register arg_1,
756                             bool check_exceptions) {
757  pass_arg1(this, arg_1);
758  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
759}
760
761void MacroAssembler::call_VM(Register oop_result,
762                             address entry_point,
763                             Register arg_1,
764                             Register arg_2,
765                             bool check_exceptions) {
766  assert(arg_1 != c_rarg2, "smashed arg");
767  pass_arg2(this, arg_2);
768  pass_arg1(this, arg_1);
769  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
770}
771
772void MacroAssembler::call_VM(Register oop_result,
773                             address entry_point,
774                             Register arg_1,
775                             Register arg_2,
776                             Register arg_3,
777                             bool check_exceptions) {
778  assert(arg_1 != c_rarg3, "smashed arg");
779  assert(arg_2 != c_rarg3, "smashed arg");
780  pass_arg3(this, arg_3);
781
782  assert(arg_1 != c_rarg2, "smashed arg");
783  pass_arg2(this, arg_2);
784
785  pass_arg1(this, arg_1);
786  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
787}
788
789void MacroAssembler::call_VM(Register oop_result,
790                             Register last_java_sp,
791                             address entry_point,
792                             int number_of_arguments,
793                             bool check_exceptions) {
794  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
795}
796
797void MacroAssembler::call_VM(Register oop_result,
798                             Register last_java_sp,
799                             address entry_point,
800                             Register arg_1,
801                             bool check_exceptions) {
802  pass_arg1(this, arg_1);
803  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
804}
805
806void MacroAssembler::call_VM(Register oop_result,
807                             Register last_java_sp,
808                             address entry_point,
809                             Register arg_1,
810                             Register arg_2,
811                             bool check_exceptions) {
812
813  assert(arg_1 != c_rarg2, "smashed arg");
814  pass_arg2(this, arg_2);
815  pass_arg1(this, arg_1);
816  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
817}
818
819void MacroAssembler::call_VM(Register oop_result,
820                             Register last_java_sp,
821                             address entry_point,
822                             Register arg_1,
823                             Register arg_2,
824                             Register arg_3,
825                             bool check_exceptions) {
826  assert(arg_1 != c_rarg3, "smashed arg");
827  assert(arg_2 != c_rarg3, "smashed arg");
828  pass_arg3(this, arg_3);
829  assert(arg_1 != c_rarg2, "smashed arg");
830  pass_arg2(this, arg_2);
831  pass_arg1(this, arg_1);
832  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
833}
834
835
836void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
837  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
838  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
839  verify_oop(oop_result, "broken oop in call_VM_base");
840}
841
842void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
843  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
844  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
845}
846
847void MacroAssembler::align(int modulus) {
848  while (offset() % modulus != 0) nop();
849}
850
851// these are no-ops overridden by InterpreterMacroAssembler
852
853void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }
854
855void MacroAssembler::check_and_handle_popframe(Register java_thread) { }
856
857
858RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
859                                                      Register tmp,
860                                                      int offset) {
861  intptr_t value = *delayed_value_addr;
862  if (value != 0)
863    return RegisterOrConstant(value + offset);
864
865  // load indirectly to solve generation ordering problem
866  ldr(tmp, ExternalAddress((address) delayed_value_addr));
867
868  if (offset != 0)
869    add(tmp, tmp, offset);
870
871  return RegisterOrConstant(tmp);
872}
873
874
875void MacroAssembler:: notify(int type) {
876  if (type == bytecode_start) {
877    // set_last_Java_frame(esp, rfp, (address)NULL);
878    Assembler:: notify(type);
879    // reset_last_Java_frame(true, false);
880  }
881  else
882    Assembler:: notify(type);
883}
884
885// Look up the method for a megamorphic invokeinterface call.
886// The target method is determined by <intf_klass, itable_index>.
887// The receiver klass is in recv_klass.
888// On success, the result will be in method_result, and execution falls through.
889// On failure, execution transfers to the given label.
890void MacroAssembler::lookup_interface_method(Register recv_klass,
891                                             Register intf_klass,
892                                             RegisterOrConstant itable_index,
893                                             Register method_result,
894                                             Register scan_temp,
895                                             Label& L_no_such_interface) {
896  assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
897  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
898         "caller must use same register for non-constant itable index as for method");
899
900  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
901  int vtable_base = InstanceKlass::vtable_start_offset() * wordSize;
902  int itentry_off = itableMethodEntry::method_offset_in_bytes();
903  int scan_step   = itableOffsetEntry::size() * wordSize;
904  int vte_size    = vtableEntry::size() * wordSize;
905  assert(vte_size == wordSize, "else adjust times_vte_scale");
906
907  ldrw(scan_temp, Address(recv_klass, InstanceKlass::vtable_length_offset() * wordSize));
908
909  // %%% Could store the aligned, prescaled offset in the klassoop.
910  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
911  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
912  add(scan_temp, scan_temp, vtable_base);
913  if (HeapWordsPerLong > 1) {
914    // Round up to align_object_offset boundary
915    // see code for instanceKlass::start_of_itable!
916    round_to(scan_temp, BytesPerLong);
917  }
918
919  // Adjust recv_klass by scaled itable_index, so we can free itable_index.
920  assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
921  // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
922  lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
923  if (itentry_off)
924    add(recv_klass, recv_klass, itentry_off);
925
926  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
927  //   if (scan->interface() == intf) {
928  //     result = (klass + scan->offset() + itable_index);
929  //   }
930  // }
931  Label search, found_method;
932
933  for (int peel = 1; peel >= 0; peel--) {
934    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
935    cmp(intf_klass, method_result);
936
937    if (peel) {
938      br(Assembler::EQ, found_method);
939    } else {
940      br(Assembler::NE, search);
941      // (invert the test to fall through to found_method...)
942    }
943
944    if (!peel)  break;
945
946    bind(search);
947
948    // Check that the previous entry is non-null.  A null entry means that
949    // the receiver class doesn't implement the interface, and wasn't the
950    // same as when the caller was compiled.
951    cbz(method_result, L_no_such_interface);
952    add(scan_temp, scan_temp, scan_step);
953  }
954
955  bind(found_method);
956
957  // Got a hit.
958  ldr(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
959  ldr(method_result, Address(recv_klass, scan_temp));
960}
961
962// virtual method calling
963void MacroAssembler::lookup_virtual_method(Register recv_klass,
964                                           RegisterOrConstant vtable_index,
965                                           Register method_result) {
966  const int base = InstanceKlass::vtable_start_offset() * wordSize;
967  assert(vtableEntry::size() * wordSize == 8,
968         "adjust the scaling in the code below");
969  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();
970
971  if (vtable_index.is_register()) {
972    lea(method_result, Address(recv_klass,
973                               vtable_index.as_register(),
974                               Address::lsl(LogBytesPerWord)));
975    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
976  } else {
977    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
978    ldr(method_result, Address(recv_klass, vtable_offset_in_bytes));
979  }
980}
981
982void MacroAssembler::check_klass_subtype(Register sub_klass,
983                           Register super_klass,
984                           Register temp_reg,
985                           Label& L_success) {
986  Label L_failure;
987  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
988  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
989  bind(L_failure);
990}
991
992
993void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
994                                                   Register super_klass,
995                                                   Register temp_reg,
996                                                   Label* L_success,
997                                                   Label* L_failure,
998                                                   Label* L_slow_path,
999                                        RegisterOrConstant super_check_offset) {
1000  assert_different_registers(sub_klass, super_klass, temp_reg);
1001  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1002  if (super_check_offset.is_register()) {
1003    assert_different_registers(sub_klass, super_klass,
1004                               super_check_offset.as_register());
1005  } else if (must_load_sco) {
1006    assert(temp_reg != noreg, "supply either a temp or a register offset");
1007  }
1008
1009  Label L_fallthrough;
1010  int label_nulls = 0;
1011  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1012  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1013  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
1014  assert(label_nulls <= 1, "at most one NULL in the batch");
1015
1016  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1017  int sco_offset = in_bytes(Klass::super_check_offset_offset());
1018  Address super_check_offset_addr(super_klass, sco_offset);
1019
1020  // Hacked jmp, which may only be used just before L_fallthrough.
1021#define final_jmp(label)                                                \
1022  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
1023  else                            b(label)                /*omit semi*/
1024
1025  // If the pointers are equal, we are done (e.g., String[] elements).
1026  // This self-check enables sharing of secondary supertype arrays among
1027  // non-primary types such as array-of-interface.  Otherwise, each such
1028  // type would need its own customized SSA.
1029  // We move this check to the front of the fast path because many
1030  // type checks are in fact trivially successful in this manner,
1031  // so we get a nicely predicted branch right at the start of the check.
1032  cmp(sub_klass, super_klass);
1033  br(Assembler::EQ, *L_success);
1034
1035  // Check the supertype display:
1036  if (must_load_sco) {
1037    ldrw(temp_reg, super_check_offset_addr);
1038    super_check_offset = RegisterOrConstant(temp_reg);
1039  }
1040  Address super_check_addr(sub_klass, super_check_offset);
1041  ldr(rscratch1, super_check_addr);
1042  cmp(super_klass, rscratch1); // load displayed supertype
1043
1044  // This check has worked decisively for primary supers.
1045  // Secondary supers are sought in the super_cache ('super_cache_addr').
1046  // (Secondary supers are interfaces and very deeply nested subtypes.)
1047  // This works in the same check above because of a tricky aliasing
1048  // between the super_cache and the primary super display elements.
1049  // (The 'super_check_addr' can address either, as the case requires.)
1050  // Note that the cache is updated below if it does not help us find
1051  // what we need immediately.
1052  // So if it was a primary super, we can just fail immediately.
1053  // Otherwise, it's the slow path for us (no success at this point).
1054
1055  if (super_check_offset.is_register()) {
1056    br(Assembler::EQ, *L_success);
1057    cmp(super_check_offset.as_register(), sc_offset);
1058    if (L_failure == &L_fallthrough) {
1059      br(Assembler::EQ, *L_slow_path);
1060    } else {
1061      br(Assembler::NE, *L_failure);
1062      final_jmp(*L_slow_path);
1063    }
1064  } else if (super_check_offset.as_constant() == sc_offset) {
1065    // Need a slow path; fast failure is impossible.
1066    if (L_slow_path == &L_fallthrough) {
1067      br(Assembler::EQ, *L_success);
1068    } else {
1069      br(Assembler::NE, *L_slow_path);
1070      final_jmp(*L_success);
1071    }
1072  } else {
1073    // No slow path; it's a fast decision.
1074    if (L_failure == &L_fallthrough) {
1075      br(Assembler::EQ, *L_success);
1076    } else {
1077      br(Assembler::NE, *L_failure);
1078      final_jmp(*L_success);
1079    }
1080  }
1081
1082  bind(L_fallthrough);
1083
1084#undef final_jmp
1085}
1086
1087// These two are taken from x86, but they look generally useful
1088
1089// scans count pointer sized words at [addr] for occurence of value,
1090// generic
1091void MacroAssembler::repne_scan(Register addr, Register value, Register count,
1092                                Register scratch) {
1093  Label Lloop, Lexit;
1094  cbz(count, Lexit);
1095  bind(Lloop);
1096  ldr(scratch, post(addr, wordSize));
1097  cmp(value, scratch);
1098  br(EQ, Lexit);
1099  sub(count, count, 1);
1100  cbnz(count, Lloop);
1101  bind(Lexit);
1102}
1103
1104// scans count 4 byte words at [addr] for occurence of value,
1105// generic
1106void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
1107                                Register scratch) {
1108  Label Lloop, Lexit;
1109  cbz(count, Lexit);
1110  bind(Lloop);
1111  ldrw(scratch, post(addr, wordSize));
1112  cmpw(value, scratch);
1113  br(EQ, Lexit);
1114  sub(count, count, 1);
1115  cbnz(count, Lloop);
1116  bind(Lexit);
1117}
1118
1119void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1120                                                   Register super_klass,
1121                                                   Register temp_reg,
1122                                                   Register temp2_reg,
1123                                                   Label* L_success,
1124                                                   Label* L_failure,
1125                                                   bool set_cond_codes) {
1126  assert_different_registers(sub_klass, super_klass, temp_reg);
1127  if (temp2_reg != noreg)
1128    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
1129#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
1130
1131  Label L_fallthrough;
1132  int label_nulls = 0;
1133  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1134  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1135  assert(label_nulls <= 1, "at most one NULL in the batch");
1136
1137  // a couple of useful fields in sub_klass:
1138  int ss_offset = in_bytes(Klass::secondary_supers_offset());
1139  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1140  Address secondary_supers_addr(sub_klass, ss_offset);
1141  Address super_cache_addr(     sub_klass, sc_offset);
1142
1143  BLOCK_COMMENT("check_klass_subtype_slow_path");
1144
1145  // Do a linear scan of the secondary super-klass chain.
1146  // This code is rarely used, so simplicity is a virtue here.
1147  // The repne_scan instruction uses fixed registers, which we must spill.
1148  // Don't worry too much about pre-existing connections with the input regs.
1149
1150  assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
1151  assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)
1152
1153  // Get super_klass value into r0 (even if it was in r5 or r2).
1154  RegSet pushed_registers;
1155  if (!IS_A_TEMP(r2))    pushed_registers += r2;
1156  if (!IS_A_TEMP(r5))    pushed_registers += r5;
1157
1158  if (super_klass != r0 || UseCompressedOops) {
1159    if (!IS_A_TEMP(r0))   pushed_registers += r0;
1160  }
1161
1162  push(pushed_registers, sp);
1163
1164#ifndef PRODUCT
1165  mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
1166  Address pst_counter_addr(rscratch2);
1167  ldr(rscratch1, pst_counter_addr);
1168  add(rscratch1, rscratch1, 1);
1169  str(rscratch1, pst_counter_addr);
1170#endif //PRODUCT
1171
1172  // We will consult the secondary-super array.
1173  ldr(r5, secondary_supers_addr);
1174  // Load the array length.
1175  ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
1176  // Skip to start of data.
1177  add(r5, r5, Array<Klass*>::base_offset_in_bytes());
1178
1179  cmp(sp, zr); // Clear Z flag; SP is never zero
1180  // Scan R2 words at [R5] for an occurrence of R0.
1181  // Set NZ/Z based on last compare.
1182  repne_scan(r5, r0, r2, rscratch1);
1183
1184  // Unspill the temp. registers:
1185  pop(pushed_registers, sp);
1186
1187  br(Assembler::NE, *L_failure);
1188
1189  // Success.  Cache the super we found and proceed in triumph.
1190  str(super_klass, super_cache_addr);
1191
1192  if (L_success != &L_fallthrough) {
1193    b(*L_success);
1194  }
1195
1196#undef IS_A_TEMP
1197
1198  bind(L_fallthrough);
1199}
1200
1201
1202void MacroAssembler::verify_oop(Register reg, const char* s) {
1203  if (!VerifyOops) return;
1204
1205  // Pass register number to verify_oop_subroutine
1206  const char* b = NULL;
1207  {
1208    ResourceMark rm;
1209    stringStream ss;
1210    ss.print("verify_oop: %s: %s", reg->name(), s);
1211    b = code_string(ss.as_string());
1212  }
1213  BLOCK_COMMENT("verify_oop {");
1214
1215  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1216  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1217
1218  mov(r0, reg);
1219  mov(rscratch1, (address)b);
1220
1221  // call indirectly to solve generation ordering problem
1222  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1223  ldr(rscratch2, Address(rscratch2));
1224  blr(rscratch2);
1225
1226  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1227  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1228
1229  BLOCK_COMMENT("} verify_oop");
1230}
1231
1232void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
1233  if (!VerifyOops) return;
1234
1235  const char* b = NULL;
1236  {
1237    ResourceMark rm;
1238    stringStream ss;
1239    ss.print("verify_oop_addr: %s", s);
1240    b = code_string(ss.as_string());
1241  }
1242  BLOCK_COMMENT("verify_oop_addr {");
1243
1244  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1245  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1246
1247  // addr may contain sp so we will have to adjust it based on the
1248  // pushes that we just did.
1249  if (addr.uses(sp)) {
1250    lea(r0, addr);
1251    ldr(r0, Address(r0, 4 * wordSize));
1252  } else {
1253    ldr(r0, addr);
1254  }
1255  mov(rscratch1, (address)b);
1256
1257  // call indirectly to solve generation ordering problem
1258  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1259  ldr(rscratch2, Address(rscratch2));
1260  blr(rscratch2);
1261
1262  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1263  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1264
1265  BLOCK_COMMENT("} verify_oop_addr");
1266}
1267
1268Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
1269                                         int extra_slot_offset) {
1270  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1271  int stackElementSize = Interpreter::stackElementSize;
1272  int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
1273#ifdef ASSERT
1274  int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
1275  assert(offset1 - offset == stackElementSize, "correct arithmetic");
1276#endif
1277  if (arg_slot.is_constant()) {
1278    return Address(esp, arg_slot.as_constant() * stackElementSize
1279                   + offset);
1280  } else {
1281    add(rscratch1, esp, arg_slot.as_register(),
1282        ext::uxtx, exact_log2(stackElementSize));
1283    return Address(rscratch1, offset);
1284  }
1285}
1286
1287void MacroAssembler::call_VM_leaf_base(address entry_point,
1288                                       int number_of_arguments,
1289                                       Label *retaddr) {
1290  call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr);
1291}
1292
1293void MacroAssembler::call_VM_leaf_base1(address entry_point,
1294                                        int number_of_gp_arguments,
1295                                        int number_of_fp_arguments,
1296                                        ret_type type,
1297                                        Label *retaddr) {
1298  Label E, L;
1299
1300  stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
1301
1302  // We add 1 to number_of_arguments because the thread in arg0 is
1303  // not counted
1304  mov(rscratch1, entry_point);
1305  blrt(rscratch1, number_of_gp_arguments + 1, number_of_fp_arguments, type);
1306  if (retaddr)
1307    bind(*retaddr);
1308
1309  ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
1310  maybe_isb();
1311}
1312
1313void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1314  call_VM_leaf_base(entry_point, number_of_arguments);
1315}
1316
1317void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1318  pass_arg0(this, arg_0);
1319  call_VM_leaf_base(entry_point, 1);
1320}
1321
1322void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1323  pass_arg0(this, arg_0);
1324  pass_arg1(this, arg_1);
1325  call_VM_leaf_base(entry_point, 2);
1326}
1327
1328void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
1329                                  Register arg_1, Register arg_2) {
1330  pass_arg0(this, arg_0);
1331  pass_arg1(this, arg_1);
1332  pass_arg2(this, arg_2);
1333  call_VM_leaf_base(entry_point, 3);
1334}
1335
1336void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1337  pass_arg0(this, arg_0);
1338  MacroAssembler::call_VM_leaf_base(entry_point, 1);
1339}
1340
1341void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1342
1343  assert(arg_0 != c_rarg1, "smashed arg");
1344  pass_arg1(this, arg_1);
1345  pass_arg0(this, arg_0);
1346  MacroAssembler::call_VM_leaf_base(entry_point, 2);
1347}
1348
1349void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1350  assert(arg_0 != c_rarg2, "smashed arg");
1351  assert(arg_1 != c_rarg2, "smashed arg");
1352  pass_arg2(this, arg_2);
1353  assert(arg_0 != c_rarg1, "smashed arg");
1354  pass_arg1(this, arg_1);
1355  pass_arg0(this, arg_0);
1356  MacroAssembler::call_VM_leaf_base(entry_point, 3);
1357}
1358
1359void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1360  assert(arg_0 != c_rarg3, "smashed arg");
1361  assert(arg_1 != c_rarg3, "smashed arg");
1362  assert(arg_2 != c_rarg3, "smashed arg");
1363  pass_arg3(this, arg_3);
1364  assert(arg_0 != c_rarg2, "smashed arg");
1365  assert(arg_1 != c_rarg2, "smashed arg");
1366  pass_arg2(this, arg_2);
1367  assert(arg_0 != c_rarg1, "smashed arg");
1368  pass_arg1(this, arg_1);
1369  pass_arg0(this, arg_0);
1370  MacroAssembler::call_VM_leaf_base(entry_point, 4);
1371}
1372
1373void MacroAssembler::null_check(Register reg, int offset) {
1374  if (needs_explicit_null_check(offset)) {
1375    // provoke OS NULL exception if reg = NULL by
1376    // accessing M[reg] w/o changing any registers
1377    // NOTE: this is plenty to provoke a segv
1378    ldr(zr, Address(reg));
1379  } else {
1380    // nothing to do, (later) access of M[reg + offset]
1381    // will provoke OS NULL exception if reg = NULL
1382  }
1383}
1384
1385// MacroAssembler protected routines needed to implement
1386// public methods
1387
1388void MacroAssembler::mov(Register r, Address dest) {
1389  code_section()->relocate(pc(), dest.rspec());
1390  u_int64_t imm64 = (u_int64_t)dest.target();
1391  movptr(r, imm64);
1392}
1393
1394// Move a constant pointer into r.  In AArch64 mode the virtual
1395// address space is 48 bits in size, so we only need three
1396// instructions to create a patchable instruction sequence that can
1397// reach anywhere.
1398void MacroAssembler::movptr(Register r, uintptr_t imm64) {
1399#ifndef PRODUCT
1400  {
1401    char buffer[64];
1402    snprintf(buffer, sizeof(buffer), "0x%"PRIX64, imm64);
1403    block_comment(buffer);
1404  }
1405#endif
1406  assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
1407  movz(r, imm64 & 0xffff);
1408  imm64 >>= 16;
1409  movk(r, imm64 & 0xffff, 16);
1410  imm64 >>= 16;
1411  movk(r, imm64 & 0xffff, 32);
1412}
1413
1414// Macro to mov replicated immediate to vector register.
1415//  Vd will get the following values for different arrangements in T
1416//   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
1417//   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
1418//   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
1419//   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
1420//   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
1421//   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
1422//   T1D/T2D: invalid
1423void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
1424  assert(T != T1D && T != T2D, "invalid arrangement");
1425  if (T == T8B || T == T16B) {
1426    assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
1427    movi(Vd, T, imm32 & 0xff, 0);
1428    return;
1429  }
1430  u_int32_t nimm32 = ~imm32;
1431  if (T == T4H || T == T8H) {
1432    assert((imm32  & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
1433    imm32 &= 0xffff;
1434    nimm32 &= 0xffff;
1435  }
1436  u_int32_t x = imm32;
1437  int movi_cnt = 0;
1438  int movn_cnt = 0;
1439  while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
1440  x = nimm32;
1441  while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
1442  if (movn_cnt < movi_cnt) imm32 = nimm32;
1443  unsigned lsl = 0;
1444  while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1445  if (movn_cnt < movi_cnt)
1446    mvni(Vd, T, imm32 & 0xff, lsl);
1447  else
1448    movi(Vd, T, imm32 & 0xff, lsl);
1449  imm32 >>= 8; lsl += 8;
1450  while (imm32) {
1451    while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1452    if (movn_cnt < movi_cnt)
1453      bici(Vd, T, imm32 & 0xff, lsl);
1454    else
1455      orri(Vd, T, imm32 & 0xff, lsl);
1456    lsl += 8; imm32 >>= 8;
1457  }
1458}
1459
1460void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
1461{
1462#ifndef PRODUCT
1463  {
1464    char buffer[64];
1465    snprintf(buffer, sizeof(buffer), "0x%"PRIX64, imm64);
1466    block_comment(buffer);
1467  }
1468#endif
1469  if (operand_valid_for_logical_immediate(false, imm64)) {
1470    orr(dst, zr, imm64);
1471  } else {
1472    // we can use a combination of MOVZ or MOVN with
1473    // MOVK to build up the constant
1474    u_int64_t imm_h[4];
1475    int zero_count = 0;
1476    int neg_count = 0;
1477    int i;
1478    for (i = 0; i < 4; i++) {
1479      imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
1480      if (imm_h[i] == 0) {
1481        zero_count++;
1482      } else if (imm_h[i] == 0xffffL) {
1483        neg_count++;
1484      }
1485    }
1486    if (zero_count == 4) {
1487      // one MOVZ will do
1488      movz(dst, 0);
1489    } else if (neg_count == 4) {
1490      // one MOVN will do
1491      movn(dst, 0);
1492    } else if (zero_count == 3) {
1493      for (i = 0; i < 4; i++) {
1494        if (imm_h[i] != 0L) {
1495          movz(dst, (u_int32_t)imm_h[i], (i << 4));
1496          break;
1497        }
1498      }
1499    } else if (neg_count == 3) {
1500      // one MOVN will do
1501      for (int i = 0; i < 4; i++) {
1502        if (imm_h[i] != 0xffffL) {
1503          movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1504          break;
1505        }
1506      }
1507    } else if (zero_count == 2) {
1508      // one MOVZ and one MOVK will do
1509      for (i = 0; i < 3; i++) {
1510        if (imm_h[i] != 0L) {
1511          movz(dst, (u_int32_t)imm_h[i], (i << 4));
1512          i++;
1513          break;
1514        }
1515      }
1516      for (;i < 4; i++) {
1517        if (imm_h[i] != 0L) {
1518          movk(dst, (u_int32_t)imm_h[i], (i << 4));
1519        }
1520      }
1521    } else if (neg_count == 2) {
1522      // one MOVN and one MOVK will do
1523      for (i = 0; i < 4; i++) {
1524        if (imm_h[i] != 0xffffL) {
1525          movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1526          i++;
1527          break;
1528        }
1529      }
1530      for (;i < 4; i++) {
1531        if (imm_h[i] != 0xffffL) {
1532          movk(dst, (u_int32_t)imm_h[i], (i << 4));
1533        }
1534      }
1535    } else if (zero_count == 1) {
1536      // one MOVZ and two MOVKs will do
1537      for (i = 0; i < 4; i++) {
1538        if (imm_h[i] != 0L) {
1539          movz(dst, (u_int32_t)imm_h[i], (i << 4));
1540          i++;
1541          break;
1542        }
1543      }
1544      for (;i < 4; i++) {
1545        if (imm_h[i] != 0x0L) {
1546          movk(dst, (u_int32_t)imm_h[i], (i << 4));
1547        }
1548      }
1549    } else if (neg_count == 1) {
1550      // one MOVN and two MOVKs will do
1551      for (i = 0; i < 4; i++) {
1552        if (imm_h[i] != 0xffffL) {
1553          movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1554          i++;
1555          break;
1556        }
1557      }
1558      for (;i < 4; i++) {
1559        if (imm_h[i] != 0xffffL) {
1560          movk(dst, (u_int32_t)imm_h[i], (i << 4));
1561        }
1562      }
1563    } else {
1564      // use a MOVZ and 3 MOVKs (makes it easier to debug)
1565      movz(dst, (u_int32_t)imm_h[0], 0);
1566      for (i = 1; i < 4; i++) {
1567        movk(dst, (u_int32_t)imm_h[i], (i << 4));
1568      }
1569    }
1570  }
1571}
1572
1573void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
1574{
1575#ifndef PRODUCT
1576    {
1577      char buffer[64];
1578      snprintf(buffer, sizeof(buffer), "0x%"PRIX32, imm32);
1579      block_comment(buffer);
1580    }
1581#endif
1582  if (operand_valid_for_logical_immediate(true, imm32)) {
1583    orrw(dst, zr, imm32);
1584  } else {
1585    // we can use MOVZ, MOVN or two calls to MOVK to build up the
1586    // constant
1587    u_int32_t imm_h[2];
1588    imm_h[0] = imm32 & 0xffff;
1589    imm_h[1] = ((imm32 >> 16) & 0xffff);
1590    if (imm_h[0] == 0) {
1591      movzw(dst, imm_h[1], 16);
1592    } else if (imm_h[0] == 0xffff) {
1593      movnw(dst, imm_h[1] ^ 0xffff, 16);
1594    } else if (imm_h[1] == 0) {
1595      movzw(dst, imm_h[0], 0);
1596    } else if (imm_h[1] == 0xffff) {
1597      movnw(dst, imm_h[0] ^ 0xffff, 0);
1598    } else {
1599      // use a MOVZ and MOVK (makes it easier to debug)
1600      movzw(dst, imm_h[0], 0);
1601      movkw(dst, imm_h[1], 16);
1602    }
1603  }
1604}
1605
1606// Form an address from base + offset in Rd.  Rd may or may
1607// not actually be used: you must use the Address that is returned.
1608// It is up to you to ensure that the shift provided matches the size
1609// of your data.
1610Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
1611  if (Address::offset_ok_for_immed(byte_offset, shift))
1612    // It fits; no need for any heroics
1613    return Address(base, byte_offset);
1614
1615  // Don't do anything clever with negative or misaligned offsets
1616  unsigned mask = (1 << shift) - 1;
1617  if (byte_offset < 0 || byte_offset & mask) {
1618    mov(Rd, byte_offset);
1619    add(Rd, base, Rd);
1620    return Address(Rd);
1621  }
1622
1623  // See if we can do this with two 12-bit offsets
1624  {
1625    unsigned long word_offset = byte_offset >> shift;
1626    unsigned long masked_offset = word_offset & 0xfff000;
1627    if (Address::offset_ok_for_immed(word_offset - masked_offset)
1628        && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
1629      add(Rd, base, masked_offset << shift);
1630      word_offset -= masked_offset;
1631      return Address(Rd, word_offset << shift);
1632    }
1633  }
1634
1635  // Do it the hard way
1636  mov(Rd, byte_offset);
1637  add(Rd, base, Rd);
1638  return Address(Rd);
1639}
1640
1641void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
1642  Label retry_load;
1643  bind(retry_load);
1644  // flush and load exclusive from the memory location
1645  ldxrw(tmp, counter_addr);
1646  addw(tmp, tmp, 1);
1647  // if we store+flush with no intervening write tmp wil be zero
1648  stxrw(tmp2, tmp, counter_addr);
1649  cbnzw(tmp2, retry_load);
1650}
1651
1652
1653int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
1654                                    bool want_remainder, Register scratch)
1655{
1656  // Full implementation of Java idiv and irem.  The function
1657  // returns the (pc) offset of the div instruction - may be needed
1658  // for implicit exceptions.
1659  //
1660  // constraint : ra/rb =/= scratch
1661  //         normal case
1662  //
1663  // input : ra: dividend
1664  //         rb: divisor
1665  //
1666  // result: either
1667  //         quotient  (= ra idiv rb)
1668  //         remainder (= ra irem rb)
1669
1670  assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1671
1672  int idivl_offset = offset();
1673  if (! want_remainder) {
1674    sdivw(result, ra, rb);
1675  } else {
1676    sdivw(scratch, ra, rb);
1677    Assembler::msubw(result, scratch, rb, ra);
1678  }
1679
1680  return idivl_offset;
1681}
1682
1683int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
1684                                    bool want_remainder, Register scratch)
1685{
1686  // Full implementation of Java ldiv and lrem.  The function
1687  // returns the (pc) offset of the div instruction - may be needed
1688  // for implicit exceptions.
1689  //
1690  // constraint : ra/rb =/= scratch
1691  //         normal case
1692  //
1693  // input : ra: dividend
1694  //         rb: divisor
1695  //
1696  // result: either
1697  //         quotient  (= ra idiv rb)
1698  //         remainder (= ra irem rb)
1699
1700  assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1701
1702  int idivq_offset = offset();
1703  if (! want_remainder) {
1704    sdiv(result, ra, rb);
1705  } else {
1706    sdiv(scratch, ra, rb);
1707    Assembler::msub(result, scratch, rb, ra);
1708  }
1709
1710  return idivq_offset;
1711}
1712
1713void MacroAssembler::membar(Membar_mask_bits order_constraint) {
1714  address prev = pc() - NativeMembar::instruction_size;
1715  if (prev == code()->last_membar()) {
1716    NativeMembar *bar = NativeMembar_at(prev);
1717    // We are merging two memory barrier instructions.  On AArch64 we
1718    // can do this simply by ORing them together.
1719    bar->set_kind(bar->get_kind() | order_constraint);
1720    BLOCK_COMMENT("merged membar");
1721  } else {
1722    code()->set_last_membar(pc());
1723    dmb(Assembler::barrier(order_constraint));
1724  }
1725}
1726
1727// MacroAssembler routines found actually to be needed
1728
1729void MacroAssembler::push(Register src)
1730{
1731  str(src, Address(pre(esp, -1 * wordSize)));
1732}
1733
1734void MacroAssembler::pop(Register dst)
1735{
1736  ldr(dst, Address(post(esp, 1 * wordSize)));
1737}
1738
1739// Note: load_unsigned_short used to be called load_unsigned_word.
1740int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1741  int off = offset();
1742  ldrh(dst, src);
1743  return off;
1744}
1745
1746int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1747  int off = offset();
1748  ldrb(dst, src);
1749  return off;
1750}
1751
1752int MacroAssembler::load_signed_short(Register dst, Address src) {
1753  int off = offset();
1754  ldrsh(dst, src);
1755  return off;
1756}
1757
1758int MacroAssembler::load_signed_byte(Register dst, Address src) {
1759  int off = offset();
1760  ldrsb(dst, src);
1761  return off;
1762}
1763
1764int MacroAssembler::load_signed_short32(Register dst, Address src) {
1765  int off = offset();
1766  ldrshw(dst, src);
1767  return off;
1768}
1769
1770int MacroAssembler::load_signed_byte32(Register dst, Address src) {
1771  int off = offset();
1772  ldrsbw(dst, src);
1773  return off;
1774}
1775
1776void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1777  switch (size_in_bytes) {
1778  case  8:  ldr(dst, src); break;
1779  case  4:  ldrw(dst, src); break;
1780  case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1781  case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1782  default:  ShouldNotReachHere();
1783  }
1784}
1785
1786void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1787  switch (size_in_bytes) {
1788  case  8:  str(src, dst); break;
1789  case  4:  strw(src, dst); break;
1790  case  2:  strh(src, dst); break;
1791  case  1:  strb(src, dst); break;
1792  default:  ShouldNotReachHere();
1793  }
1794}
1795
1796void MacroAssembler::decrementw(Register reg, int value)
1797{
1798  if (value < 0)  { incrementw(reg, -value);      return; }
1799  if (value == 0) {                               return; }
1800  if (value < (1 << 12)) { subw(reg, reg, value); return; }
1801  /* else */ {
1802    guarantee(reg != rscratch2, "invalid dst for register decrement");
1803    movw(rscratch2, (unsigned)value);
1804    subw(reg, reg, rscratch2);
1805  }
1806}
1807
1808void MacroAssembler::decrement(Register reg, int value)
1809{
1810  if (value < 0)  { increment(reg, -value);      return; }
1811  if (value == 0) {                              return; }
1812  if (value < (1 << 12)) { sub(reg, reg, value); return; }
1813  /* else */ {
1814    assert(reg != rscratch2, "invalid dst for register decrement");
1815    mov(rscratch2, (unsigned long)value);
1816    sub(reg, reg, rscratch2);
1817  }
1818}
1819
1820void MacroAssembler::decrementw(Address dst, int value)
1821{
1822  assert(!dst.uses(rscratch1), "invalid dst for address decrement");
1823  ldrw(rscratch1, dst);
1824  decrementw(rscratch1, value);
1825  strw(rscratch1, dst);
1826}
1827
1828void MacroAssembler::decrement(Address dst, int value)
1829{
1830  assert(!dst.uses(rscratch1), "invalid address for decrement");
1831  ldr(rscratch1, dst);
1832  decrement(rscratch1, value);
1833  str(rscratch1, dst);
1834}
1835
1836void MacroAssembler::incrementw(Register reg, int value)
1837{
1838  if (value < 0)  { decrementw(reg, -value);      return; }
1839  if (value == 0) {                               return; }
1840  if (value < (1 << 12)) { addw(reg, reg, value); return; }
1841  /* else */ {
1842    assert(reg != rscratch2, "invalid dst for register increment");
1843    movw(rscratch2, (unsigned)value);
1844    addw(reg, reg, rscratch2);
1845  }
1846}
1847
1848void MacroAssembler::increment(Register reg, int value)
1849{
1850  if (value < 0)  { decrement(reg, -value);      return; }
1851  if (value == 0) {                              return; }
1852  if (value < (1 << 12)) { add(reg, reg, value); return; }
1853  /* else */ {
1854    assert(reg != rscratch2, "invalid dst for register increment");
1855    movw(rscratch2, (unsigned)value);
1856    add(reg, reg, rscratch2);
1857  }
1858}
1859
1860void MacroAssembler::incrementw(Address dst, int value)
1861{
1862  assert(!dst.uses(rscratch1), "invalid dst for address increment");
1863  ldrw(rscratch1, dst);
1864  incrementw(rscratch1, value);
1865  strw(rscratch1, dst);
1866}
1867
1868void MacroAssembler::increment(Address dst, int value)
1869{
1870  assert(!dst.uses(rscratch1), "invalid dst for address increment");
1871  ldr(rscratch1, dst);
1872  increment(rscratch1, value);
1873  str(rscratch1, dst);
1874}
1875
1876
1877void MacroAssembler::pusha() {
1878  push(0x7fffffff, sp);
1879}
1880
1881void MacroAssembler::popa() {
1882  pop(0x7fffffff, sp);
1883}
1884
1885// Push lots of registers in the bit set supplied.  Don't push sp.
1886// Return the number of words pushed
1887int MacroAssembler::push(unsigned int bitset, Register stack) {
1888  int words_pushed = 0;
1889
1890  // Scan bitset to accumulate register pairs
1891  unsigned char regs[32];
1892  int count = 0;
1893  for (int reg = 0; reg <= 30; reg++) {
1894    if (1 & bitset)
1895      regs[count++] = reg;
1896    bitset >>= 1;
1897  }
1898  regs[count++] = zr->encoding_nocheck();
1899  count &= ~1;  // Only push an even nuber of regs
1900
1901  if (count) {
1902    stp(as_Register(regs[0]), as_Register(regs[1]),
1903       Address(pre(stack, -count * wordSize)));
1904    words_pushed += 2;
1905  }
1906  for (int i = 2; i < count; i += 2) {
1907    stp(as_Register(regs[i]), as_Register(regs[i+1]),
1908       Address(stack, i * wordSize));
1909    words_pushed += 2;
1910  }
1911
1912  assert(words_pushed == count, "oops, pushed != count");
1913
1914  return count;
1915}
1916
1917int MacroAssembler::pop(unsigned int bitset, Register stack) {
1918  int words_pushed = 0;
1919
1920  // Scan bitset to accumulate register pairs
1921  unsigned char regs[32];
1922  int count = 0;
1923  for (int reg = 0; reg <= 30; reg++) {
1924    if (1 & bitset)
1925      regs[count++] = reg;
1926    bitset >>= 1;
1927  }
1928  regs[count++] = zr->encoding_nocheck();
1929  count &= ~1;
1930
1931  for (int i = 2; i < count; i += 2) {
1932    ldp(as_Register(regs[i]), as_Register(regs[i+1]),
1933       Address(stack, i * wordSize));
1934    words_pushed += 2;
1935  }
1936  if (count) {
1937    ldp(as_Register(regs[0]), as_Register(regs[1]),
1938       Address(post(stack, count * wordSize)));
1939    words_pushed += 2;
1940  }
1941
1942  assert(words_pushed == count, "oops, pushed != count");
1943
1944  return count;
1945}
#ifdef ASSERT
// Debug-only check of rheapbase.  The body is currently compiled out
// with "#if 0", so this function emits nothing.
void MacroAssembler::verify_heapbase(const char* msg) {
#if 0
  assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  if (CheckCompressedOops) {
    Label heapbase_ok;
    // cmpptr trashes rscratch1, so preserve it around the check
    push(1 << rscratch1->encoding(), sp);
    cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
    br(Assembler::EQ, heapbase_ok);
    stop(msg);
    bind(heapbase_ok);
    pop(1 << rscratch1->encoding(), sp);
  }
#endif
}
#endif
1963
1964void MacroAssembler::stop(const char* msg) {
1965  address ip = pc();
1966  pusha();
1967  mov(c_rarg0, (address)msg);
1968  mov(c_rarg1, (address)ip);
1969  mov(c_rarg2, sp);
1970  mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
1971  // call(c_rarg3);
1972  blrt(c_rarg3, 3, 0, 1);
1973  hlt(0);
1974}
1975
1976// If a constant does not fit in an immediate field, generate some
1977// number of MOV instructions and then perform the operation.
1978void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
1979                                           add_sub_imm_insn insn1,
1980                                           add_sub_reg_insn insn2) {
1981  assert(Rd != zr, "Rd = zr and not setting flags?");
1982  if (operand_valid_for_add_sub_immediate((int)imm)) {
1983    (this->*insn1)(Rd, Rn, imm);
1984  } else {
1985    if (uabs(imm) < (1 << 24)) {
1986       (this->*insn1)(Rd, Rn, imm & -(1 << 12));
1987       (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
1988    } else {
1989       assert_different_registers(Rd, Rn);
1990       mov(Rd, (uint64_t)imm);
1991       (this->*insn2)(Rd, Rn, Rd, LSL, 0);
1992    }
1993  }
1994}
1995
1996// Seperate vsn which sets the flags. Optimisations are more restricted
1997// because we must set the flags correctly.
1998void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
1999                                           add_sub_imm_insn insn1,
2000                                           add_sub_reg_insn insn2) {
2001  if (operand_valid_for_add_sub_immediate((int)imm)) {
2002    (this->*insn1)(Rd, Rn, imm);
2003  } else {
2004    assert_different_registers(Rd, Rn);
2005    assert(Rd != zr, "overflow in immediate operand");
2006    mov(Rd, (uint64_t)imm);
2007    (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2008  }
2009}
2010
2011
2012void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
2013  if (increment.is_register()) {
2014    add(Rd, Rn, increment.as_register());
2015  } else {
2016    add(Rd, Rn, increment.as_constant());
2017  }
2018}
2019
2020void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
2021  if (increment.is_register()) {
2022    addw(Rd, Rn, increment.as_register());
2023  } else {
2024    addw(Rd, Rn, increment.as_constant());
2025  }
2026}
2027
2028void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
2029  if (decrement.is_register()) {
2030    sub(Rd, Rn, decrement.as_register());
2031  } else {
2032    sub(Rd, Rn, decrement.as_constant());
2033  }
2034}
2035
2036void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
2037  if (decrement.is_register()) {
2038    subw(Rd, Rn, decrement.as_register());
2039  } else {
2040    subw(Rd, Rn, decrement.as_constant());
2041  }
2042}
2043
2044void MacroAssembler::reinit_heapbase()
2045{
2046  if (UseCompressedOops) {
2047    if (Universe::is_fully_initialized()) {
2048      mov(rheapbase, Universe::narrow_ptrs_base());
2049    } else {
2050      lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2051      ldr(rheapbase, Address(rheapbase));
2052    }
2053  }
2054}
2055
2056// this simulates the behaviour of the x86 cmpxchg instruction using a
2057// load linked/store conditional pair. we use the acquire/release
2058// versions of these instructions so that we flush pending writes as
2059// per Java semantics.
2060
2061// n.b the x86 version assumes the old value to be compared against is
2062// in rax and updates rax with the value located in memory if the
2063// cmpxchg fails. we supply a register for the old value explicitly
2064
2065// the aarch64 load linked/store conditional instructions do not
2066// accept an offset. so, unlike x86, we must provide a plain register
2067// to identify the memory word to be compared/exchanged rather than a
2068// register+offset Address.
2069
2070void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2071                                Label &succeed, Label *fail) {
2072  // oldv holds comparison value
2073  // newv holds value to write in exchange
2074  // addr identifies memory word to compare against/update
2075  // tmp returns 0/1 for success/failure
2076  Label retry_load, nope;
2077
2078  bind(retry_load);
2079  // flush and load exclusive from the memory location
2080  // and fail if it is not what we expect
2081  ldaxr(tmp, addr);
2082  cmp(tmp, oldv);
2083  br(Assembler::NE, nope);
2084  // if we store+flush with no intervening write tmp wil be zero
2085  stlxr(tmp, newv, addr);
2086  cbzw(tmp, succeed);
2087  // retry so we only ever return after a load fails to compare
2088  // ensures we don't return a stale value after a failed write.
2089  b(retry_load);
2090  // if the memory word differs we return it in oldv and signal a fail
2091  bind(nope);
2092  membar(AnyAny);
2093  mov(oldv, tmp);
2094  if (fail)
2095    b(*fail);
2096}
2097
2098void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
2099                                Label &succeed, Label *fail) {
2100  // oldv holds comparison value
2101  // newv holds value to write in exchange
2102  // addr identifies memory word to compare against/update
2103  // tmp returns 0/1 for success/failure
2104  Label retry_load, nope;
2105
2106  bind(retry_load);
2107  // flush and load exclusive from the memory location
2108  // and fail if it is not what we expect
2109  ldaxrw(tmp, addr);
2110  cmp(tmp, oldv);
2111  br(Assembler::NE, nope);
2112  // if we store+flush with no intervening write tmp wil be zero
2113  stlxrw(tmp, newv, addr);
2114  cbzw(tmp, succeed);
2115  // retry so we only ever return after a load fails to compare
2116  // ensures we don't return a stale value after a failed write.
2117  b(retry_load);
2118  // if the memory word differs we return it in oldv and signal a fail
2119  bind(nope);
2120  membar(AnyAny);
2121  mov(oldv, tmp);
2122  if (fail)
2123    b(*fail);
2124}
2125
2126static bool different(Register a, RegisterOrConstant b, Register c) {
2127  if (b.is_constant())
2128    return a != c;
2129  else
2130    return a != b.as_register() && a != c && b.as_register() != c;
2131}
2132
2133#define ATOMIC_OP(LDXR, OP, IOP, STXR)                                       \
2134void MacroAssembler::atomic_##OP(Register prev, RegisterOrConstant incr, Register addr) { \
2135  Register result = rscratch2;                                          \
2136  if (prev->is_valid())                                                 \
2137    result = different(prev, incr, addr) ? prev : rscratch2;            \
2138                                                                        \
2139  Label retry_load;                                                     \
2140  bind(retry_load);                                                     \
2141  LDXR(result, addr);                                                   \
2142  OP(rscratch1, result, incr);                                          \
2143  STXR(rscratch2, rscratch1, addr);                                     \
2144  cbnzw(rscratch2, retry_load);                                         \
2145  if (prev->is_valid() && prev != result) {                             \
2146    IOP(prev, rscratch1, incr);                                         \
2147  }                                                                     \
2148}
2149
2150ATOMIC_OP(ldxr, add, sub, stxr)
2151ATOMIC_OP(ldxrw, addw, subw, stxrw)
2152
2153#undef ATOMIC_OP
2154
2155#define ATOMIC_XCHG(OP, LDXR, STXR)                                     \
2156void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
2157  Register result = rscratch2;                                          \
2158  if (prev->is_valid())                                                 \
2159    result = different(prev, newv, addr) ? prev : rscratch2;            \
2160                                                                        \
2161  Label retry_load;                                                     \
2162  bind(retry_load);                                                     \
2163  LDXR(result, addr);                                                   \
2164  STXR(rscratch1, newv, addr);                                          \
2165  cbnzw(rscratch1, retry_load);                                         \
2166  if (prev->is_valid() && prev != result)                               \
2167    mov(prev, result);                                                  \
2168}
2169
2170ATOMIC_XCHG(xchg, ldxr, stxr)
2171ATOMIC_XCHG(xchgw, ldxrw, stxrw)
2172
2173#undef ATOMIC_XCHG
2174
2175void MacroAssembler::incr_allocated_bytes(Register thread,
2176                                          Register var_size_in_bytes,
2177                                          int con_size_in_bytes,
2178                                          Register t1) {
2179  if (!thread->is_valid()) {
2180    thread = rthread;
2181  }
2182  assert(t1->is_valid(), "need temp reg");
2183
2184  ldr(t1, Address(thread, in_bytes(JavaThread::allocated_bytes_offset())));
2185  if (var_size_in_bytes->is_valid()) {
2186    add(t1, t1, var_size_in_bytes);
2187  } else {
2188    add(t1, t1, con_size_in_bytes);
2189  }
2190  str(t1, Address(thread, in_bytes(JavaThread::allocated_bytes_offset())));
2191}
2192
2193#ifndef PRODUCT
2194extern "C" void findpc(intptr_t x);
2195#endif
2196
2197void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
2198{
2199  // In order to get locks to work, we need to fake a in_VM state
2200  if (ShowMessageBoxOnError ) {
2201    JavaThread* thread = JavaThread::current();
2202    JavaThreadState saved_state = thread->thread_state();
2203    thread->set_thread_state(_thread_in_vm);
2204#ifndef PRODUCT
2205    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
2206      ttyLocker ttyl;
2207      BytecodeCounter::print();
2208    }
2209#endif
2210    if (os::message_box(msg, "Execution stopped, print registers?")) {
2211      ttyLocker ttyl;
2212      tty->print_cr(" pc = 0x%016lx", pc);
2213#ifndef PRODUCT
2214      tty->cr();
2215      findpc(pc);
2216      tty->cr();
2217#endif
2218      tty->print_cr(" r0 = 0x%016lx", regs[0]);
2219      tty->print_cr(" r1 = 0x%016lx", regs[1]);
2220      tty->print_cr(" r2 = 0x%016lx", regs[2]);
2221      tty->print_cr(" r3 = 0x%016lx", regs[3]);
2222      tty->print_cr(" r4 = 0x%016lx", regs[4]);
2223      tty->print_cr(" r5 = 0x%016lx", regs[5]);
2224      tty->print_cr(" r6 = 0x%016lx", regs[6]);
2225      tty->print_cr(" r7 = 0x%016lx", regs[7]);
2226      tty->print_cr(" r8 = 0x%016lx", regs[8]);
2227      tty->print_cr(" r9 = 0x%016lx", regs[9]);
2228      tty->print_cr("r10 = 0x%016lx", regs[10]);
2229      tty->print_cr("r11 = 0x%016lx", regs[11]);
2230      tty->print_cr("r12 = 0x%016lx", regs[12]);
2231      tty->print_cr("r13 = 0x%016lx", regs[13]);
2232      tty->print_cr("r14 = 0x%016lx", regs[14]);
2233      tty->print_cr("r15 = 0x%016lx", regs[15]);
2234      tty->print_cr("r16 = 0x%016lx", regs[16]);
2235      tty->print_cr("r17 = 0x%016lx", regs[17]);
2236      tty->print_cr("r18 = 0x%016lx", regs[18]);
2237      tty->print_cr("r19 = 0x%016lx", regs[19]);
2238      tty->print_cr("r20 = 0x%016lx", regs[20]);
2239      tty->print_cr("r21 = 0x%016lx", regs[21]);
2240      tty->print_cr("r22 = 0x%016lx", regs[22]);
2241      tty->print_cr("r23 = 0x%016lx", regs[23]);
2242      tty->print_cr("r24 = 0x%016lx", regs[24]);
2243      tty->print_cr("r25 = 0x%016lx", regs[25]);
2244      tty->print_cr("r26 = 0x%016lx", regs[26]);
2245      tty->print_cr("r27 = 0x%016lx", regs[27]);
2246      tty->print_cr("r28 = 0x%016lx", regs[28]);
2247      tty->print_cr("r30 = 0x%016lx", regs[30]);
2248      tty->print_cr("r31 = 0x%016lx", regs[31]);
2249      BREAKPOINT;
2250    }
2251    ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
2252  } else {
2253    ttyLocker ttyl;
2254    ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
2255                    msg);
2256    assert(false, "DEBUG MESSAGE: %s", msg);
2257  }
2258}
2259
#ifdef BUILTIN_SIM
// Generates an x86 prolog for a stub function which bootstraps into the
// generated ARM code that directly follows the stub.
//
// The arguments encode the number of general and fp registers passed by
// the caller and the calling convention (currently just the number of
// general registers, assuming C argument passing).

extern "C" {
int aarch64_stub_prolog_size();
void aarch64_stub_prolog();
void aarch64_prolog();
}

void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type,
                                   address *prolog_ptr)
{
  // Pack return type and argument counts into a single call-format word.
  const int ret_bits = (ret_type & 0x3) << 8;
  const int fp_bits  = (fp_arg_count & 0xf) << 4;
  const int gp_bits  = gp_arg_count & 0xf;
  int call_format = ret_bits | fp_bits | gp_bits;

  // Reserve room for the x86-to-ARM entry code by emitting nops ...
  address prolog_start = pc();
  int prolog_bytes = aarch64_stub_prolog_size();
  int nop_count = (prolog_bytes + 3) / 4;
  for (int n = 0; n < nop_count; n++) {
    nop();
  }

  // ... and then overwrite them with the prolog template.
  memcpy(prolog_start, (void*)aarch64_stub_prolog, prolog_bytes);

  // The last two 64-bit words of the copied code receive the address of
  // the setup routine (only when one was supplied) and the call format.
  u_int64_t *prolog_end = (u_int64_t *)(prolog_start + prolog_bytes);
  if (prolog_ptr != NULL) {
    prolog_end[-2] = (u_int64_t)prolog_ptr;
  }
  prolog_end[-1] = call_format;
}
#endif
2303
2304void MacroAssembler::push_CPU_state(bool save_vectors) {
2305  push(0x3fffffff, sp);         // integer registers except lr & sp
2306
2307  if (!save_vectors) {
2308    for (int i = 30; i >= 0; i -= 2)
2309      stpd(as_FloatRegister(i), as_FloatRegister(i+1),
2310           Address(pre(sp, -2 * wordSize)));
2311  } else {
2312    for (int i = 30; i >= 0; i -= 2)
2313      stpq(as_FloatRegister(i), as_FloatRegister(i+1),
2314           Address(pre(sp, -4 * wordSize)));
2315  }
2316}
2317
2318void MacroAssembler::pop_CPU_state(bool restore_vectors) {
2319  if (!restore_vectors) {
2320    for (int i = 0; i < 32; i += 2)
2321      ldpd(as_FloatRegister(i), as_FloatRegister(i+1),
2322           Address(post(sp, 2 * wordSize)));
2323  } else {
2324    for (int i = 0; i < 32; i += 2)
2325      ldpq(as_FloatRegister(i), as_FloatRegister(i+1),
2326           Address(post(sp, 4 * wordSize)));
2327  }
2328
2329  pop(0x3fffffff, sp);         // integer registers except lr & sp
2330}
2331
2332/**
2333 * Helpers for multiply_to_len().
2334 */
2335void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
2336                                     Register src1, Register src2) {
2337  adds(dest_lo, dest_lo, src1);
2338  adc(dest_hi, dest_hi, zr);
2339  adds(dest_lo, dest_lo, src2);
2340  adc(final_dest_hi, dest_hi, zr);
2341}
2342
2343// Generate an address from (r + r1 extend offset).  "size" is the
2344// size of the operand.  The result may be in rscratch2.
2345Address MacroAssembler::offsetted_address(Register r, Register r1,
2346                                          Address::extend ext, int offset, int size) {
2347  if (offset || (ext.shift() % size != 0)) {
2348    lea(rscratch2, Address(r, r1, ext));
2349    return Address(rscratch2, offset);
2350  } else {
2351    return Address(r, r1, ext);
2352  }
2353}
2354
2355Address MacroAssembler::spill_address(int size, int offset, Register tmp)
2356{
2357  assert(offset >= 0, "spill to negative address?");
2358  // Offset reachable ?
2359  //   Not aligned - 9 bits signed offset
2360  //   Aligned - 12 bits unsigned offset shifted
2361  Register base = sp;
2362  if ((offset & (size-1)) && offset >= (1<<8)) {
2363    add(tmp, base, offset & ((1<<12)-1));
2364    base = tmp;
2365    offset &= -1<<12;
2366  }
2367
2368  if (offset >= (1<<12) * size) {
2369    add(tmp, base, offset & (((1<<12)-1)<<12));
2370    base = tmp;
2371    offset &= ~(((1<<12)-1)<<12);
2372  }
2373
2374  return Address(base, offset);
2375}
2376
2377/**
2378 * Multiply 64 bit by 64 bit first loop.
2379 */
2380void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
2381                                           Register y, Register y_idx, Register z,
2382                                           Register carry, Register product,
2383                                           Register idx, Register kdx) {
2384  //
2385  //  jlong carry, x[], y[], z[];
2386  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) {
2387  //    huge_128 product = y[idx] * x[xstart] + carry;
2388  //    z[kdx] = (jlong)product;
2389  //    carry  = (jlong)(product >>> 64);
2390  //  }
2391  //  z[xstart] = carry;
2392  //
2393
2394  Label L_first_loop, L_first_loop_exit;
2395  Label L_one_x, L_one_y, L_multiply;
2396
2397  subsw(xstart, xstart, 1);
2398  br(Assembler::MI, L_one_x);
2399
2400  lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
2401  ldr(x_xstart, Address(rscratch1));
2402  ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
2403
2404  bind(L_first_loop);
2405  subsw(idx, idx, 1);
2406  br(Assembler::MI, L_first_loop_exit);
2407  subsw(idx, idx, 1);
2408  br(Assembler::MI, L_one_y);
2409  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2410  ldr(y_idx, Address(rscratch1));
2411  ror(y_idx, y_idx, 32); // convert big-endian to little-endian
2412  bind(L_multiply);
2413
2414  // AArch64 has a multiply-accumulate instruction that we can't use
2415  // here because it has no way to process carries, so we have to use
2416  // separate add and adc instructions.  Bah.
2417  umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
2418  mul(product, x_xstart, y_idx);
2419  adds(product, product, carry);
2420  adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product
2421
2422  subw(kdx, kdx, 2);
2423  ror(product, product, 32); // back to big-endian
2424  str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));
2425
2426  b(L_first_loop);
2427
2428  bind(L_one_y);
2429  ldrw(y_idx, Address(y,  0));
2430  b(L_multiply);
2431
2432  bind(L_one_x);
2433  ldrw(x_xstart, Address(x,  0));
2434  b(L_first_loop);
2435
2436  bind(L_first_loop_exit);
2437}
2438
2439/**
2440 * Multiply 128 bit by 128. Unrolled inner loop.
2441 *
 * Emits the inner ("third") loop sketched in the pseudo-code below: y and z
 * are walked downwards from int index idx, four ints (two 64-bit words) per
 * iteration.  Each y word is multiplied by product_hi, the matching z word
 * and the running carry are added, and the result is stored back into z.
 * A tail handles the remaining 0-3 ints (one 64-bit word and/or one int).
 *
 * y, z        base registers used to address the two arrays
 * carry       in/out: running carry
 * idx         in: int index to start from; modified
 * jdx, carry2, yz_idx1, yz_idx2, tmp, tmp3, tmp4, tmp6: overwritten
 * product_hi  multiplier word; only read here
 *
 * rscratch1 and rscratch2 are overwritten as well.
2442 */
2443void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
2444                                             Register carry, Register carry2,
2445                                             Register idx, Register jdx,
2446                                             Register yz_idx1, Register yz_idx2,
2447                                             Register tmp, Register tmp3, Register tmp4,
2448                                             Register tmp6, Register product_hi) {
2449
2450  //   jlong carry, x[], y[], z[];
2451  //   int kdx = ystart+1;
2452  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
2453  //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
2454  //     jlong carry2  = (jlong)(tmp3 >>> 64);
2455  //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
2456  //     carry  = (jlong)(tmp4 >>> 64);
2457  //     z[kdx+idx+1] = (jlong)tmp3;
2458  //     z[kdx+idx] = (jlong)tmp4;
2459  //   }
2460  //   idx += 2;
2461  //   if (idx > 0) {
2462  //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
2463  //     z[kdx+idx] = (jlong)yz_idx1;
2464  //     carry  = (jlong)(yz_idx1 >>> 64);
2465  //   }
2466  //
2467
2468  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
2469
  // jdx = idx / 4: trip count of the unrolled loop (four ints per trip).
2470  lsrw(jdx, idx, 2);
2471
2472  bind(L_third_loop);
2473
2474  subsw(jdx, jdx, 1);
2475  br(Assembler::MI, L_third_loop_exit);
2476  subw(idx, idx, 4);
2477
  // rscratch1 = &y[idx]; the pair load fetches two adjacent 64-bit words.
2478  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2479
2480  ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));
2481
  // tmp6 = &z[idx]; kept for the store at the end of the iteration.
2482  lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2483
2484  ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
2485  ror(yz_idx2, yz_idx2, 32);
2486
  // rscratch2 / rscratch1 = the two 64-bit words of z at the same index.
2487  ldp(rscratch2, rscratch1, Address(tmp6, 0));
2488
2489  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2490  umulh(tmp4, product_hi, yz_idx1);
2491
2492  ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
2493  ror(rscratch2, rscratch2, 32);
2494
2495  mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
2496  umulh(carry2, product_hi, yz_idx2);
2497
2498  // propagate sum of both multiplications into carry:tmp4:tmp3
  // The order of the adds/adc/adcs below matters: each adc consumes the
  // carry flag produced by the instruction immediately before it.
2499  adds(tmp3, tmp3, carry);
2500  adc(tmp4, tmp4, zr);
2501  adds(tmp3, tmp3, rscratch1);
2502  adcs(tmp4, tmp4, tmp);
2503  adc(carry, carry2, zr);
2504  adds(tmp4, tmp4, rscratch2);
2505  adc(carry, carry, zr);
2506
2507  ror(tmp3, tmp3, 32); // convert little-endian to big-endian
2508  ror(tmp4, tmp4, 32);
2509  stp(tmp4, tmp3, Address(tmp6, 0));
2510
2511  b(L_third_loop);
2512  bind (L_third_loop_exit);
2513
  // Tail: 0-3 ints are left (idx mod 4).
2514  andw (idx, idx, 0x3);
2515  cbz(idx, L_post_third_loop_done);
2516
  // If at least two ints remain, process one more 64-bit word at the
  // decremented idx; otherwise idx goes negative and we fall to L_check_1.
2517  Label L_check_1;
2518  subsw(idx, idx, 2);
2519  br(Assembler::MI, L_check_1);
2520
2521  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2522  ldr(yz_idx1, Address(rscratch1, 0));
2523  ror(yz_idx1, yz_idx1, 32);
2524  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2525  umulh(tmp4, product_hi, yz_idx1);
2526  lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2527  ldr(yz_idx2, Address(rscratch1, 0));
2528  ror(yz_idx2, yz_idx2, 32);
2529
  // NOTE(review): add2_with_carry is defined elsewhere; presumably it adds
  // carry and yz_idx2 into tmp4:tmp3 and leaves the high word in carry.
2530  add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);
2531
2532  ror(tmp3, tmp3, 32);
2533  str(tmp3, Address(rscratch1, 0));
2534
2535  bind (L_check_1);
2536
  // Bit 0 of idx says whether a single int is still left; after the
  // subtract idx is 0 (process y[0]/z[0]) or negative (done).
2537  andw (idx, idx, 0x1);
2538  subsw(idx, idx, 1);
2539  br(Assembler::MI, L_post_third_loop_done);
2540  ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2541  mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
2542  umulh(carry2, tmp4, product_hi);
2543  ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2544
2545  add2_with_carry(carry2, tmp3, tmp4, carry);
2546
  // Store the low 32 bits; extr then takes bits 32..95 of carry2:tmp3 as
  // the new carry.
2547  strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2548  extr(carry, carry2, tmp3, 32);
2549
2550  bind(L_post_third_loop_done);
2551}
2552
2553/**
2554 * Code for BigInteger::multiplyToLen() intrinsic.
2555 *
 * Emits three loops: a first loop (multiply_64_x_64_loop) that multiplies
 * y by the last word of x into z, then an outer "second" loop over the
 * remaining words of x whose body is the inner "third" loop
 * (multiply_128_x_128_loop) accumulating into z.
 *
 * Register assignment listed by the original author (the registers are
 * actually whatever the caller passes in):
 *
2556 * r0: x
2557 * r1: xlen
2558 * r2: y
2559 * r3: ylen
2560 * r4:  z
2561 * r5: zlen
2562 * r10: tmp1
2563 * r11: tmp2
2564 * r12: tmp3
2565 * r13: tmp4
2566 * r14: tmp5
2567 * r15: tmp6
2568 * r16: tmp7 (presumably the product_hi parameter -- confirm against the caller)
2569 *
 * xlen and zlen are reused as scratch (product and x_xstart), and four
 * words of stack are used temporarily inside the second loop.
 *
2570 */
2571void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
2572                                     Register z, Register zlen,
2573                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4,
2574                                     Register tmp5, Register tmp6, Register product_hi) {
2575
  // NOTE(review): product_hi is not part of this distinctness check.
2576  assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
2577
2578  const Register idx = tmp1;
2579  const Register kdx = tmp2;
2580  const Register xstart = tmp3;
2581
2582  const Register y_idx = tmp4;
2583  const Register carry = tmp5;
2584  const Register product  = xlen;
2585  const Register x_xstart = zlen;  // reuse register
2586
2587  // First Loop.
2588  //
2589  //  final static long LONG_MASK = 0xffffffffL;
2590  //  int xstart = xlen - 1;
2591  //  int ystart = ylen - 1;
2592  //  long carry = 0;
2593  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2594  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
2595  //    z[kdx] = (int)product;
2596  //    carry = product >>> 32;
2597  //  }
2598  //  z[xstart] = (int)carry;
2599  //
2600
2601  movw(idx, ylen);      // idx = ylen;
2602  movw(kdx, zlen);      // kdx = xlen+ylen;
2603  mov(carry, zr);       // carry = 0;
2604
2605  Label L_done;
2606
  // xstart = xlen - 1; a zero-length x leaves nothing to do.
2607  movw(xstart, xlen);
2608  subsw(xstart, xstart, 1);
2609  br(Assembler::MI, L_done);
2610
2611  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
2612
  // Flush the carry left by the first loop into the int slots of z below
  // kdx: nothing if kdx == 0, the low 32 bits only if kdx == 1, otherwise
  // the low 32 bits followed by the high 32 bits one slot further down.
2613  Label L_second_loop;
2614  cbzw(kdx, L_second_loop);
2615
2616  Label L_carry;
2617  subw(kdx, kdx, 1);
2618  cbzw(kdx, L_carry);
2619
2620  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
2621  lsr(carry, carry, 32);
2622  subw(kdx, kdx, 1);
2623
2624  bind(L_carry);
2625  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
2626
2627  // Second and third (nested) loops.
2628  //
2629  // for (int i = xstart-1; i >= 0; i--) { // Second loop
2630  //   carry = 0;
2631  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
2632  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
2633  //                    (z[k] & LONG_MASK) + carry;
2634  //     z[k] = (int)product;
2635  //     carry = product >>> 32;
2636  //   }
2637  //   z[i] = (int)carry;
2638  // }
2639  //
2640  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
2641
2642  const Register jdx = tmp1;
2643
2644  bind(L_second_loop);
2645  mov(carry, zr);                // carry = 0;
2646  movw(jdx, ylen);               // j = ystart+1
2647
2648  subsw(xstart, xstart, 1);      // i = xstart-1;
2649  br(Assembler::MI, L_done);
2650
  // Reserve four stack words and save z in the first one; z is repointed
  // below for the duration of the inner loop.
2651  str(z, Address(pre(sp, -4 * wordSize)));
2652
2653  Label L_last_x;
2654  lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
2655  subsw(xstart, xstart, 1);       // i = xstart-1;
2656  br(Assembler::MI, L_last_x);
2657
  // Two ints of x are available: load them as one 64-bit multiplier.
2658  lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
2659  ldr(product_hi, Address(rscratch1));
2660  ror(product_hi, product_hi, 32);  // convert big-endian to little-endian
2661
2662  Label L_third_loop_prologue;
2663  bind(L_third_loop_prologue);
2664
  // Save ylen, x and xstart in the remaining stack slots: the inner loop is
  // handed x (as carry2), ylen (as jdx) and tmp3 (== xstart) as scratch.
2665  str(ylen, Address(sp, wordSize));
2666  stp(x, xstart, Address(sp, 2 * wordSize));
2667  multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
2668                          tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
  // Restore in the same slot order and release the four words.
2669  ldp(z, ylen, Address(post(sp, 2 * wordSize)));
2670  ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen
2671
  // Store the carry of this row: low 32 bits at z[xlen + 1], and, unless the
  // index has gone negative, the high 32 bits one slot below.
2672  addw(tmp3, xlen, 1);
2673  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
2674  subsw(tmp3, tmp3, 1);
2675  br(Assembler::MI, L_done);
2676
2677  lsr(carry, carry, 32);
2678  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
2679  b(L_second_loop);
2680
2681  // Next infrequent code is moved outside loops.
  // Only one int of x is left: use it alone (zero-extended) as multiplier.
2682  bind(L_last_x);
2683  ldrw(product_hi, Address(x,  0));
2684  b(L_third_loop_prologue);
2685
2686  bind(L_done);
2687}
2688
2689/**
2690 * Emits code to update CRC-32 with a byte value according to constants in table
2691 *
2692 * @param [in,out]crc   Register containing the crc.
2693 * @param [in]val       Register containing the byte to fold into the CRC.
2694 * @param [in]table     Register containing the table of crc constants.
2695 *
2696 * uint32_t crc;
2697 * val = crc_table[(val ^ crc) & 0xFF];
2698 * crc = val ^ (crc >> 8);
2699 *
2700 */
2701void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
2702  eor(val, val, crc);
2703  andr(val, val, 0xff);
2704  ldrw(val, Address(table, val, Address::lsl(2)));
2705  eor(crc, val, crc, Assembler::LSR, 8);
2706}
2707
2708/**
2709 * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
2710 *
2711 * @param [in,out]crc   Register containing the crc.
2712 * @param [in]v         Register containing the 32-bit to fold into the CRC.
2713 * @param [in]table0    Register containing table 0 of crc constants.
2714 * @param [in]table1    Register containing table 1 of crc constants.
2715 * @param [in]table2    Register containing table 2 of crc constants.
2716 * @param [in]table3    Register containing table 3 of crc constants.
2717 *
2718 * uint32_t crc;
2719 *   v = crc ^ v
2720 *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
2721 *
2722 */
2723void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
2724        Register table0, Register table1, Register table2, Register table3,
2725        bool upper) {
2726  eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
2727  uxtb(tmp, v);
2728  ldrw(crc, Address(table3, tmp, Address::lsl(2)));
2729  ubfx(tmp, v, 8, 8);
2730  ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
2731  eor(crc, crc, tmp);
2732  ubfx(tmp, v, 16, 8);
2733  ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
2734  eor(crc, crc, tmp);
2735  ubfx(tmp, v, 24, 8);
2736  ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
2737  eor(crc, crc, tmp);
2738}
2739
2740/**
 * Emits code that updates a CRC-32 over a byte buffer.  Depending on the
 * UseCRC32 / UseNeon flags at code-generation time it uses the crc32
 * instructions, a Neon folding loop followed by table lookups, or table
 * lookups only.  The crc is bit-inverted on entry and again on exit.
 *
2741 * @param crc   register containing existing CRC (32-bit)
2742 * @param buf   register pointing to input byte buffer (byte*)
2743 * @param len   register containing number of bytes
2744 * @param table0..table3 registers that will contain the addresses of the
 *              CRC tables (not written on the UseCRC32 path)
2745 * @param tmp, tmp2, tmp3   scratch registers
2746 */
2747void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
2748        Register table0, Register table1, Register table2, Register table3,
2749        Register tmp, Register tmp2, Register tmp3) {
2750  Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
2751  unsigned long offset;
2752
    // crc = ~crc (32-bit)
2753    ornw(crc, zr, crc);
2754
2755  if (UseCRC32) {
2756    Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop;
2757
      // Dispatch on length: >= 64 bytes, >= 4 bytes, >= 1 byte, or nothing.
      // len is kept biased by the chunk size so the flags from subs/adds
      // can drive the branches directly.
2758      subs(len, len, 64);
2759      br(Assembler::GE, CRC_by64_loop);
2760      adds(len, len, 64-4);
2761      br(Assembler::GE, CRC_by4_loop);
2762      adds(len, len, 4);
2763      br(Assembler::GT, CRC_by1_loop);
2764      b(L_exit);
2765
2766    BIND(CRC_by4_loop);
2767      ldrw(tmp, Address(post(buf, 4)));
2768      subs(len, len, 4);
2769      crc32w(crc, crc, tmp);
2770      br(Assembler::GE, CRC_by4_loop);
2771      adds(len, len, 4);
2772      br(Assembler::LE, L_exit);
2773    BIND(CRC_by1_loop);
2774      ldrb(tmp, Address(post(buf, 1)));
2775      subs(len, len, 1);
2776      crc32b(crc, crc, tmp);
2777      br(Assembler::GT, CRC_by1_loop);
2778      b(L_exit);
2779
2780      align(CodeEntryAlignment);
      // 64 bytes per iteration: four 16-byte pair loads, two crc32x each.
2781    BIND(CRC_by64_loop);
2782      subs(len, len, 64);
2783      ldp(tmp, tmp3, Address(post(buf, 16)));
2784      crc32x(crc, crc, tmp);
2785      crc32x(crc, crc, tmp3);
2786      ldp(tmp, tmp3, Address(post(buf, 16)));
2787      crc32x(crc, crc, tmp);
2788      crc32x(crc, crc, tmp3);
2789      ldp(tmp, tmp3, Address(post(buf, 16)));
2790      crc32x(crc, crc, tmp);
2791      crc32x(crc, crc, tmp3);
2792      ldp(tmp, tmp3, Address(post(buf, 16)));
2793      crc32x(crc, crc, tmp);
2794      crc32x(crc, crc, tmp3);
2795      br(Assembler::GE, CRC_by64_loop);
2796      adds(len, len, 64-4);
2797      br(Assembler::GE, CRC_by4_loop);
2798      adds(len, len, 4);
2799      br(Assembler::GT, CRC_by1_loop);
2800    BIND(L_exit);
2801      ornw(crc, zr, crc);
      // Nothing below is emitted when UseCRC32 is set.
2802      return;
2803  }
2804
    // Table-driven paths: table0 is the address from
    // StubRoutines::crc_table_addr(); table1..table3 follow at strides of
    // 256 juint entries.
2805    adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
2806    if (offset) add(table0, table0, offset);
2807    add(table1, table0, 1*256*sizeof(juint));
2808    add(table2, table0, 2*256*sizeof(juint));
2809    add(table3, table0, 3*256*sizeof(juint));
2810
2811  if (UseNeon) {
      // Buffers shorter than 64 bytes skip the Neon loop entirely.
2812      cmp(len, 64);
2813      br(Assembler::LT, L_by16);
2814      eor(v16, T16B, v16, v16);
2815
2816    Label L_fold;
2817
2818      add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
2819
      // Load the first 32 bytes of data into v0/v1 and four 8-byte
      // constants, each replicated to both lanes, into v4..v7.
2820      ld1(v0, v1, T2D, post(buf, 32));
2821      ld1r(v4, T2D, post(tmp, 8));
2822      ld1r(v5, T2D, post(tmp, 8));
2823      ld1r(v6, T2D, post(tmp, 8));
2824      ld1r(v7, T2D, post(tmp, 8));
      // Xor the incoming crc into the first 32-bit lane of the data.
2825      mov(v16, T4S, 0, crc);
2826
2827      eor(v0, T16B, v0, v16);
2828      sub(len, len, 64);
2829
      // Each iteration processes v0 and v1 with polynomial multiplies
      // against v4..v7, loads the next 32 bytes, and xors the results in.
      // NOTE(review): the exact folding arithmetic is not documented here.
2830    BIND(L_fold);
2831      pmull(v22, T8H, v0, v5, T8B);
2832      pmull(v20, T8H, v0, v7, T8B);
2833      pmull(v23, T8H, v0, v4, T8B);
2834      pmull(v21, T8H, v0, v6, T8B);
2835
2836      pmull2(v18, T8H, v0, v5, T16B);
2837      pmull2(v16, T8H, v0, v7, T16B);
2838      pmull2(v19, T8H, v0, v4, T16B);
2839      pmull2(v17, T8H, v0, v6, T16B);
2840
2841      uzp1(v24, v20, v22, T8H);
2842      uzp2(v25, v20, v22, T8H);
2843      eor(v20, T16B, v24, v25);
2844
2845      uzp1(v26, v16, v18, T8H);
2846      uzp2(v27, v16, v18, T8H);
2847      eor(v16, T16B, v26, v27);
2848
2849      ushll2(v22, T4S, v20, T8H, 8);
2850      ushll(v20, T4S, v20, T4H, 8);
2851
2852      ushll2(v18, T4S, v16, T8H, 8);
2853      ushll(v16, T4S, v16, T4H, 8);
2854
2855      eor(v22, T16B, v23, v22);
2856      eor(v18, T16B, v19, v18);
2857      eor(v20, T16B, v21, v20);
2858      eor(v16, T16B, v17, v16);
2859
2860      uzp1(v17, v16, v20, T2D);
2861      uzp2(v21, v16, v20, T2D);
2862      eor(v17, T16B, v17, v21);
2863
2864      ushll2(v20, T2D, v17, T4S, 16);
2865      ushll(v16, T2D, v17, T2S, 16);
2866
2867      eor(v20, T16B, v20, v22);
2868      eor(v16, T16B, v16, v18);
2869
2870      uzp1(v17, v20, v16, T2D);
2871      uzp2(v21, v20, v16, T2D);
      // v28 holds the result for v0 until the end of the iteration.
2872      eor(v28, T16B, v17, v21);
2873
      // Same sequence for v1.
2874      pmull(v22, T8H, v1, v5, T8B);
2875      pmull(v20, T8H, v1, v7, T8B);
2876      pmull(v23, T8H, v1, v4, T8B);
2877      pmull(v21, T8H, v1, v6, T8B);
2878
2879      pmull2(v18, T8H, v1, v5, T16B);
2880      pmull2(v16, T8H, v1, v7, T16B);
2881      pmull2(v19, T8H, v1, v4, T16B);
2882      pmull2(v17, T8H, v1, v6, T16B);
2883
      // v0/v1 are no longer needed as inputs: fetch the next 32 bytes.
2884      ld1(v0, v1, T2D, post(buf, 32));
2885
2886      uzp1(v24, v20, v22, T8H);
2887      uzp2(v25, v20, v22, T8H);
2888      eor(v20, T16B, v24, v25);
2889
2890      uzp1(v26, v16, v18, T8H);
2891      uzp2(v27, v16, v18, T8H);
2892      eor(v16, T16B, v26, v27);
2893
2894      ushll2(v22, T4S, v20, T8H, 8);
2895      ushll(v20, T4S, v20, T4H, 8);
2896
2897      ushll2(v18, T4S, v16, T8H, 8);
2898      ushll(v16, T4S, v16, T4H, 8);
2899
2900      eor(v22, T16B, v23, v22);
2901      eor(v18, T16B, v19, v18);
2902      eor(v20, T16B, v21, v20);
2903      eor(v16, T16B, v17, v16);
2904
2905      uzp1(v17, v16, v20, T2D);
2906      uzp2(v21, v16, v20, T2D);
2907      eor(v16, T16B, v17, v21);
2908
2909      ushll2(v20, T2D, v16, T4S, 16);
2910      ushll(v16, T2D, v16, T2S, 16);
2911
2912      eor(v20, T16B, v22, v20);
2913      eor(v16, T16B, v16, v18);
2914
2915      uzp1(v17, v20, v16, T2D);
2916      uzp2(v21, v20, v16, T2D);
2917      eor(v20, T16B, v17, v21);
2918
2919      shl(v16, T2D, v28, 1);
2920      shl(v17, T2D, v20, 1);
2921
      // Combine the two results with the freshly loaded data.
2922      eor(v0, T16B, v0, v16);
2923      eor(v1, T16B, v1, v17);
2924
2925      subs(len, len, 32);
2926      br(Assembler::GE, L_fold);
2927
      // Reduce the 32 bytes left in v0/v1 to a 32-bit crc with the table
      // lookups, one 64-bit lane (two 32-bit halves) at a time, starting
      // from crc = 0.
2928      mov(crc, 0);
2929      mov(tmp, v0, T1D, 0);
2930      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
2931      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
2932      mov(tmp, v0, T1D, 1);
2933      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
2934      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
2935      mov(tmp, v1, T1D, 0);
2936      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
2937      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
2938      mov(tmp, v1, T1D, 1);
2939      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
2940      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
2941
      // Undo the bias left by the last subs so len is the remaining count.
2942      add(len, len, 32);
2943  }
2944
  // Table-driven tail (and the whole computation when UseNeon is off):
  // 16 bytes, then 4 bytes, then 1 byte at a time.
2945  BIND(L_by16);
2946    subs(len, len, 16);
2947    br(Assembler::GE, L_by16_loop);
2948    adds(len, len, 16-4);
2949    br(Assembler::GE, L_by4_loop);
2950    adds(len, len, 4);
2951    br(Assembler::GT, L_by1_loop);
2952    b(L_exit);
2953
2954  BIND(L_by4_loop);
2955    ldrw(tmp, Address(post(buf, 4)));
2956    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
2957    subs(len, len, 4);
2958    br(Assembler::GE, L_by4_loop);
2959    adds(len, len, 4);
2960    br(Assembler::LE, L_exit);
2961  BIND(L_by1_loop);
2962    subs(len, len, 1);
2963    ldrb(tmp, Address(post(buf, 1)));
2964    update_byte_crc32(crc, tmp, table0);
2965    br(Assembler::GT, L_by1_loop);
2966    b(L_exit);
2967
2968    align(CodeEntryAlignment);
2969  BIND(L_by16_loop);
2970    subs(len, len, 16);
2971    ldp(tmp, tmp3, Address(post(buf, 16)));
2972    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
2973    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
2974    update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
2975    update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
2976    br(Assembler::GE, L_by16_loop);
2977    adds(len, len, 16-4);
2978    br(Assembler::GE, L_by4_loop);
2979    adds(len, len, 4);
2980    br(Assembler::GT, L_by1_loop);
2981  BIND(L_exit);
    // crc = ~crc (32-bit)
2982    ornw(crc, zr, crc);
2983}
2984
2985/**
2986 * @param crc   register containing existing CRC (32-bit)
2987 * @param buf   register pointing to input byte buffer (byte*)
2988 * @param len   register containing number of bytes
2989 * @param table register that will contain address of CRC table
2990 * @param tmp   scratch register
2991 */
2992void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
2993        Register table0, Register table1, Register table2, Register table3,
2994        Register tmp, Register tmp2, Register tmp3) {
2995  Label L_exit;
2996  Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop;
2997
2998    subs(len, len, 64);
2999    br(Assembler::GE, CRC_by64_loop);
3000    adds(len, len, 64-4);
3001    br(Assembler::GE, CRC_by4_loop);
3002    adds(len, len, 4);
3003    br(Assembler::GT, CRC_by1_loop);
3004    b(L_exit);
3005
3006  BIND(CRC_by4_loop);
3007    ldrw(tmp, Address(post(buf, 4)));
3008    subs(len, len, 4);
3009    crc32cw(crc, crc, tmp);
3010    br(Assembler::GE, CRC_by4_loop);
3011    adds(len, len, 4);
3012    br(Assembler::LE, L_exit);
3013  BIND(CRC_by1_loop);
3014    ldrb(tmp, Address(post(buf, 1)));
3015    subs(len, len, 1);
3016    crc32cb(crc, crc, tmp);
3017    br(Assembler::GT, CRC_by1_loop);
3018    b(L_exit);
3019
3020    align(CodeEntryAlignment);
3021  BIND(CRC_by64_loop);
3022    subs(len, len, 64);
3023    ldp(tmp, tmp3, Address(post(buf, 16)));
3024    crc32cx(crc, crc, tmp);
3025    crc32cx(crc, crc, tmp3);
3026    ldp(tmp, tmp3, Address(post(buf, 16)));
3027    crc32cx(crc, crc, tmp);
3028    crc32cx(crc, crc, tmp3);
3029    ldp(tmp, tmp3, Address(post(buf, 16)));
3030    crc32cx(crc, crc, tmp);
3031    crc32cx(crc, crc, tmp3);
3032    ldp(tmp, tmp3, Address(post(buf, 16)));
3033    crc32cx(crc, crc, tmp);
3034    crc32cx(crc, crc, tmp3);
3035    br(Assembler::GE, CRC_by64_loop);
3036    adds(len, len, 64-4);
3037    br(Assembler::GE, CRC_by4_loop);
3038    adds(len, len, 4);
3039    br(Assembler::GT, CRC_by1_loop);
3040  BIND(L_exit);
3041    return;
3042}
3043
3044SkipIfEqual::SkipIfEqual(
3045    MacroAssembler* masm, const bool* flag_addr, bool value) {
3046  _masm = masm;
3047  unsigned long offset;
3048  _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
3049  _masm->ldrb(rscratch1, Address(rscratch1, offset));
3050  _masm->cbzw(rscratch1, _label);
3051}
3052
// Binds the label targeted by the branch emitted in the constructor, which
// ends the region of code that the branch jumps over.
3053SkipIfEqual::~SkipIfEqual() {
3054  _masm->bind(_label);
3055}
3056
3057void MacroAssembler::addptr(const Address &dst, int32_t src) {
3058  Address adr;
3059  switch(dst.getMode()) {
3060  case Address::base_plus_offset:
3061    // This is the expected mode, although we allow all the other
3062    // forms below.
3063    adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
3064    break;
3065  default:
3066    lea(rscratch2, dst);
3067    adr = Address(rscratch2);
3068    break;
3069  }
3070  ldr(rscratch1, adr);
3071  add(rscratch1, rscratch1, src);
3072  str(rscratch1, adr);
3073}
3074
3075void MacroAssembler::cmpptr(Register src1, Address src2) {
3076  unsigned long offset;
3077  adrp(rscratch1, src2, offset);
3078  ldr(rscratch1, Address(rscratch1, offset));
3079  cmp(src1, rscratch1);
3080}
3081
// Two-argument form of store_check: dst is not used, the work is done by
// store_check(obj).
3082void MacroAssembler::store_check(Register obj, Address dst) {
3083  store_check(obj);
3084}
3085
3086void MacroAssembler::store_check(Register obj) {
3087  // Does a store check for the oop in register obj. The content of
3088  // register obj is destroyed afterwards.
3089
3090  BarrierSet* bs = Universe::heap()->barrier_set();
3091  assert(bs->kind() == BarrierSet::CardTableForRS ||
3092         bs->kind() == BarrierSet::CardTableExtension,
3093         "Wrong barrier set kind");
3094
3095  CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs);
3096  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
3097
3098  lsr(obj, obj, CardTableModRefBS::card_shift);
3099
3100  assert(CardTableModRefBS::dirty_card_val() == 0, "must be");
3101
3102  {
3103    ExternalAddress cardtable((address) ct->byte_map_base);
3104    unsigned long offset;
3105    adrp(rscratch1, cardtable, offset);
3106    assert(offset == 0, "byte_map_base is misaligned");
3107  }
3108
3109  if (UseCondCardMark) {
3110    Label L_already_dirty;
3111    membar(StoreLoad);
3112    ldrb(rscratch2,  Address(obj, rscratch1));
3113    cbz(rscratch2, L_already_dirty);
3114    strb(zr, Address(obj, rscratch1));
3115    bind(L_already_dirty);
3116  } else {
3117    if (UseConcMarkSweepGC && CMSPrecleaningEnabled) {
3118      membar(StoreStore);
3119    }
3120    strb(zr, Address(obj, rscratch1));
3121  }
3122}
3123
3124void MacroAssembler::load_klass(Register dst, Register src) {
3125  if (UseCompressedClassPointers) {
3126    ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3127    decode_klass_not_null(dst);
3128  } else {
3129    ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3130  }
3131}
3132
3133void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
3134  if (UseCompressedClassPointers) {
3135    ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3136    if (Universe::narrow_klass_base() == NULL) {
3137      cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift());
3138      return;
3139    } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3140               && Universe::narrow_klass_shift() == 0) {
3141      // Only the bottom 32 bits matter
3142      cmpw(trial_klass, tmp);
3143      return;
3144    }
3145    decode_klass_not_null(tmp);
3146  } else {
3147    ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3148  }
3149  cmp(trial_klass, tmp);
3150}
3151
3152void MacroAssembler::load_prototype_header(Register dst, Register src) {
3153  load_klass(dst, src);
3154  ldr(dst, Address(dst, Klass::prototype_header_offset()));
3155}
3156
3157void MacroAssembler::store_klass(Register dst, Register src) {
3158  // FIXME: Should this be a store release?  concurrent gcs assumes
3159  // klass length is valid if klass field is not null.
3160  if (UseCompressedClassPointers) {
3161    encode_klass_not_null(src);
3162    strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3163  } else {
3164    str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3165  }
3166}
3167
3168void MacroAssembler::store_klass_gap(Register dst, Register src) {
3169  if (UseCompressedClassPointers) {
3170    // Store to klass gap in destination
3171    strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3172  }
3173}
3174
3175// Algorithm must match oop.inline.hpp encode_heap_oop.
3176void MacroAssembler::encode_heap_oop(Register d, Register s) {
3177#ifdef ASSERT
3178  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
3179#endif
3180  verify_oop(s, "broken oop in encode_heap_oop");
3181  if (Universe::narrow_oop_base() == NULL) {
3182    if (Universe::narrow_oop_shift() != 0) {
3183      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3184      lsr(d, s, LogMinObjAlignmentInBytes);
3185    } else {
3186      mov(d, s);
3187    }
3188  } else {
3189    subs(d, s, rheapbase);
3190    csel(d, d, zr, Assembler::HS);
3191    lsr(d, d, LogMinObjAlignmentInBytes);
3192
3193    /*  Old algorithm: is this any worse?
3194    Label nonnull;
3195    cbnz(r, nonnull);
3196    sub(r, r, rheapbase);
3197    bind(nonnull);
3198    lsr(r, r, LogMinObjAlignmentInBytes);
3199    */
3200  }
3201}
3202
3203void MacroAssembler::encode_heap_oop_not_null(Register r) {
3204#ifdef ASSERT
3205  verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
3206  if (CheckCompressedOops) {
3207    Label ok;
3208    cbnz(r, ok);
3209    stop("null oop passed to encode_heap_oop_not_null");
3210    bind(ok);
3211  }
3212#endif
3213  verify_oop(r, "broken oop in encode_heap_oop_not_null");
3214  if (Universe::narrow_oop_base() != NULL) {
3215    sub(r, r, rheapbase);
3216  }
3217  if (Universe::narrow_oop_shift() != 0) {
3218    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3219    lsr(r, r, LogMinObjAlignmentInBytes);
3220  }
3221}
3222
3223void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3224#ifdef ASSERT
3225  verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
3226  if (CheckCompressedOops) {
3227    Label ok;
3228    cbnz(src, ok);
3229    stop("null oop passed to encode_heap_oop_not_null2");
3230    bind(ok);
3231  }
3232#endif
3233  verify_oop(src, "broken oop in encode_heap_oop_not_null2");
3234
3235  Register data = src;
3236  if (Universe::narrow_oop_base() != NULL) {
3237    sub(dst, src, rheapbase);
3238    data = dst;
3239  }
3240  if (Universe::narrow_oop_shift() != 0) {
3241    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3242    lsr(dst, data, LogMinObjAlignmentInBytes);
3243    data = dst;
3244  }
3245  if (data == src)
3246    mov(dst, src);
3247}
3248
3249void  MacroAssembler::decode_heap_oop(Register d, Register s) {
3250#ifdef ASSERT
3251  verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
3252#endif
3253  if (Universe::narrow_oop_base() == NULL) {
3254    if (Universe::narrow_oop_shift() != 0 || d != s) {
3255      lsl(d, s, Universe::narrow_oop_shift());
3256    }
3257  } else {
3258    Label done;
3259    if (d != s)
3260      mov(d, s);
3261    cbz(s, done);
3262    add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
3263    bind(done);
3264  }
3265  verify_oop(d, "broken oop in decode_heap_oop");
3266}
3267
3268void  MacroAssembler::decode_heap_oop_not_null(Register r) {
3269  assert (UseCompressedOops, "should only be used for compressed headers");
3270  assert (Universe::heap() != NULL, "java heap should be initialized");
3271  // Cannot assert, unverified entry point counts instructions (see .ad file)
3272  // vtableStubs also counts instructions in pd_code_size_limit.
3273  // Also do not verify_oop as this is called by verify_oop.
3274  if (Universe::narrow_oop_shift() != 0) {
3275    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3276    if (Universe::narrow_oop_base() != NULL) {
3277      add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3278    } else {
3279      add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3280    }
3281  } else {
3282    assert (Universe::narrow_oop_base() == NULL, "sanity");
3283  }
3284}
3285
3286void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
3287  assert (UseCompressedOops, "should only be used for compressed headers");
3288  assert (Universe::heap() != NULL, "java heap should be initialized");
3289  // Cannot assert, unverified entry point counts instructions (see .ad file)
3290  // vtableStubs also counts instructions in pd_code_size_limit.
3291  // Also do not verify_oop as this is called by verify_oop.
3292  if (Universe::narrow_oop_shift() != 0) {
3293    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3294    if (Universe::narrow_oop_base() != NULL) {
3295      add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3296    } else {
3297      add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3298    }
3299  } else {
3300    assert (Universe::narrow_oop_base() == NULL, "sanity");
3301    if (dst != src) {
3302      mov(dst, src);
3303    }
3304  }
3305}
3306
3307void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3308  if (Universe::narrow_klass_base() == NULL) {
3309    if (Universe::narrow_klass_shift() != 0) {
3310      assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3311      lsr(dst, src, LogKlassAlignmentInBytes);
3312    } else {
3313      if (dst != src) mov(dst, src);
3314    }
3315    return;
3316  }
3317
3318  if (use_XOR_for_compressed_class_base) {
3319    if (Universe::narrow_klass_shift() != 0) {
3320      eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3321      lsr(dst, dst, LogKlassAlignmentInBytes);
3322    } else {
3323      eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3324    }
3325    return;
3326  }
3327
3328  if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3329      && Universe::narrow_klass_shift() == 0) {
3330    movw(dst, src);
3331    return;
3332  }
3333
3334#ifdef ASSERT
3335  verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
3336#endif
3337
3338  Register rbase = dst;
3339  if (dst == src) rbase = rheapbase;
3340  mov(rbase, (uint64_t)Universe::narrow_klass_base());
3341  sub(dst, src, rbase);
3342  if (Universe::narrow_klass_shift() != 0) {
3343    assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3344    lsr(dst, dst, LogKlassAlignmentInBytes);
3345  }
3346  if (dst == src) reinit_heapbase();
3347}
3348
// In-place form: compresses the klass pointer in r by delegating to the
// two-register version with dst == src.
3349void MacroAssembler::encode_klass_not_null(Register r) {
3350  encode_klass_not_null(r, r);
3351}
3352
3353void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3354  Register rbase = dst;
3355  assert (UseCompressedClassPointers, "should only be used for compressed headers");
3356
3357  if (Universe::narrow_klass_base() == NULL) {
3358    if (Universe::narrow_klass_shift() != 0) {
3359      assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3360      lsl(dst, src, LogKlassAlignmentInBytes);
3361    } else {
3362      if (dst != src) mov(dst, src);
3363    }
3364    return;
3365  }
3366
3367  if (use_XOR_for_compressed_class_base) {
3368    if (Universe::narrow_klass_shift() != 0) {
3369      lsl(dst, src, LogKlassAlignmentInBytes);
3370      eor(dst, dst, (uint64_t)Universe::narrow_klass_base());
3371    } else {
3372      eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3373    }
3374    return;
3375  }
3376
3377  if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3378      && Universe::narrow_klass_shift() == 0) {
3379    if (dst != src)
3380      movw(dst, src);
3381    movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32);
3382    return;
3383  }
3384
3385  // Cannot assert, unverified entry point counts instructions (see .ad file)
3386  // vtableStubs also counts instructions in pd_code_size_limit.
3387  // Also do not verify_oop as this is called by verify_oop.
3388  if (dst == src) rbase = rheapbase;
3389  mov(rbase, (uint64_t)Universe::narrow_klass_base());
3390  if (Universe::narrow_klass_shift() != 0) {
3391    assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3392    add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
3393  } else {
3394    add(dst, rbase, src);
3395  }
3396  if (dst == src) reinit_heapbase();
3397}
3398
// In-place form: decodes the non-null narrow klass in r by forwarding to the
// two-register overload with r as both destination and source.
void  MacroAssembler::decode_klass_not_null(Register r) {
  decode_klass_not_null(r, r);
}
3402
3403void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
3404  assert (UseCompressedOops, "should only be used for compressed oops");
3405  assert (Universe::heap() != NULL, "java heap should be initialized");
3406  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3407
3408  int oop_index = oop_recorder()->find_index(obj);
3409  assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
3410
3411  InstructionMark im(this);
3412  RelocationHolder rspec = oop_Relocation::spec(oop_index);
3413  code_section()->relocate(inst_mark(), rspec);
3414  movz(dst, 0xDEAD, 16);
3415  movk(dst, 0xBEEF);
3416}
3417
3418void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
3419  assert (UseCompressedClassPointers, "should only be used for compressed headers");
3420  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3421  int index = oop_recorder()->find_index(k);
3422  assert(! Universe::heap()->is_in_reserved(k), "should not be an oop");
3423
3424  InstructionMark im(this);
3425  RelocationHolder rspec = metadata_Relocation::spec(index);
3426  code_section()->relocate(inst_mark(), rspec);
3427  narrowKlass nk = Klass::encode_klass(k);
3428  movz(dst, (nk >> 16), 16);
3429  movk(dst, nk & 0xffff);
3430}
3431
3432void MacroAssembler::load_heap_oop(Register dst, Address src)
3433{
3434  if (UseCompressedOops) {
3435    ldrw(dst, src);
3436    decode_heap_oop(dst);
3437  } else {
3438    ldr(dst, src);
3439  }
3440}
3441
3442void MacroAssembler::load_heap_oop_not_null(Register dst, Address src)
3443{
3444  if (UseCompressedOops) {
3445    ldrw(dst, src);
3446    decode_heap_oop_not_null(dst);
3447  } else {
3448    ldr(dst, src);
3449  }
3450}
3451
3452void MacroAssembler::store_heap_oop(Address dst, Register src) {
3453  if (UseCompressedOops) {
3454    assert(!dst.uses(src), "not enough registers");
3455    encode_heap_oop(src);
3456    strw(src, dst);
3457  } else
3458    str(src, dst);
3459}
3460
3461// Used for storing NULLs.
3462void MacroAssembler::store_heap_oop_null(Address dst) {
3463  if (UseCompressedOops) {
3464    strw(zr, dst);
3465  } else
3466    str(zr, dst);
3467}
3468
3469#if INCLUDE_ALL_GCS
// Emit the G1 pre-write barrier, which records the previous value of a
// field in the current thread's SATB mark queue.
//
//   obj         register holding the address the previous value is loaded
//               from, or noreg if pre_val already holds the previous value
//   pre_val     receives (obj != noreg) or supplies (obj == noreg) the
//               previous value; must not be noreg
//   thread      must be rthread
//   tmp         scratch register, overwritten
//   tosca_live  if true, r0 is saved and restored around the runtime call
//   expand_call if true, the runtime call is emitted through
//               MacroAssembler::call_VM_leaf_base directly instead of
//               call_VM_leaf (see the comment before the call below)
//
// The emitted code does nothing further when the queue's "active" flag is
// zero or the previous value is null.  rscratch1 is also overwritten.
void MacroAssembler::g1_write_barrier_pre(Register obj,
                                          Register pre_val,
                                          Register thread,
                                          Register tmp,
                                          bool tosca_live,
                                          bool expand_call) {
  // If expand_call is true then we expand the call_VM_leaf macro
  // directly to skip generating the check by
  // InterpreterMacroAssembler::call_VM_leaf_base that checks _last_sp.

  assert(thread == rthread, "must be");

  Label done;
  Label runtime;

  assert(pre_val != noreg, "check this code");

  if (obj != noreg)
    assert_different_registers(obj, pre_val, tmp);

  // Thread-relative addresses of the SATB queue's active flag, index and
  // buffer pointer.
  Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
                                       SATBMarkQueue::byte_offset_of_active()));
  Address index(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
                                       SATBMarkQueue::byte_offset_of_index()));
  Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
                                       SATBMarkQueue::byte_offset_of_buf()));


  // Is marking active?  The load width follows the declared width of the
  // active flag (4 bytes or 1 byte).
  if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) {
    ldrw(tmp, in_progress);
  } else {
    assert(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption");
    ldrb(tmp, in_progress);
  }
  cbzw(tmp, done);

  // Do we need to load the previous value?
  if (obj != noreg) {
    load_heap_oop(pre_val, Address(obj, 0));
  }

  // Is the previous value null?
  cbz(pre_val, done);

  // Can we store original value in the thread's buffer?
  // Is index == 0?
  // (The index field is typed as size_t.)

  ldr(tmp, index);                      // tmp := *index_adr
  cbz(tmp, runtime);                    // tmp == 0?
                                        // If yes, goto runtime

  sub(tmp, tmp, wordSize);              // tmp := tmp - wordSize
  str(tmp, index);                      // *index_adr := tmp
  ldr(rscratch1, buffer);
  add(tmp, tmp, rscratch1);             // tmp := tmp + *buffer_adr

  // Record the previous value
  str(pre_val, Address(tmp, 0));
  b(done);

  bind(runtime);
  // save the live input values: r0 only if tosca_live, obj only if it is a
  // real register, pre_val always
  push(r0->bit(tosca_live) | obj->bit(obj != noreg) | pre_val->bit(true), sp);

  // Calling the runtime using the regular call_VM_leaf mechanism generates
  // code (generated by InterpreterMacroAssembler::call_VM_leaf_base)
  // that checks that the *(rfp+frame::interpreter_frame_last_sp) == NULL.
  //
  // If we are generating the pre-barrier without a frame (e.g. in the
  // intrinsified Reference.get() routine) then the frame pointer (rfp)
  // might be pointing to the caller frame and so this check will most
  // likely fail at runtime.
  //
  // Expanding the call directly bypasses the generation of the check.
  // So when we do not have a full interpreter frame on the stack
  // expand_call should be passed true.

  if (expand_call) {
    // thread is moved into arg1 before pre_val is moved into arg0, so
    // pre_val must not live in c_rarg1.
    assert(pre_val != c_rarg1, "smashed arg");
    pass_arg1(this, thread);
    pass_arg0(this, pre_val);
    MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), 2);
  } else {
    call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), pre_val, thread);
  }

  // restore exactly the set of registers saved above
  pop(r0->bit(tosca_live) | obj->bit(obj != noreg) | pre_val->bit(true), sp);

  bind(done);
}
3561
// Emit the G1 post-write barrier for a store of new_val to store_addr.
//
//   store_addr  address that was stored to (preserved across the runtime call)
//   new_val     value that was stored (preserved across the runtime call)
//   thread      must be rthread
//   tmp, tmp2   scratch registers, overwritten
//
// The emitted code is a no-op when both addresses agree in all bits at or
// above HeapRegion::LogOfHRGrainBytes, when new_val is null, or when the
// card for store_addr reads as the young value or as dirty.  Otherwise the
// card is dirtied and its address is appended to the thread's dirty card
// queue, calling SharedRuntime::g1_wb_post when the queue index is zero.
// rscratch1 is also overwritten.
void MacroAssembler::g1_write_barrier_post(Register store_addr,
                                           Register new_val,
                                           Register thread,
                                           Register tmp,
                                           Register tmp2) {
  assert(thread == rthread, "must be");

  // Thread-relative addresses of the dirty card queue's index and buffer.
  Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
                                       DirtyCardQueue::byte_offset_of_index()));
  Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
                                       DirtyCardQueue::byte_offset_of_buf()));

  BarrierSet* bs = Universe::heap()->barrier_set();
  CardTableModRefBS* ct = (CardTableModRefBS*)bs;
  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");

  Label done;
  Label runtime;

  // Does store cross heap regions?

  eor(tmp, store_addr, new_val);
  lsr(tmp, tmp, HeapRegion::LogOfHRGrainBytes);
  cbz(tmp, done);

  // crosses regions, storing NULL?

  cbz(new_val, done);

  // storing region crossing non-NULL, is card already dirty?

  ExternalAddress cardtable((address) ct->byte_map_base);
  // (same check as the assert near the top of this function)
  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
  const Register card_addr = tmp;

  lsr(card_addr, store_addr, CardTableModRefBS::card_shift);

  // tmp2 := page of the card table base; offset := its low 12 bits, which
  // are applied in the addressing mode of the card accesses below.
  unsigned long offset;
  adrp(tmp2, cardtable, offset);

  // get the address of the card
  add(card_addr, card_addr, tmp2);
  ldrb(tmp2, Address(card_addr, offset));
  cmpw(tmp2, (int)G1SATBCardTableModRefBS::g1_young_card_val());
  br(Assembler::EQ, done);

  // The cbzw below relies on the dirty value being zero.
  assert((int)CardTableModRefBS::dirty_card_val() == 0, "must be 0");

  membar(Assembler::StoreLoad);

  // Re-read the card after the barrier; zero means already dirty.
  ldrb(tmp2, Address(card_addr, offset));
  cbzw(tmp2, done);

  // storing a region crossing, non-NULL oop, card is clean.
  // dirty card and log.

  strb(zr, Address(card_addr, offset));

  // Queue index of zero: take the runtime path.  Otherwise step the index
  // down by one word and store card_addr at buffer + index.
  ldr(rscratch1, queue_index);
  cbz(rscratch1, runtime);
  sub(rscratch1, rscratch1, wordSize);
  str(rscratch1, queue_index);

  ldr(tmp2, buffer);
  str(card_addr, Address(tmp2, rscratch1));
  b(done);

  bind(runtime);
  // save the live input values
  push(store_addr->bit(true) | new_val->bit(true), sp);
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread);
  pop(store_addr->bit(true) | new_val->bit(true), sp);

  bind(done);
}
3637
3638#endif // INCLUDE_ALL_GCS
3639
3640Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
3641  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
3642  int index = oop_recorder()->allocate_metadata_index(obj);
3643  RelocationHolder rspec = metadata_Relocation::spec(index);
3644  return Address((address)obj, rspec);
3645}
3646
3647// Move an oop into a register.  immediate is true if we want
3648// immediate instrcutions, i.e. we are not going to patch this
3649// instruction while the code is being executed by another thread.  In
3650// that case we can use move immediates rather than the constant pool.
3651void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
3652  int oop_index;
3653  if (obj == NULL) {
3654    oop_index = oop_recorder()->allocate_oop_index(obj);
3655  } else {
3656    oop_index = oop_recorder()->find_index(obj);
3657    assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
3658  }
3659  RelocationHolder rspec = oop_Relocation::spec(oop_index);
3660  if (! immediate) {
3661    address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
3662    ldr_constant(dst, Address(dummy, rspec));
3663  } else
3664    mov(dst, Address((address)obj, rspec));
3665}
3666
3667// Move a metadata address into a register.
3668void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
3669  int oop_index;
3670  if (obj == NULL) {
3671    oop_index = oop_recorder()->allocate_metadata_index(obj);
3672  } else {
3673    oop_index = oop_recorder()->find_index(obj);
3674  }
3675  RelocationHolder rspec = metadata_Relocation::spec(oop_index);
3676  mov(dst, Address((address)obj, rspec));
3677}
3678
3679Address MacroAssembler::constant_oop_address(jobject obj) {
3680  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
3681  assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop");
3682  int oop_index = oop_recorder()->find_index(obj);
3683  return Address((address)obj, oop_Relocation::spec(oop_index));
3684}
3685
3686// Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
3687void MacroAssembler::tlab_allocate(Register obj,
3688                                   Register var_size_in_bytes,
3689                                   int con_size_in_bytes,
3690                                   Register t1,
3691                                   Register t2,
3692                                   Label& slow_case) {
3693  assert_different_registers(obj, t2);
3694  assert_different_registers(obj, var_size_in_bytes);
3695  Register end = t2;
3696
3697  // verify_tlab();
3698
3699  ldr(obj, Address(rthread, JavaThread::tlab_top_offset()));
3700  if (var_size_in_bytes == noreg) {
3701    lea(end, Address(obj, con_size_in_bytes));
3702  } else {
3703    lea(end, Address(obj, var_size_in_bytes));
3704  }
3705  ldr(rscratch1, Address(rthread, JavaThread::tlab_end_offset()));
3706  cmp(end, rscratch1);
3707  br(Assembler::HI, slow_case);
3708
3709  // update the tlab top pointer
3710  str(end, Address(rthread, JavaThread::tlab_top_offset()));
3711
3712  // recover var_size_in_bytes if necessary
3713  if (var_size_in_bytes == end) {
3714    sub(var_size_in_bytes, var_size_in_bytes, obj);
3715  }
3716  // verify_tlab();
3717}
3718
// Emit the TLAB refill slow path.
//
// Depending on how much space is left in the current TLAB the code either
//   - retains the TLAB: raises the thread's refill waste limit and branches
//     to try_eden, or
//   - discards the TLAB: formats the unused part as an int array, allocates
//     a new TLAB with eden_allocate(), installs it and branches to retry.
// slow_case is taken when the heap does not support inline contiguous
// allocation or when eden_allocate() bails out.
//
// Uses r0, r2, r4 and rscratch1 as working registers.
// Preserves r19 and r3.
// Returns rthread for use by the caller.
Register MacroAssembler::tlab_refill(Label& retry,
                                     Label& try_eden,
                                     Label& slow_case) {
  Register top = r0;
  Register t1  = r2;
  Register t2  = r4;
  assert_different_registers(top, rthread, t1, t2, /* preserve: */ r19, r3);
  Label do_refill, discard_tlab;

  if (!Universe::heap()->supports_inline_contig_alloc()) {
    // No allocation in the shared eden.
    // (The rest of the sequence is still emitted after this branch.)
    b(slow_case);
  }

  ldr(top, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
  ldr(t1,  Address(rthread, in_bytes(JavaThread::tlab_end_offset())));

  // calculate amount of free space (in heap words)
  sub(t1, t1, top);
  lsr(t1, t1, LogHeapWordSize);

  // Retain tlab and allocate object in shared space if
  // the amount free in the tlab is too large to discard.

  ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
  cmp(t1, rscratch1);
  br(Assembler::LE, discard_tlab);

  // Retain: rscratch1 still holds the waste limit loaded above, so it is
  // incremented and written back without reloading.
  // ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
  mov(t2, (int32_t) ThreadLocalAllocBuffer::refill_waste_limit_increment());
  add(rscratch1, rscratch1, t2);
  str(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));

  if (TLABStats) {
    // increment number of slow_allocations
    addmw(Address(rthread, in_bytes(JavaThread::tlab_slow_allocations_offset())),
         1, rscratch1);
  }
  b(try_eden);

  bind(discard_tlab);
  if (TLABStats) {
    // increment number of refills
    addmw(Address(rthread, in_bytes(JavaThread::tlab_number_of_refills_offset())), 1,
         rscratch1);
    // accumulate wastage -- t1 is amount free in tlab
    addmw(Address(rthread, in_bytes(JavaThread::tlab_fast_refill_waste_offset())), t1,
         rscratch1);
  }

  // if tlab is currently allocated (top or end != null) then
  // fill [top, end + alignment_reserve) with array object
  cbz(top, do_refill);

  // set up the mark word
  mov(rscratch1, (intptr_t)markOopDesc::prototype()->copy_set_hash(0x2));
  str(rscratch1, Address(top, oopDesc::mark_offset_in_bytes()));
  // set the length to the remaining space: free words minus the int array
  // header plus the alignment reserve, converted from heap words to jints
  sub(t1, t1, typeArrayOopDesc::header_size(T_INT));
  add(t1, t1, (int32_t)ThreadLocalAllocBuffer::alignment_reserve());
  lsl(t1, t1, log2_intptr(HeapWordSize/sizeof(jint)));
  strw(t1, Address(top, arrayOopDesc::length_offset_in_bytes()));
  // set klass to intArrayKlass
  {
    unsigned long offset;
    // dubious reloc why not an oop reloc?
    adrp(rscratch1, ExternalAddress((address)Universe::intArrayKlassObj_addr()),
         offset);
    ldr(t1, Address(rscratch1, offset));
  }
  // store klass last.  concurrent gcs assumes klass length is valid if
  // klass field is not null.
  store_klass(top, t1);

  // account for the bytes used in the old tlab: top - start
  mov(t1, top);
  ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
  sub(t1, t1, rscratch1);
  incr_allocated_bytes(rthread, t1, 0, rscratch1);

  // refill the tlab with an eden allocation
  bind(do_refill);
  ldr(t1, Address(rthread, in_bytes(JavaThread::tlab_size_offset())));
  lsl(t1, t1, LogHeapWordSize);
  // allocate new tlab, address returned in top
  eden_allocate(top, t1, 0, t2, slow_case);

  // Check that t1 was preserved in eden_allocate.
#ifdef ASSERT
  if (UseTLAB) {
    Label ok;
    Register tsize = r4;
    assert_different_registers(tsize, rthread, t1);
    // r4 is saved on the stack around the check and restored on success.
    str(tsize, Address(pre(sp, -16)));
    ldr(tsize, Address(rthread, in_bytes(JavaThread::tlab_size_offset())));
    lsl(tsize, tsize, LogHeapWordSize);
    cmp(t1, tsize);
    br(Assembler::EQ, ok);
    // NOTE(review): this stop is reached when t1 differs from the tlab
    // size, so the text reads inverted compared with the checks in
    // verify_tlab(), which name the condition expected to hold.
    STOP("assert(t1 != tlab size)");
    should_not_reach_here();

    bind(ok);
    ldr(tsize, Address(post(sp, 16)));
  }
#endif
  // Install the new tlab: start = top = new block, end = start + size -
  // alignment reserve.
  str(top, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
  str(top, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
  add(top, top, t1);
  sub(top, top, (int32_t)ThreadLocalAllocBuffer::alignment_reserve_in_bytes());
  str(top, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
  verify_tlab();
  b(retry);

  return rthread; // for use by caller
}
3835
// Allocate directly from the shared heap top.
//
// Defines obj, preserves var_size_in_bytes.  The size is taken from
// var_size_in_bytes, or from con_size_in_bytes when var_size_in_bytes is
// noreg.  t1, rscratch1 and rscratch2 are overwritten.  Branches to
// slow_case when the heap does not support inline contiguous allocation,
// when obj + size wraps around, or when it would exceed the heap end.
// The heap top is updated with a load-exclusive/store-exclusive pair and
// the whole sequence is retried when the store-exclusive reports failure.
void MacroAssembler::eden_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Label& slow_case) {
  assert_different_registers(obj, var_size_in_bytes, t1);
  if (!Universe::heap()->supports_inline_contig_alloc()) {
    b(slow_case);
  } else {
    Register end = t1;
    // rscratch2 holds the heap end here and is reused as the stlxr status
    // register below; the heap end is reloaded at the top of every retry.
    Register heap_end = rscratch2;
    Label retry;
    bind(retry);
    {
      unsigned long offset;
      adrp(rscratch1, ExternalAddress((address) Universe::heap()->end_addr()), offset);
      ldr(heap_end, Address(rscratch1, offset));
    }

    ExternalAddress heap_top((address) Universe::heap()->top_addr());

    // Get the current top of the heap
    {
      unsigned long offset;
      adrp(rscratch1, heap_top, offset);
      // Use add() here after ADRP, rather than lea().
      // lea() does not generate anything if its offset is zero.
      // However, relocs expect to find either an ADD or a load/store
      // insn after an ADRP.  add() always generates an ADD insn, even
      // for add(Rn, Rn, 0).
      add(rscratch1, rscratch1, offset);
      ldaxr(obj, rscratch1);
    }

    // Adjust it by the size of our new object
    if (var_size_in_bytes == noreg) {
      lea(end, Address(obj, con_size_in_bytes));
    } else {
      lea(end, Address(obj, var_size_in_bytes));
    }

    // if end < obj then we wrapped around high memory
    cmp(end, obj);
    br(Assembler::LO, slow_case);

    cmp(end, heap_end);
    br(Assembler::HI, slow_case);

    // If heap_top hasn't been changed by some other thread, update it.
    // rscratch2 receives the store-exclusive status; non-zero means retry.
    stlxr(rscratch2, end, rscratch1);
    cbnzw(rscratch2, retry);
  }
}
3890
// Debug-only consistency check of the current thread's TLAB pointers.
// Emits nothing unless built with ASSERT and both UseTLAB and VerifyOops
// are set.  The emitted code stops the VM unless start <= top <= end
// (unsigned comparisons); rscratch1 and rscratch2 are saved on the stack
// and restored, so no register is changed on the success path.
void MacroAssembler::verify_tlab() {
#ifdef ASSERT
  if (UseTLAB && VerifyOops) {
    Label next, ok;

    stp(rscratch2, rscratch1, Address(pre(sp, -16)));

    // check top >= start
    ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
    ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
    cmp(rscratch2, rscratch1);
    br(Assembler::HS, next);
    STOP("assert(top >= start)");
    should_not_reach_here();

    // check end >= top
    bind(next);
    ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
    ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
    cmp(rscratch2, rscratch1);
    br(Assembler::HS, ok);
    STOP("assert(top <= end)");
    should_not_reach_here();

    bind(ok);
    ldp(rscratch2, rscratch1, Address(post(sp, 16)));
  }
#endif
}
3918
// Writes to stack successive pages until offset reached to check for
// stack overflow + shadow pages.  This clobbers tmp; size is also modified
// (it is counted down by the page size in the loop) and rscratch1 is
// overwritten with the page size.
void MacroAssembler::bang_stack_size(Register size, Register tmp) {
  assert_different_registers(tmp, size, rscratch1);
  mov(tmp, sp);
  // Bang stack for total size given plus shadow page size.
  // Bang one page at a time because large size can bang beyond yellow and
  // red zones.
  Label loop;
  mov(rscratch1, os::vm_page_size());
  bind(loop);
  lea(tmp, Address(tmp, -os::vm_page_size()));
  subsw(size, size, rscratch1);
  // The branch below tests the flags set by subsw above.
  str(size, Address(tmp));
  br(Assembler::GT, loop);

  // Bang down shadow pages too.
  // At this point, tmp is the last address touched, so don't touch it
  // again.  Each iteration below first moves tmp down by one page and then
  // stores to it, for StackShadowPages-1 further pages.  N.B.  It is
  // important to touch all the way down to the last of these pages.
  for (int i = 0; i< StackShadowPages-1; i++) {
    // this could be any sized move, but it can serve as a debugging crumb
    // so the bigger the better.
    lea(tmp, Address(tmp, -os::vm_page_size()));
    str(size, Address(tmp));
  }
}
3948
3949
// Emit a read of the polling page at address page: adrp forms the page
// address in r, then a 32-bit load into zr (result discarded) reads it at
// the remaining byte offset.  The load is tagged with a relocation of type
// rtype.  Returns the address of the load instruction.
address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
  unsigned long off;
  adrp(r, Address(page, rtype), off);
  // Mark taken after the adrp, so inst_mark() is the address of the ldrw.
  InstructionMark im(this);
  code_section()->relocate(inst_mark(), rtype);
  ldrw(zr, Address(r, off));
  return inst_mark();
}
3958
// Emit a read of the polling page whose address is already in r: a 32-bit
// load from [r] into zr (result discarded), tagged with a relocation of
// type rtype.  Returns the address of the load instruction.
address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
  InstructionMark im(this);
  code_section()->relocate(inst_mark(), rtype);
  ldrw(zr, Address(r, 0));
  return inst_mark();
}
3965
3966void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
3967  relocInfo::relocType rtype = dest.rspec().reloc()->type();
3968  unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
3969  unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
3970  unsigned long dest_page = (unsigned long)dest.target() >> 12;
3971  long offset_low = dest_page - low_page;
3972  long offset_high = dest_page - high_page;
3973
3974  InstructionMark im(this);
3975  code_section()->relocate(inst_mark(), dest.rspec());
3976  // 8143067: Ensure that the adrp can reach the dest from anywhere within
3977  // the code cache so that if it is relocated we know it will still reach
3978  if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
3979    _adrp(reg1, dest.target());
3980  } else {
3981    unsigned long pc_page = (unsigned long)pc() >> 12;
3982    long offset = dest_page - pc_page;
3983    offset = (offset & ((1<<20)-1)) << 12;
3984    _adrp(reg1, pc()+offset);
3985    movk(reg1, ((unsigned long)dest.target() >> 32) & 0xffff, 32);
3986  }
3987  byte_offset = (unsigned long)dest.target() & 0xfff;
3988}
3989
3990void MacroAssembler::build_frame(int framesize) {
3991  assert(framesize > 0, "framesize must be > 0");
3992  if (framesize < ((1 << 9) + 2 * wordSize)) {
3993    sub(sp, sp, framesize);
3994    stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
3995    if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
3996  } else {
3997    stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
3998    if (PreserveFramePointer) mov(rfp, sp);
3999    if (framesize < ((1 << 12) + 2 * wordSize))
4000      sub(sp, sp, framesize - 2 * wordSize);
4001    else {
4002      mov(rscratch1, framesize - 2 * wordSize);
4003      sub(sp, sp, rscratch1);
4004    }
4005  }
4006}
4007
4008void MacroAssembler::remove_frame(int framesize) {
4009  assert(framesize > 0, "framesize must be > 0");
4010  if (framesize < ((1 << 9) + 2 * wordSize)) {
4011    ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4012    add(sp, sp, framesize);
4013  } else {
4014    if (framesize < ((1 << 12) + 2 * wordSize))
4015      add(sp, sp, framesize - 2 * wordSize);
4016    else {
4017      mov(rscratch1, framesize - 2 * wordSize);
4018      add(sp, sp, rscratch1);
4019    }
4020    ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
4021  }
4022}
4023
4024
4025// Search for str1 in str2 and return index or -1
4026void MacroAssembler::string_indexof(Register str2, Register str1,
4027                                    Register cnt2, Register cnt1,
4028                                    Register tmp1, Register tmp2,
4029                                    Register tmp3, Register tmp4,
4030                                    int icnt1, Register result) {
4031  Label BM, LINEARSEARCH, DONE, NOMATCH, MATCH;
4032
4033  Register ch1 = rscratch1;
4034  Register ch2 = rscratch2;
4035  Register cnt1tmp = tmp1;
4036  Register cnt2tmp = tmp2;
4037  Register cnt1_neg = cnt1;
4038  Register cnt2_neg = cnt2;
4039  Register result_tmp = tmp4;
4040
4041  // Note, inline_string_indexOf() generates checks:
4042  // if (substr.count > string.count) return -1;
4043  // if (substr.count == 0) return 0;
4044
4045// We have two strings, a source string in str2, cnt2 and a pattern string
4046// in str1, cnt1. Find the 1st occurence of pattern in source or return -1.
4047
4048// For larger pattern and source we use a simplified Boyer Moore algorithm.
4049// With a small pattern and source we use linear scan.
4050
4051  if (icnt1 == -1) {
4052    cmp(cnt1, 256);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
4053    ccmp(cnt1, 8, 0b0000, LO);  // Can't handle skip >= 256 because we use
4054    br(LO, LINEARSEARCH);       // a byte array.
4055    cmp(cnt1, cnt2, LSR, 2);    // Source must be 4 * pattern for BM
4056    br(HS, LINEARSEARCH);
4057  }
4058
4059// The Boyer Moore alogorithm is based on the description here:-
4060//
4061// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
4062//
4063// This describes and algorithm with 2 shift rules. The 'Bad Character' rule
4064// and the 'Good Suffix' rule.
4065//
4066// These rules are essentially heuristics for how far we can shift the
4067// pattern along the search string.
4068//
4069// The implementation here uses the 'Bad Character' rule only because of the
4070// complexity of initialisation for the 'Good Suffix' rule.
4071//
4072// This is also known as the Boyer-Moore-Horspool algorithm:-
4073//
4074// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
4075//
4076// #define ASIZE 128
4077//
4078//    int bm(unsigned char *x, int m, unsigned char *y, int n) {
4079//       int i, j;
4080//       unsigned c;
4081//       unsigned char bc[ASIZE];
4082//
4083//       /* Preprocessing */
4084//       for (i = 0; i < ASIZE; ++i)
4085//          bc[i] = 0;
4086//       for (i = 0; i < m - 1; ) {
4087//          c = x[i];
4088//          ++i;
4089//          if (c < ASIZE) bc[c] = i;
4090//       }
4091//
4092//       /* Searching */
4093//       j = 0;
4094//       while (j <= n - m) {
4095//          c = y[i+j];
4096//          if (x[m-1] == c)
4097//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
4098//          if (i < 0) return j;
4099//          if (c < ASIZE)
4100//            j = j - bc[y[j+m-1]] + m;
4101//          else
4102//            j += 1; // Advance by 1 only if char >= ASIZE
4103//       }
4104//    }
4105
4106  if (icnt1 == -1) {
4107    BIND(BM);
4108
4109    Label ZLOOP, BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP;
4110    Label BMADV, BMMATCH, BMCHECKEND;
4111
4112    Register cnt1end = tmp2;
4113    Register str2end = cnt2;
4114    Register skipch = tmp2;
4115
4116    // Restrict ASIZE to 128 to reduce stack space/initialisation.
4117    // The presence of chars >= ASIZE in the target string does not affect
4118    // performance, but we must be careful not to initialise them in the stack
4119    // array.
4120    // The presence of chars >= ASIZE in the source string may adversely affect
4121    // performance since we can only advance by one when we encounter one.
4122
4123      stp(zr, zr, pre(sp, -128));
4124      for (int i = 1; i < 8; i++)
4125          stp(zr, zr, Address(sp, i*16));
4126
4127      mov(cnt1tmp, 0);
4128      sub(cnt1end, cnt1, 1);
4129    BIND(BCLOOP);
4130      ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
4131      cmp(ch1, 128);
4132      add(cnt1tmp, cnt1tmp, 1);
4133      br(HS, BCSKIP);
4134      strb(cnt1tmp, Address(sp, ch1));
4135    BIND(BCSKIP);
4136      cmp(cnt1tmp, cnt1end);
4137      br(LT, BCLOOP);
4138
4139      mov(result_tmp, str2);
4140
4141      sub(cnt2, cnt2, cnt1);
4142      add(str2end, str2, cnt2, LSL, 1);
4143    BIND(BMLOOPSTR2);
4144      sub(cnt1tmp, cnt1, 1);
4145      ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
4146      ldrh(skipch, Address(str2, cnt1tmp, Address::lsl(1)));
4147      cmp(ch1, skipch);
4148      br(NE, BMSKIP);
4149      subs(cnt1tmp, cnt1tmp, 1);
4150      br(LT, BMMATCH);
4151    BIND(BMLOOPSTR1);
4152      ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
4153      ldrh(ch2, Address(str2, cnt1tmp, Address::lsl(1)));
4154      cmp(ch1, ch2);
4155      br(NE, BMSKIP);
4156      subs(cnt1tmp, cnt1tmp, 1);
4157      br(GE, BMLOOPSTR1);
4158    BIND(BMMATCH);
4159      sub(result_tmp, str2, result_tmp);
4160      lsr(result, result_tmp, 1);
4161      add(sp, sp, 128);
4162      b(DONE);
4163    BIND(BMADV);
4164      add(str2, str2, 2);
4165      b(BMCHECKEND);
4166    BIND(BMSKIP);
4167      cmp(skipch, 128);
4168      br(HS, BMADV);
4169      ldrb(ch2, Address(sp, skipch));
4170      add(str2, str2, cnt1, LSL, 1);
4171      sub(str2, str2, ch2, LSL, 1);
4172    BIND(BMCHECKEND);
4173      cmp(str2, str2end);
4174      br(LE, BMLOOPSTR2);
4175      add(sp, sp, 128);
4176      b(NOMATCH);
4177  }
4178
4179  BIND(LINEARSEARCH);
4180  {
4181    Label DO1, DO2, DO3;
4182
4183    Register str2tmp = tmp2;
4184    Register first = tmp3;
4185
4186    if (icnt1 == -1)
4187    {
4188        Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT, LAST_WORD;
4189
4190        cmp(cnt1, 4);
4191        br(LT, DOSHORT);
4192
4193        sub(cnt2, cnt2, cnt1);
4194        sub(cnt1, cnt1, 4);
4195        mov(result_tmp, cnt2);
4196
4197        lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4198        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
4199        sub(cnt1_neg, zr, cnt1, LSL, 1);
4200        sub(cnt2_neg, zr, cnt2, LSL, 1);
4201        ldr(first, Address(str1, cnt1_neg));
4202
4203      BIND(FIRST_LOOP);
4204        ldr(ch2, Address(str2, cnt2_neg));
4205        cmp(first, ch2);
4206        br(EQ, STR1_LOOP);
4207      BIND(STR2_NEXT);
4208        adds(cnt2_neg, cnt2_neg, 2);
4209        br(LE, FIRST_LOOP);
4210        b(NOMATCH);
4211
4212      BIND(STR1_LOOP);
4213        adds(cnt1tmp, cnt1_neg, 8);
4214        add(cnt2tmp, cnt2_neg, 8);
4215        br(GE, LAST_WORD);
4216
4217      BIND(STR1_NEXT);
4218        ldr(ch1, Address(str1, cnt1tmp));
4219        ldr(ch2, Address(str2, cnt2tmp));
4220        cmp(ch1, ch2);
4221        br(NE, STR2_NEXT);
4222        adds(cnt1tmp, cnt1tmp, 8);
4223        add(cnt2tmp, cnt2tmp, 8);
4224        br(LT, STR1_NEXT);
4225
4226      BIND(LAST_WORD);
4227        ldr(ch1, Address(str1));
4228        sub(str2tmp, str2, cnt1_neg);         // adjust to corresponding
4229        ldr(ch2, Address(str2tmp, cnt2_neg)); // word in str2
4230        cmp(ch1, ch2);
4231        br(NE, STR2_NEXT);
4232        b(MATCH);
4233
4234      BIND(DOSHORT);
4235        cmp(cnt1, 2);
4236        br(LT, DO1);
4237        br(GT, DO3);
4238    }
4239
4240    if (icnt1 == 4) {
4241      Label CH1_LOOP;
4242
4243        ldr(ch1, str1);
4244        sub(cnt2, cnt2, 4);
4245        mov(result_tmp, cnt2);
4246        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
4247        sub(cnt2_neg, zr, cnt2, LSL, 1);
4248
4249      BIND(CH1_LOOP);
4250        ldr(ch2, Address(str2, cnt2_neg));
4251        cmp(ch1, ch2);
4252        br(EQ, MATCH);
4253        adds(cnt2_neg, cnt2_neg, 2);
4254        br(LE, CH1_LOOP);
4255        b(NOMATCH);
4256    }
4257
4258    if (icnt1 == -1 || icnt1 == 2) {
4259      Label CH1_LOOP;
4260
4261      BIND(DO2);
4262        ldrw(ch1, str1);
4263        sub(cnt2, cnt2, 2);
4264        mov(result_tmp, cnt2);
4265        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
4266        sub(cnt2_neg, zr, cnt2, LSL, 1);
4267
4268      BIND(CH1_LOOP);
4269        ldrw(ch2, Address(str2, cnt2_neg));
4270        cmp(ch1, ch2);
4271        br(EQ, MATCH);
4272        adds(cnt2_neg, cnt2_neg, 2);
4273        br(LE, CH1_LOOP);
4274        b(NOMATCH);
4275    }
4276
4277    if (icnt1 == -1 || icnt1 == 3) {
4278      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
4279
4280      BIND(DO3);
4281        ldrw(first, str1);
4282        ldrh(ch1, Address(str1, 4));
4283
4284        sub(cnt2, cnt2, 3);
4285        mov(result_tmp, cnt2);
4286        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
4287        sub(cnt2_neg, zr, cnt2, LSL, 1);
4288
4289      BIND(FIRST_LOOP);
4290        ldrw(ch2, Address(str2, cnt2_neg));
4291        cmpw(first, ch2);
4292        br(EQ, STR1_LOOP);
4293      BIND(STR2_NEXT);
4294        adds(cnt2_neg, cnt2_neg, 2);
4295        br(LE, FIRST_LOOP);
4296        b(NOMATCH);
4297
4298      BIND(STR1_LOOP);
4299        add(cnt2tmp, cnt2_neg, 4);
4300        ldrh(ch2, Address(str2, cnt2tmp));
4301        cmp(ch1, ch2);
4302        br(NE, STR2_NEXT);
4303        b(MATCH);
4304    }
4305
4306    if (icnt1 == -1 || icnt1 == 1) {
4307      Label CH1_LOOP, HAS_ZERO;
4308      Label DO1_SHORT, DO1_LOOP;
4309
4310      BIND(DO1);
4311        ldrh(ch1, str1);
4312        cmp(cnt2, 4);
4313        br(LT, DO1_SHORT);
4314
4315        orr(ch1, ch1, ch1, LSL, 16);
4316        orr(ch1, ch1, ch1, LSL, 32);
4317
4318        sub(cnt2, cnt2, 4);
4319        mov(result_tmp, cnt2);
4320        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
4321        sub(cnt2_neg, zr, cnt2, LSL, 1);
4322
4323        mov(tmp3, 0x0001000100010001);
4324      BIND(CH1_LOOP);
4325        ldr(ch2, Address(str2, cnt2_neg));
4326        eor(ch2, ch1, ch2);
4327        sub(tmp1, ch2, tmp3);
4328        orr(tmp2, ch2, 0x7fff7fff7fff7fff);
4329        bics(tmp1, tmp1, tmp2);
4330        br(NE, HAS_ZERO);
4331        adds(cnt2_neg, cnt2_neg, 8);
4332        br(LT, CH1_LOOP);
4333
4334        cmp(cnt2_neg, 8);
4335        mov(cnt2_neg, 0);
4336        br(LT, CH1_LOOP);
4337        b(NOMATCH);
4338
4339      BIND(HAS_ZERO);
4340        rev(tmp1, tmp1);
4341        clz(tmp1, tmp1);
4342        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
4343        b(MATCH);
4344
4345      BIND(DO1_SHORT);
4346        mov(result_tmp, cnt2);
4347        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
4348        sub(cnt2_neg, zr, cnt2, LSL, 1);
4349      BIND(DO1_LOOP);
4350        ldrh(ch2, Address(str2, cnt2_neg));
4351        cmpw(ch1, ch2);
4352        br(EQ, MATCH);
4353        adds(cnt2_neg, cnt2_neg, 2);
4354        br(LT, DO1_LOOP);
4355    }
4356  }
4357  BIND(NOMATCH);
4358    mov(result, -1);
4359    b(DONE);
4360  BIND(MATCH);
4361    add(result, result_tmp, cnt2_neg, ASR, 1);
4362  BIND(DONE);
4363}
4364
4365// Compare strings.
//
// Emits code that compares two sequences of 16-bit elements and leaves a
// 32-bit comparison value in result:
//  - if the sequences differ within their first min(cnt1, cnt2) elements,
//    the element from str1 minus the element from str2 at the first
//    differing position;
//  - otherwise cnt1 - cnt2.
//
//   str1, str2  addresses of the first element of each sequence
//   cnt1, cnt2  element counts, used as 32-bit values
//   result      receives the comparison value
//   tmp1        keeps cnt1 - cnt2 until the end of the emitted code
//
// The emitted code may overwrite str1, str2, cnt1, cnt2, tmp1, rscratch2
// and the condition flags.
4366void MacroAssembler::string_compare(Register str1, Register str2,
4367                                    Register cnt1, Register cnt2, Register result,
4368                                    Register tmp1) {
4369  Label LENGTH_DIFF, DONE, SHORT_LOOP, SHORT_STRING,
4370    NEXT_WORD, DIFFERENCE;
4371
4372  BLOCK_COMMENT("string_compare {");
4373
4374  // Compute the minimum of the string lengths and save the difference.
4375  subsw(tmp1, cnt1, cnt2);                // tmp1 = cnt1 - cnt2, sets the flags
4376  cselw(cnt2, cnt1, cnt2, Assembler::LE); // cnt2 = min(cnt1, cnt2)
4377
4378  // A very short string: fewer than 4 elements do not fill one 8-byte
      // longword, so they are compared one element at a time.
4379  cmpw(cnt2, 4);
4380  br(Assembler::LT, SHORT_STRING);
4381
4382  // Check if the strings start at the same location.  If they do, the
      // common prefix is equal by construction and only the lengths matter.
4383  cmp(str1, str2);
4384  br(Assembler::EQ, LENGTH_DIFF);
4385
4386  // Compare longwords
4387  {
4388    subw(cnt2, cnt2, 4); // The last longword is a special case
4389
4390    // Move both string pointers to the last longword of their
4391    // strings, negate the remaining count, and convert it to bytes.
4392    lea(str1, Address(str1, cnt2, Address::uxtw(1)));
4393    lea(str2, Address(str2, cnt2, Address::uxtw(1)));
4394    sub(cnt2, zr, cnt2, LSL, 1);
4395
4396    // Loop, loading longwords and comparing them into rscratch2.
        // cnt2 is now a non-positive byte offset that counts up towards
        // zero.  The flags produced by adds are consumed by the br at the
        // bottom of the loop; the eor and cbnz in between do not write them.
4397    bind(NEXT_WORD);
4398    ldr(result, Address(str1, cnt2));
4399    ldr(cnt1, Address(str2, cnt2));   // cnt1 is free to reuse: the length difference is in tmp1
4400    adds(cnt2, cnt2, wordSize);
4401    eor(rscratch2, result, cnt1);     // non-zero iff the two longwords differ
4402    cbnz(rscratch2, DIFFERENCE);
4403    br(Assembler::LT, NEXT_WORD);
4404
4405    // Last longword.  In the case where length == 4 we compare the
4406    // same longword twice, but that's still faster than another
4407    // conditional branch.
4408
4409    ldr(result, Address(str1));
4410    ldr(cnt1, Address(str2));
4411    eor(rscratch2, result, cnt1);
4412    cbz(rscratch2, LENGTH_DIFF);
4413
4414    // Find the first different characters in the longwords and
4415    // compute their difference.
        // On entry rscratch2 holds the XOR of the two longwords, which are
        // still in result and cnt1.
4416    bind(DIFFERENCE);
4417    rev(rscratch2, rscratch2);        // byte-reverse so the lowest-addressed byte becomes the most
                                          // significant one (assumes a little-endian load)
4418    clz(rscratch2, rscratch2);        // bit distance from the start of the longword to the first set XOR bit
4419    andr(rscratch2, rscratch2, -16);  // round down to a multiple of 16: bit offset of the differing element
4420    lsrv(result, result, rscratch2);  // move that element of str1 to the low 16 bits ...
4421    uxthw(result, result);            // ... and zero-extend it
4422    lsrv(cnt1, cnt1, rscratch2);      // same for the element of str2
4423    uxthw(cnt1, cnt1);
4424    subw(result, result, cnt1);
4425    b(DONE);
4426  }
4427
4428  bind(SHORT_STRING);
4429  // Is the minimum length zero?
4430  cbz(cnt2, LENGTH_DIFF);
4431
      // One element per iteration; both pointers are post-incremented by
      // the element size (2 bytes).  Leaves through DONE with the element
      // difference in result as soon as it is non-zero.
4432  bind(SHORT_LOOP);
4433  load_unsigned_short(result, Address(post(str1, 2)));
4434  load_unsigned_short(cnt1, Address(post(str2, 2)));
4435  subw(result, result, cnt1);
4436  cbnz(result, DONE);
4437  sub(cnt2, cnt2, 1);
4438  cbnz(cnt2, SHORT_LOOP);
4439
4440  // Strings are equal up to min length.  Return the length difference.
4441  bind(LENGTH_DIFF);
4442  mov(result, tmp1);
4443
4444  // That's it
4445  bind(DONE);
4446
4447  BLOCK_COMMENT("} string_compare");
4448}
4449
4450
// Emits code that tests two sequences of 16-bit elements of the same length
// for equality.  result is set to 1 when all cnt elements match and to 0
// otherwise.
//
//   str1, str2  addresses of the first element of each sequence
//   cnt         number of elements to compare
//   result      receives 1 (equal) or 0 (different)
//   tmp1        scratch register
//
// rscratch1 and rscratch2 are used as additional scratch registers and must
// not be passed as any of the arguments (asserted below).  The emitted code
// may overwrite str1, str2, cnt, tmp1 and the condition flags.
4451void MacroAssembler::string_equals(Register str1, Register str2,
4452                                   Register cnt, Register result,
4453                                   Register tmp1) {
4454  Label SAME_CHARS, DONE, SHORT_LOOP, SHORT_STRING,
4455    NEXT_WORD;
4456
4457  const Register tmp2 = rscratch1;
4458  assert_different_registers(str1, str2, cnt, result, tmp1, tmp2, rscratch2);
4459
4460  BLOCK_COMMENT("string_equals {");
4461
4462  // Start by assuming that the strings are not equal.
      // Every mismatch path below branches straight to DONE and relies on
      // this value.
4463  mov(result, zr);
4464
4465  // A very short string: fewer than 4 elements do not fill one 8-byte
      // longword, so they are compared one element at a time.
4466  cmpw(cnt, 4);
4467  br(Assembler::LT, SHORT_STRING);
4468
4469  // Check if the strings start at the same location.
4470  cmp(str1, str2);
4471  br(Assembler::EQ, SAME_CHARS);
4472
4473  // Compare longwords
4474  {
4475    subw(cnt, cnt, 4); // The last longword is a special case
4476
4477    // Move both string pointers to the last longword of their
4478    // strings, negate the remaining count, and convert it to bytes.
4479    lea(str1, Address(str1, cnt, Address::uxtw(1)));
4480    lea(str2, Address(str2, cnt, Address::uxtw(1)));
4481    sub(cnt, zr, cnt, LSL, 1);
4482
4483    // Loop, loading longwords and comparing them into rscratch2.
        // cnt is now a non-positive byte offset that counts up towards zero.
        // The flags produced by adds are consumed by the br at the bottom of
        // the loop; the eor and cbnz in between do not write them.
4484    bind(NEXT_WORD);
4485    ldr(tmp1, Address(str1, cnt));
4486    ldr(tmp2, Address(str2, cnt));
4487    adds(cnt, cnt, wordSize);
4488    eor(rscratch2, tmp1, tmp2);       // non-zero iff the two longwords differ
4489    cbnz(rscratch2, DONE);            // mismatch: result is still 0
4490    br(Assembler::LT, NEXT_WORD);
4491
4492    // Last longword.  In the case where length == 4 we compare the
4493    // same longword twice, but that's still faster than another
4494    // conditional branch.
4495
4496    ldr(tmp1, Address(str1));
4497    ldr(tmp2, Address(str2));
4498    eor(rscratch2, tmp1, tmp2);
4499    cbz(rscratch2, SAME_CHARS);
4500    b(DONE);
4501  }
4502
4503  bind(SHORT_STRING);
4504  // Is the length zero?
4505  cbz(cnt, SAME_CHARS);
4506
      // One element per iteration; both pointers are post-incremented by
      // the element size (2 bytes).
4507  bind(SHORT_LOOP);
4508  load_unsigned_short(tmp1, Address(post(str1, 2)));
4509  load_unsigned_short(tmp2, Address(post(str2, 2)));
4510  subw(tmp1, tmp1, tmp2);
4511  cbnz(tmp1, DONE);                   // mismatch: result is still 0
4512  sub(cnt, cnt, 1);
4513  cbnz(cnt, SHORT_LOOP);
4514
4515  // Strings are equal.
4516  bind(SAME_CHARS);
4517  mov(result, true);
4518
4519  // That's it
4520  bind(DONE);
4521
4522  BLOCK_COMMENT("} string_equals");
4523}
4524
4525// Compare char[] arrays aligned to 4 bytes
//
// Emits code that sets result to true when ary1 and ary2 are the same
// reference, or are both non-null arrays with equal length fields and equal
// contents; otherwise result is false.
//
//   ary1, ary2  array references; either may be null
//   result      receives true or false
//   tmp1        scratch register
//
// rscratch1 and rscratch2 are used as scratch registers.  The emitted code
// may overwrite ary1, ary2, tmp1 and the condition flags.
4526void MacroAssembler::char_arrays_equals(Register ary1, Register ary2,
4527                                        Register result, Register tmp1)
4528{
4529  Register cnt1 = rscratch1;
4530  Register cnt2 = rscratch2;
      // tmp2 shares rscratch2 with cnt2: cnt2 is only read by the length
      // comparison, and tmp2 is first written after it.
4531  Register tmp2 = rscratch2;
4532
4533  Label SAME, DIFFER, NEXT, TAIL03, TAIL01;
4534
4535  int length_offset  = arrayOopDesc::length_offset_in_bytes();
4536  int base_offset    = arrayOopDesc::base_offset_in_bytes(T_CHAR);
4537
4538  BLOCK_COMMENT("char_arrays_equals  {");
4539
4540    // different until proven equal
4541    mov(result, false);
4542
4543    // same array?
4544    cmp(ary1, ary2);
4545    br(Assembler::EQ, SAME);
4546
4547    // ne if either null
        // (both being null was already handled by the identity check above)
4548    cbz(ary1, DIFFER);
4549    cbz(ary2, DIFFER);
4550
4551    // lengths ne?
4552    ldrw(cnt1, Address(ary1, length_offset));
4553    ldrw(cnt2, Address(ary2, length_offset));
4554    cmp(cnt1, cnt2);
4555    br(Assembler::NE, DIFFER);
4556
        // Advance both registers from the array header to the first element.
4557    lea(ary1, Address(ary1, base_offset));
4558    lea(ary2, Address(ary2, base_offset));
4559
        // From here on cnt1 holds (elements left - 4); a negative value
        // means fewer than 4 elements remain.
4560    subs(cnt1, cnt1, 4);
4561    br(LT, TAIL03);
4562
        // Main loop: 8 bytes (4 elements) per iteration.  The flags set by
        // subs are consumed by the br at the bottom; the eor and cbnz in
        // between do not write them.
4563  BIND(NEXT);
4564    ldr(tmp1, Address(post(ary1, 8)));
4565    ldr(tmp2, Address(post(ary2, 8)));
4566    subs(cnt1, cnt1, 4);
4567    eor(tmp1, tmp1, tmp2);
4568    cbnz(tmp1, DIFFER);
4569    br(GE, NEXT);
4570
        // Subtracting 4 does not change the two low bits, so bits 1 and 0 of
        // cnt1 are the same as those of the number of elements left.
4571  BIND(TAIL03);  // 0-3 chars left, cnt1 = #chars left - 4
4572    tst(cnt1, 0b10);
4573    br(EQ, TAIL01);
        // bit 1 set: at least 2 elements left, compare them as one 4-byte word
4574    ldrw(tmp1, Address(post(ary1, 4)));
4575    ldrw(tmp2, Address(post(ary2, 4)));
4576    cmp(tmp1, tmp2);
4577    br(NE, DIFFER);
4578  BIND(TAIL01);  // 0-1 chars left
4579    tst(cnt1, 0b01);
4580    br(EQ, SAME);
        // bit 0 set: one final 2-byte element
4581    ldrh(tmp1, ary1);
4582    ldrh(tmp2, ary2);
4583    cmp(tmp1, tmp2);
4584    br(NE, DIFFER);
4585
4586  BIND(SAME);
4587    mov(result, true);
4588  BIND(DIFFER); // result already set
4589
4590  BLOCK_COMMENT("} char_arrays_equals");
4591}
4592
4593// encode char[] to byte[] in ISO_8859_1
//
// Emits code that copies 16-bit elements from src to dst as single bytes,
// stopping at the first element that has any of the bits 0xff00 set.
//
//   src     address of the first 16-bit source element
//   dst     address of the first destination byte
//   len     number of elements to convert
//   result  receives the number of elements converted before stopping
//           (equal to the initial len when every element fits in a byte)
//   Vtmp1..Vtmp4  vector scratch registers
//
// Unless BUILTIN_SIM is defined, the conversion is attempted 32 elements at
// a time, then 8 at a time, and finally one at a time.  A vector step whose
// FPSR reads back non-zero is abandoned before anything is stored or src is
// advanced, and the same elements are retried by the next smaller step.
//
// rscratch1 is used as a scratch register.  The emitted code overwrites
// src, dst, len, the vector temporaries and the condition flags, and
// (unless BUILTIN_SIM is defined) clears and updates the FPSR.
4594void MacroAssembler::encode_iso_array(Register src, Register dst,
4595                      Register len, Register result,
4596                      FloatRegister Vtmp1, FloatRegister Vtmp2,
4597                      FloatRegister Vtmp3, FloatRegister Vtmp4)
4598{
4599    Label DONE, NEXT_32, LOOP_8, NEXT_8, LOOP_1, NEXT_1;
4600    Register tmp1 = rscratch1;
4601
4602      mov(result, len); // Save initial len
4603
4604#ifndef BUILTIN_SIM
          // While in the 32-element loop, len holds (elements left - 32).
4605      subs(len, len, 32);
4606      br(LT, LOOP_8);
4607
4608// The following code uses the SIMD 'uqxtn' and 'uqxtn2' instructions
4609// to convert chars to bytes. These set the 'QC' bit in the FPSR if
4610// any char could not fit in a byte, so clear the FPSR so we can test it.
4611      clear_fpsr();
4612
4613    BIND(NEXT_32);
          // Load 4 x 8 halfwords without advancing src; src is only bumped
          // once the narrowed bytes have been stored.
4614      ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
4615      uqxtn(Vtmp1, T8B, Vtmp1, T8H);  // uqxtn  - write bottom half
4616      uqxtn(Vtmp1, T16B, Vtmp2, T8H); // uqxtn2 - write top half
4617      uqxtn(Vtmp2, T8B, Vtmp3, T8H);
4618      uqxtn(Vtmp2, T16B, Vtmp4, T8H); // uqxtn2
4619      get_fpsr(tmp1);
4620      cbnzw(tmp1, LOOP_8);            // something saturated: retry this chunk in smaller steps
4621      st1(Vtmp1, Vtmp2, T16B, post(dst, 32));
4622      subs(len, len, 32);
4623      add(src, src, 64);              // 32 elements * 2 bytes; add does not disturb the flags from subs
4624      br(GE, NEXT_32);
4625
4626    BIND(LOOP_8);
          // Rebias len from (elements left - 32) to (elements left - 8).
4627      adds(len, len, 32-8);
4628      br(LT, LOOP_1);
4629      clear_fpsr(); // QC may be set from loop above, clear again
4630    BIND(NEXT_8);
4631      ld1(Vtmp1, T8H, src);
4632      uqxtn(Vtmp1, T8B, Vtmp1, T8H);
4633      get_fpsr(tmp1);
4634      cbnzw(tmp1, LOOP_1);            // something saturated: retry this chunk one element at a time
4635      st1(Vtmp1, T8B, post(dst, 8));
4636      subs(len, len, 8);
4637      add(src, src, 16);              // 8 elements * 2 bytes
4638      br(GE, NEXT_8);
4639
4640    BIND(LOOP_1);
          // Rebias len from (elements left - 8) to the plain number of
          // elements left.
4641      adds(len, len, 8);
4642      br(LE, DONE);
4643#else
4644      cbz(len, DONE);
4645#endif
4646    BIND(NEXT_1);
4647      ldrh(tmp1, Address(post(src, 2)));
4648      tst(tmp1, 0xff00);
4649      br(NE, DONE);                   // does not fit in a byte: stop, len still counts this element
4650      strb(tmp1, Address(post(dst, 1)));
4651      subs(len, len, 1);
4652      br(GT, NEXT_1);
4653
4654    BIND(DONE);
4655      sub(result, result, len); // Return index where we stopped
4656}
4657
4658// get_thread() can be called anywhere inside generated code so we
4659// need to save whatever non-callee save context might get clobbered
4660// by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
4661// the call setup code.
4662//
4663// aarch64_get_thread_helper() clobbers only r0, r1, and flags.
4664//
// Emits a call to JavaThread::aarch64_get_thread_helper() and moves the
// value it leaves in c_rarg0 into dst.  r0, r1 and lr are saved on the
// stack around the call, except for dst itself.  The condition flags are
// not saved.
4665void MacroAssembler::get_thread(Register dst) {
      // dst is removed from the save set so that the pop at the end does not
      // overwrite the result.
4666  RegSet saved_regs = RegSet::range(r0, r1) + lr - dst;
4667  push(saved_regs, sp);
4668
      // lr doubles as the register holding the call target; it is part of
      // the save set (unless it is dst).
4669  mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
      // NOTE(review): the numeric arguments to blrt presumably describe the
      // callee's argument/return signature - confirm against blrt's declaration.
4670  blrt(lr, 1, 0, 1);
4671  if (dst != c_rarg0) {
4672    mov(dst, c_rarg0);
4673  }
4674
4675  pop(saved_regs, sp);
4676}
4677