assembler_x86.cpp revision 3724:8e47bac5643a
/*
 * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "assembler_x86.inline.hpp"
#include "gc_interface/collectedHeap.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/cardTableModRefBS.hpp"
#include "memory/resourceArea.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/interfaceSupport.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/os.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#ifndef SERIALGC
#include "gc_implementation/g1/g1CollectedHeap.inline.hpp"
#include "gc_implementation/g1/g1SATBCardTableModRefBS.hpp"
#include "gc_implementation/g1/heapRegion.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
// Implementation of AddressLiteral

AddressLiteral::AddressLiteral(address target, relocInfo::relocType rtype) {
  _is_lval = false;
  _target = target;
  switch (rtype) {
  case relocInfo::oop_type:
  case relocInfo::metadata_type:
    // Oops are a special case. Normally they would go in their own section,
    // but in cases like icBuffer they are literals in the code stream for
    // which we have no section. We use relocInfo::none so that we get a
    // literal address, which is always patchable.
    break;
  case relocInfo::external_word_type:
    _rspec = external_word_Relocation::spec(target);
    break;
  case relocInfo::internal_word_type:
    _rspec = internal_word_Relocation::spec(target);
    break;
  case relocInfo::opt_virtual_call_type:
    _rspec = opt_virtual_call_Relocation::spec();
    break;
  case relocInfo::static_call_type:
    _rspec = static_call_Relocation::spec();
    break;
  case relocInfo::runtime_call_type:
    _rspec = runtime_call_Relocation::spec();
    break;
  case relocInfo::poll_type:
  case relocInfo::poll_return_type:
    _rspec = Relocation::spec_simple(rtype);
    break;
  case relocInfo::none:
    break;
  default:
    ShouldNotReachHere();
    break;
  }
}
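
// Illustrative use (a sketch; `some_runtime_entry` stands for any real
// runtime stub address and is hypothetical here):
//   AddressLiteral lit(some_runtime_entry, relocInfo::runtime_call_type);
// records both the raw target and a runtime_call relocation spec, so the
// embedded address can be fixed up if the containing code blob moves.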

// Implementation of Address

#ifdef _LP64

Address Address::make_array(ArrayAddress adr) {
  // Not implementable on 64bit machines
  // Should have been handled higher up the call chain.
  ShouldNotReachHere();
  return Address();
}

// exceedingly dangerous constructor
Address::Address(int disp, address loc, relocInfo::relocType rtype) {
  _base  = noreg;
  _index = noreg;
  _scale = no_scale;
  _disp  = disp;
  switch (rtype) {
    case relocInfo::external_word_type:
      _rspec = external_word_Relocation::spec(loc);
      break;
    case relocInfo::internal_word_type:
      _rspec = internal_word_Relocation::spec(loc);
      break;
    case relocInfo::runtime_call_type:
      // HMM
      _rspec = runtime_call_Relocation::spec();
      break;
    case relocInfo::poll_type:
    case relocInfo::poll_return_type:
      _rspec = Relocation::spec_simple(rtype);
      break;
    case relocInfo::none:
      break;
    default:
      ShouldNotReachHere();
  }
}
#else // LP64

Address Address::make_array(ArrayAddress adr) {
  AddressLiteral base = adr.base();
  Address index = adr.index();
  assert(index._disp == 0, "must not have disp"); // maybe it can?
  Address array(index._base, index._index, index._scale, (intptr_t) base.target());
  array._rspec = base._rspec;
  return array;
}

// exceedingly dangerous constructor
Address::Address(address loc, RelocationHolder spec) {
  _base  = noreg;
  _index = noreg;
  _scale = no_scale;
  _disp  = (intptr_t) loc;
  _rspec = spec;
}

#endif // _LP64



// Convert the raw encoding form into the form expected by the constructor for
// Address.  An index of 4 (rsp) corresponds to having no index, so convert
// that to noreg for the Address constructor.
Address Address::make_raw(int base, int index, int scale, int disp, relocInfo::relocType disp_reloc) {
  RelocationHolder rspec;
  if (disp_reloc != relocInfo::none) {
    rspec = Relocation::spec_simple(disp_reloc);
  }
  bool valid_index = index != rsp->encoding();
  if (valid_index) {
    Address madr(as_Register(base), as_Register(index), (Address::ScaleFactor)scale, in_ByteSize(disp));
    madr._rspec = rspec;
    return madr;
  } else {
    Address madr(as_Register(base), noreg, Address::no_scale, in_ByteSize(disp));
    madr._rspec = rspec;
    return madr;
  }
}
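
// Illustrative (a sketch, using the standard register encodings rbx = 3,
// rsp = 4): because the SIB byte reserves index 100b to mean "no index",
//   make_raw(3 /*rbx*/, 4 /*rsp*/, 0, 8, relocInfo::none)
// yields the same operand as Address(rbx, noreg, Address::no_scale, 8),
// i.e. [rbx + 8].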

// Implementation of Assembler

int AbstractAssembler::code_fill_byte() {
  return (u_char)'\xF4'; // hlt
}

// make this go away someday
void Assembler::emit_data(jint data, relocInfo::relocType rtype, int format) {
  if (rtype == relocInfo::none)
        emit_long(data);
  else  emit_data(data, Relocation::spec_simple(rtype), format);
}

void Assembler::emit_data(jint data, RelocationHolder const& rspec, int format) {
  assert(imm_operand == 0, "default format must be immediate in this file");
  assert(inst_mark() != NULL, "must be inside InstructionMark");
  if (rspec.type() !=  relocInfo::none) {
    #ifdef ASSERT
      check_relocation(rspec, format);
    #endif
    // Do not use AbstractAssembler::relocate, which is not intended for
    // embedded words.  Instead, relocate to the enclosing instruction.

    // hack. call32 is too wide for mask so use disp32
    if (format == call32_operand)
      code_section()->relocate(inst_mark(), rspec, disp32_operand);
    else
      code_section()->relocate(inst_mark(), rspec, format);
  }
  emit_long(data);
}

static int encode(Register r) {
  int enc = r->encoding();
  if (enc >= 8) {
    enc -= 8;
  }
  return enc;
}

static int encode(XMMRegister r) {
  int enc = r->encoding();
  if (enc >= 8) {
    enc -= 8;
  }
  return enc;
}
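
// Example (a sketch): on 64-bit, encode(r11) returns 3; the dropped high bit
// of the register number travels in a REX prefix bit (REX.R, REX.X or REX.B,
// depending on which field the register lands in) rather than in the ModRM
// or SIB byte itself.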

void Assembler::emit_arith_b(int op1, int op2, Register dst, int imm8) {
  assert(dst->has_byte_register(), "must have byte register");
  assert(isByte(op1) && isByte(op2), "wrong opcode");
  assert(isByte(imm8), "not a byte");
  assert((op1 & 0x01) == 0, "should be 8bit operation");
  emit_byte(op1);
  emit_byte(op2 | encode(dst));
  emit_byte(imm8);
}


void Assembler::emit_arith(int op1, int op2, Register dst, int32_t imm32) {
  assert(isByte(op1) && isByte(op2), "wrong opcode");
  assert((op1 & 0x01) == 1, "should be 32bit operation");
  assert((op1 & 0x02) == 0, "sign-extension bit should not be set");
  if (is8bit(imm32)) {
    emit_byte(op1 | 0x02); // set sign bit
    emit_byte(op2 | encode(dst));
    emit_byte(imm32 & 0xFF);
  } else {
    emit_byte(op1);
    emit_byte(op2 | encode(dst));
    emit_long(imm32);
  }
}
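
// Illustrative byte sequences (a sketch; rbx encodes as 3):
//   emit_arith(0x81, 0xC0, rbx, 5)          => 83 C3 05            (imm8, sign-extended)
//   emit_arith(0x81, 0xC0, rbx, 0x12345678) => 81 C3 78 56 34 12   (full little-endian imm32)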

// Force generation of a 4 byte immediate value even if it fits into 8bit
void Assembler::emit_arith_imm32(int op1, int op2, Register dst, int32_t imm32) {
  assert(isByte(op1) && isByte(op2), "wrong opcode");
  assert((op1 & 0x01) == 1, "should be 32bit operation");
  assert((op1 & 0x02) == 0, "sign-extension bit should not be set");
  emit_byte(op1);
  emit_byte(op2 | encode(dst));
  emit_long(imm32);
}

// immediate-to-memory forms
void Assembler::emit_arith_operand(int op1, Register rm, Address adr, int32_t imm32) {
  assert((op1 & 0x01) == 1, "should be 32bit operation");
  assert((op1 & 0x02) == 0, "sign-extension bit should not be set");
  if (is8bit(imm32)) {
    emit_byte(op1 | 0x02); // set sign bit
    emit_operand(rm, adr, 1);
    emit_byte(imm32 & 0xFF);
  } else {
    emit_byte(op1);
    emit_operand(rm, adr, 4);
    emit_long(imm32);
  }
}
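
// Illustrative (a sketch, assuming no REX prefix is needed):
// addl(Address(rbx, 0), 5) reaches here as
// emit_arith_operand(0x81, rax, [rbx], 5) and emits
//   83 03 05
// i.e. the sign-extended-imm8 opcode, a [00 000 011] ModRM (reg field = /0
// from rax, base = rbx), then the immediate byte.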


void Assembler::emit_arith(int op1, int op2, Register dst, Register src) {
  assert(isByte(op1) && isByte(op2), "wrong opcode");
  emit_byte(op1);
  emit_byte(op2 | encode(dst) << 3 | encode(src));
}


void Assembler::emit_operand(Register reg, Register base, Register index,
                             Address::ScaleFactor scale, int disp,
                             RelocationHolder const& rspec,
                             int rip_relative_correction) {
  relocInfo::relocType rtype = (relocInfo::relocType) rspec.type();

  // Encode the registers as needed in the fields they are used in

  int regenc = encode(reg) << 3;
  int indexenc = index->is_valid() ? encode(index) << 3 : 0;
  int baseenc = base->is_valid() ? encode(base) : 0;

  if (base->is_valid()) {
    if (index->is_valid()) {
      assert(scale != Address::no_scale, "inconsistent address");
      // [base + index*scale + disp]
      if (disp == 0 && rtype == relocInfo::none  &&
          base != rbp LP64_ONLY(&& base != r13)) {
        // [base + index*scale]
        // [00 reg 100][ss index base]
        assert(index != rsp, "illegal addressing mode");
        emit_byte(0x04 | regenc);
        emit_byte(scale << 6 | indexenc | baseenc);
      } else if (is8bit(disp) && rtype == relocInfo::none) {
        // [base + index*scale + imm8]
        // [01 reg 100][ss index base] imm8
        assert(index != rsp, "illegal addressing mode");
        emit_byte(0x44 | regenc);
        emit_byte(scale << 6 | indexenc | baseenc);
        emit_byte(disp & 0xFF);
      } else {
        // [base + index*scale + disp32]
        // [10 reg 100][ss index base] disp32
        assert(index != rsp, "illegal addressing mode");
        emit_byte(0x84 | regenc);
        emit_byte(scale << 6 | indexenc | baseenc);
        emit_data(disp, rspec, disp32_operand);
      }
    } else if (base == rsp LP64_ONLY(|| base == r12)) {
      // [rsp + disp]
      if (disp == 0 && rtype == relocInfo::none) {
        // [rsp]
        // [00 reg 100][00 100 100]
        emit_byte(0x04 | regenc);
        emit_byte(0x24);
      } else if (is8bit(disp) && rtype == relocInfo::none) {
        // [rsp + imm8]
        // [01 reg 100][00 100 100] disp8
        emit_byte(0x44 | regenc);
        emit_byte(0x24);
        emit_byte(disp & 0xFF);
      } else {
        // [rsp + imm32]
        // [10 reg 100][00 100 100] disp32
        emit_byte(0x84 | regenc);
        emit_byte(0x24);
        emit_data(disp, rspec, disp32_operand);
      }
    } else {
      // [base + disp]
      assert(base != rsp LP64_ONLY(&& base != r12), "illegal addressing mode");
      if (disp == 0 && rtype == relocInfo::none &&
          base != rbp LP64_ONLY(&& base != r13)) {
        // [base]
        // [00 reg base]
        emit_byte(0x00 | regenc | baseenc);
      } else if (is8bit(disp) && rtype == relocInfo::none) {
        // [base + disp8]
        // [01 reg base] disp8
        emit_byte(0x40 | regenc | baseenc);
        emit_byte(disp & 0xFF);
      } else {
        // [base + disp32]
        // [10 reg base] disp32
        emit_byte(0x80 | regenc | baseenc);
        emit_data(disp, rspec, disp32_operand);
      }
    }
  } else {
    if (index->is_valid()) {
      assert(scale != Address::no_scale, "inconsistent address");
      // [index*scale + disp]
      // [00 reg 100][ss index 101] disp32
      assert(index != rsp, "illegal addressing mode");
      emit_byte(0x04 | regenc);
      emit_byte(scale << 6 | indexenc | 0x05);
      emit_data(disp, rspec, disp32_operand);
    } else if (rtype != relocInfo::none ) {
      // [disp] (64bit) RIP-RELATIVE (32bit) abs
      // [00 000 101] disp32

      emit_byte(0x05 | regenc);
      // Note that the RIP-rel. correction applies to the generated
      // disp field, but _not_ to the target address in the rspec.

      // disp was created by converting the target address minus the pc
      // at the start of the instruction. That needs more correction here.
      // intptr_t disp = target - next_ip;
      assert(inst_mark() != NULL, "must be inside InstructionMark");
      address next_ip = pc() + sizeof(int32_t) + rip_relative_correction;
      int64_t adjusted = disp;
      // Do rip-rel adjustment for 64bit
      LP64_ONLY(adjusted -=  (next_ip - inst_mark()));
      assert(is_simm32(adjusted),
             "must be 32bit offset (RIP relative address)");
      emit_data((int32_t) adjusted, rspec, disp32_operand);

    } else {
      // 32bit never did this, did everything as the rip-rel/disp code above
      // [disp] ABSOLUTE
      // [00 reg 100][00 100 101] disp32
      emit_byte(0x04 | regenc);
      emit_byte(0x25);
      emit_data(disp, rspec, disp32_operand);
    }
  }
}
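
// Worked example (a sketch; register encodings rax = 0, rcx = 1, rbx = 3):
// movl(rax, Address(rbx, rcx, Address::times_4, 8)) reaches this routine and
// encodes as
//   8B 44 8B 08
// that is: the 0x8B opcode, a [01 000 100] ModRM (mod = 01: disp8 follows,
// reg = rax, rm = 100: SIB follows), a [10 001 011] SIB (ss = 2 -> scale 4,
// index = rcx, base = rbx), and the disp8 byte 0x08.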

void Assembler::emit_operand(XMMRegister reg, Register base, Register index,
                             Address::ScaleFactor scale, int disp,
                             RelocationHolder const& rspec) {
  emit_operand((Register)reg, base, index, scale, disp, rspec);
}

// Secret local extension to Assembler::WhichOperand:
#define end_pc_operand (_WhichOperand_limit)

address Assembler::locate_operand(address inst, WhichOperand which) {
  // Decode the given instruction, and return the address of
  // an embedded 32-bit operand word.

  // If "which" is disp32_operand, selects the displacement portion
  // of an effective address specifier.
  // If "which" is imm64_operand, selects the trailing immediate constant.
  // If "which" is call32_operand, selects the displacement of a call or jump.
  // Caller is responsible for ensuring that there is such an operand,
  // and that it is 32/64 bits wide.

  // If "which" is end_pc_operand, find the end of the instruction.

  address ip = inst;
  bool is_64bit = false;

  debug_only(bool has_disp32 = false);
  int tail_size = 0; // other random bytes (#32, #16, etc.) at end of insn

  again_after_prefix:
  switch (0xFF & *ip++) {

  // These convenience macros generate groups of "case" labels for the switch.
#define REP4(x) (x)+0: case (x)+1: case (x)+2: case (x)+3
#define REP8(x) (x)+0: case (x)+1: case (x)+2: case (x)+3: \
             case (x)+4: case (x)+5: case (x)+6: case (x)+7
#define REP16(x) REP8((x)+0): \
              case REP8((x)+8)

  case CS_segment:
  case SS_segment:
  case DS_segment:
  case ES_segment:
  case FS_segment:
  case GS_segment:
    // Seems dubious
    LP64_ONLY(assert(false, "shouldn't have that prefix"));
    assert(ip == inst+1, "only one prefix allowed");
    goto again_after_prefix;

  case 0x67:
  case REX:
  case REX_B:
  case REX_X:
  case REX_XB:
  case REX_R:
  case REX_RB:
  case REX_RX:
  case REX_RXB:
    NOT_LP64(assert(false, "64bit prefixes"));
    goto again_after_prefix;

  case REX_W:
  case REX_WB:
  case REX_WX:
  case REX_WXB:
  case REX_WR:
  case REX_WRB:
  case REX_WRX:
  case REX_WRXB:
    NOT_LP64(assert(false, "64bit prefixes"));
    is_64bit = true;
    goto again_after_prefix;

  case 0xFF: // pushq a; decl a; incl a; call a; jmp a
  case 0x88: // movb a, r
  case 0x89: // movl a, r
  case 0x8A: // movb r, a
  case 0x8B: // movl r, a
  case 0x8F: // popl a
    debug_only(has_disp32 = true);
    break;

  case 0x68: // pushq #32
    if (which == end_pc_operand) {
      return ip + 4;
    }
    assert(which == imm_operand && !is_64bit, "pushl has no disp32 or 64bit immediate");
    return ip;                  // not produced by emit_operand

  case 0x66: // movw ... (size prefix)
    again_after_size_prefix2:
    switch (0xFF & *ip++) {
    case REX:
    case REX_B:
    case REX_X:
    case REX_XB:
    case REX_R:
    case REX_RB:
    case REX_RX:
    case REX_RXB:
    case REX_W:
    case REX_WB:
    case REX_WX:
    case REX_WXB:
    case REX_WR:
    case REX_WRB:
    case REX_WRX:
    case REX_WRXB:
      NOT_LP64(assert(false, "64bit prefix found"));
      goto again_after_size_prefix2;
    case 0x8B: // movw r, a
    case 0x89: // movw a, r
      debug_only(has_disp32 = true);
      break;
    case 0xC7: // movw a, #16
      debug_only(has_disp32 = true);
      tail_size = 2;  // the imm16
      break;
    case 0x0F: // several SSE/SSE2 variants
      ip--;    // reparse the 0x0F
      goto again_after_prefix;
    default:
      ShouldNotReachHere();
    }
    break;

  case REP8(0xB8): // movl/q r, #32/#64(oop?)
    if (which == end_pc_operand)  return ip + (is_64bit ? 8 : 4);
    // these asserts are somewhat nonsensical
#ifndef _LP64
    assert(which == imm_operand || which == disp32_operand,
           err_msg("which %d is_64_bit %d ip " INTPTR_FORMAT, which, is_64bit, ip));
#else
    assert((which == call32_operand || which == imm_operand) && is_64bit ||
           which == narrow_oop_operand && !is_64bit,
           err_msg("which %d is_64_bit %d ip " INTPTR_FORMAT, which, is_64bit, ip));
#endif // _LP64
    return ip;

  case 0x69: // imul r, a, #32
  case 0xC7: // movl a, #32(oop?)
    tail_size = 4;
    debug_only(has_disp32 = true); // has both kinds of operands!
    break;

  case 0x0F: // movx..., etc.
    switch (0xFF & *ip++) {
    case 0x3A: // pcmpestri
      tail_size = 1;
    case 0x38: // ptest, pmovzxbw
      ip++; // skip opcode
      debug_only(has_disp32 = true); // has both kinds of operands!
      break;

    case 0x70: // pshufd r, r/a, #8
      debug_only(has_disp32 = true); // has both kinds of operands!
    case 0x73: // psrldq r, #8
      tail_size = 1;
      break;

    case 0x12: // movlps
    case 0x28: // movaps
    case 0x2E: // ucomiss
    case 0x2F: // comiss
    case 0x54: // andps
    case 0x55: // andnps
    case 0x56: // orps
    case 0x57: // xorps
    case 0x6E: // movd
    case 0x7E: // movd
    case 0xAE: // ldmxcsr, stmxcsr, fxrstor, fxsave, clflush
      debug_only(has_disp32 = true);
      break;

    case 0xAD: // shrd r, a, %cl
    case 0xAF: // imul r, a
    case 0xBE: // movsbl r, a (movsxb)
    case 0xBF: // movswl r, a (movsxw)
    case 0xB6: // movzbl r, a (movzxb)
    case 0xB7: // movzwl r, a (movzxw)
    case REP16(0x40): // cmovl cc, r, a
    case 0xB0: // cmpxchgb
    case 0xB1: // cmpxchg
    case 0xC1: // xaddl
    case 0xC7: // cmpxchg8
    case REP16(0x90): // setcc a
      debug_only(has_disp32 = true);
      // fall out of the switch to decode the address
      break;

    case 0xC4: // pinsrw r, a, #8
      debug_only(has_disp32 = true);
    case 0xC5: // pextrw r, r, #8
      tail_size = 1;  // the imm8
      break;

    case 0xAC: // shrd r, a, #8
      debug_only(has_disp32 = true);
      tail_size = 1;  // the imm8
      break;

    case REP16(0x80): // jcc rdisp32
      if (which == end_pc_operand)  return ip + 4;
      assert(which == call32_operand, "jcc has no disp32 or imm");
      return ip;
    default:
      ShouldNotReachHere();
    }
    break;

  case 0x81: // addl a, #32; addl r, #32
    // also: orl, adcl, sbbl, andl, subl, xorl, cmpl
    // on 32bit in the case of cmpl, the imm might be an oop
    tail_size = 4;
    debug_only(has_disp32 = true); // has both kinds of operands!
    break;

  case 0x83: // addl a, #8; addl r, #8
    // also: orl, adcl, sbbl, andl, subl, xorl, cmpl
    debug_only(has_disp32 = true); // has both kinds of operands!
    tail_size = 1;
    break;

  case 0x9B:
    switch (0xFF & *ip++) {
    case 0xD9: // fnstcw a
      debug_only(has_disp32 = true);
      break;
    default:
      ShouldNotReachHere();
    }
    break;

  case REP4(0x00): // addb a, r; addl a, r; addb r, a; addl r, a
  case REP4(0x10): // adc...
  case REP4(0x20): // and...
  case REP4(0x30): // xor...
  case REP4(0x08): // or...
  case REP4(0x18): // sbb...
  case REP4(0x28): // sub...
  case 0xF7: // mull a
  case 0x8D: // lea r, a
  case 0x87: // xchg r, a
  case REP4(0x38): // cmp...
  case 0x85: // test r, a
    debug_only(has_disp32 = true); // has both kinds of operands!
    break;

  case 0xC1: // sal a, #8; sar a, #8; shl a, #8; shr a, #8
  case 0xC6: // movb a, #8
  case 0x80: // cmpb a, #8
  case 0x6B: // imul r, a, #8
    debug_only(has_disp32 = true); // has both kinds of operands!
    tail_size = 1; // the imm8
    break;

  case 0xC4: // VEX_3bytes
  case 0xC5: // VEX_2bytes
    assert((UseAVX > 0), "shouldn't have VEX prefix");
    assert(ip == inst+1, "no prefixes allowed");
    // C4 and C5 are also used as opcodes for the PINSRW and PEXTRW
    // instructions, but those have the 0x0F prefix and are handled when
    // 0x0F is processed above.
    //
    // In 32-bit mode the VEX first bytes C4 and C5 alias onto the LDS and LES
    // instructions (these instructions are not supported in 64-bit mode).
    // To distinguish them, bits [7:6] are set in the VEX second byte, since a
    // ModRM byte cannot be of the form 11xxxxxx in 32-bit mode. To keep those
    // bits set, the VEX REX and vvvv bits are stored inverted.
    //
    // Fortunately C2 doesn't generate these instructions, so we don't need
    // to check for them in the product version.

    // Check second byte
    NOT_LP64(assert((0xC0 & *ip) == 0xC0, "shouldn't have LDS and LES instructions"));

    // First byte
    if ((0xFF & *inst) == VEX_3bytes) {
      ip++; // third byte
      is_64bit = ((VEX_W & *ip) == VEX_W);
    }
    ip++; // opcode
    // To find the end of instruction (which == end_pc_operand).
    switch (0xFF & *ip) {
    case 0x61: // pcmpestri r, r/a, #8
    case 0x70: // pshufd r, r/a, #8
    case 0x73: // psrldq r, #8
      tail_size = 1;  // the imm8
      break;
    default:
      break;
    }
    ip++; // skip opcode
    debug_only(has_disp32 = true); // has both kinds of operands!
    break;

  case 0xD1: // sal a, 1; sar a, 1; shl a, 1; shr a, 1
  case 0xD3: // sal a, %cl; sar a, %cl; shl a, %cl; shr a, %cl
  case 0xD9: // fld_s a; fst_s a; fstp_s a; fldcw a
  case 0xDD: // fld_d a; fst_d a; fstp_d a
  case 0xDB: // fild_s a; fistp_s a; fld_x a; fstp_x a
  case 0xDF: // fild_d a; fistp_d a
  case 0xD8: // fadd_s a; fsubr_s a; fmul_s a; fdivr_s a; fcomp_s a
  case 0xDC: // fadd_d a; fsubr_d a; fmul_d a; fdivr_d a; fcomp_d a
  case 0xDE: // faddp_d a; fsubrp_d a; fmulp_d a; fdivrp_d a; fcompp_d a
    debug_only(has_disp32 = true);
    break;

  case 0xE8: // call rdisp32
  case 0xE9: // jmp  rdisp32
    if (which == end_pc_operand)  return ip + 4;
    assert(which == call32_operand, "call has no disp32 or imm");
    return ip;

  case 0xF0:                    // Lock
    assert(os::is_MP(), "only on MP");
    goto again_after_prefix;

  case 0xF3:                    // For SSE
  case 0xF2:                    // For SSE2
    switch (0xFF & *ip++) {
    case REX:
    case REX_B:
    case REX_X:
    case REX_XB:
    case REX_R:
    case REX_RB:
    case REX_RX:
    case REX_RXB:
    case REX_W:
    case REX_WB:
    case REX_WX:
    case REX_WXB:
    case REX_WR:
    case REX_WRB:
    case REX_WRX:
    case REX_WRXB:
      NOT_LP64(assert(false, "found 64bit prefix"));
      ip++;
    default:
      ip++;
    }
    debug_only(has_disp32 = true); // has both kinds of operands!
    break;

  default:
    ShouldNotReachHere();

#undef REP8
#undef REP16
  }

  assert(which != call32_operand, "instruction is not a call, jmp, or jcc");
#ifdef _LP64
  assert(which != imm_operand, "instruction is not a movq reg, imm64");
#else
  // assert(which != imm_operand || has_imm32, "instruction has no imm32 field");
  assert(which != imm_operand || has_disp32, "instruction has no imm32 field");
#endif // LP64
  assert(which != disp32_operand || has_disp32, "instruction has no disp32 field");

  // parse the output of emit_operand
  int op2 = 0xFF & *ip++;
  int base = op2 & 0x07;
  int op3 = -1;
  const int b100 = 4;
  const int b101 = 5;
  if (base == b100 && (op2 >> 6) != 3) {
    op3 = 0xFF & *ip++;
    base = op3 & 0x07;   // refetch the base
  }
  // now ip points at the disp (if any)

  switch (op2 >> 6) {
  case 0:
    // [00 reg  100][ss index base]
    // [00 reg  100][00   100  esp]
    // [00 reg base]
    // [00 reg  100][ss index  101][disp32]
    // [00 reg  101]               [disp32]

    if (base == b101) {
      if (which == disp32_operand)
        return ip;              // caller wants the disp32
      ip += 4;                  // skip the disp32
    }
    break;

  case 1:
    // [01 reg  100][ss index base][disp8]
    // [01 reg  100][00   100  esp][disp8]
    // [01 reg base]               [disp8]
    ip += 1;                    // skip the disp8
    break;

  case 2:
    // [10 reg  100][ss index base][disp32]
    // [10 reg  100][00   100  esp][disp32]
    // [10 reg base]               [disp32]
    if (which == disp32_operand)
      return ip;                // caller wants the disp32
    ip += 4;                    // skip the disp32
    break;

  case 3:
    // [11 reg base]  (not a memory addressing mode)
    break;
  }

  if (which == end_pc_operand) {
    return ip + tail_size;
  }

#ifdef _LP64
  assert(which == narrow_oop_operand && !is_64bit, "instruction is not a movl adr, imm32");
#else
  assert(which == imm_operand, "instruction has only an imm field");
#endif // LP64
  return ip;
}
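
// Usage sketch: given the start of an instruction that embeds a 32-bit
// displacement, the relocation machinery calls
//   locate_operand(inst, disp32_operand)
// to find the address of that embedded word so it can be patched in place;
// check_relocation() below uses the same walk to verify that relocations
// point exactly at their operands.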

address Assembler::locate_next_instruction(address inst) {
  // Secretly share code with locate_operand:
  return locate_operand(inst, end_pc_operand);
}


#ifdef ASSERT
void Assembler::check_relocation(RelocationHolder const& rspec, int format) {
  address inst = inst_mark();
  assert(inst != NULL && inst < pc(), "must point to beginning of instruction");
  address opnd;

  Relocation* r = rspec.reloc();
  if (r->type() == relocInfo::none) {
    return;
  } else if (r->is_call() || format == call32_operand) {
    // assert(format == imm32_operand, "cannot specify a nonzero format");
    opnd = locate_operand(inst, call32_operand);
  } else if (r->is_data()) {
    assert(format == imm_operand || format == disp32_operand
           LP64_ONLY(|| format == narrow_oop_operand), "format ok");
    opnd = locate_operand(inst, (WhichOperand)format);
  } else {
    assert(format == imm_operand, "cannot specify a format");
    return;
  }
  assert(opnd == pc(), "must put operand where relocs can find it");
}
#endif // ASSERT

void Assembler::emit_operand32(Register reg, Address adr) {
  assert(reg->encoding() < 8, "no extended registers");
  assert(!adr.base_needs_rex() && !adr.index_needs_rex(), "no extended registers");
  emit_operand(reg, adr._base, adr._index, adr._scale, adr._disp,
               adr._rspec);
}

void Assembler::emit_operand(Register reg, Address adr,
                             int rip_relative_correction) {
  emit_operand(reg, adr._base, adr._index, adr._scale, adr._disp,
               adr._rspec,
               rip_relative_correction);
}

void Assembler::emit_operand(XMMRegister reg, Address adr) {
  emit_operand(reg, adr._base, adr._index, adr._scale, adr._disp,
               adr._rspec);
}

// MMX operations
void Assembler::emit_operand(MMXRegister reg, Address adr) {
  assert(!adr.base_needs_rex() && !adr.index_needs_rex(), "no extended registers");
  emit_operand((Register)reg, adr._base, adr._index, adr._scale, adr._disp, adr._rspec);
}

// work around gcc (3.2.1-7a) bug
void Assembler::emit_operand(Address adr, MMXRegister reg) {
  assert(!adr.base_needs_rex() && !adr.index_needs_rex(), "no extended registers");
  emit_operand((Register)reg, adr._base, adr._index, adr._scale, adr._disp, adr._rspec);
}


void Assembler::emit_farith(int b1, int b2, int i) {
  assert(isByte(b1) && isByte(b2), "wrong opcode");
  assert(0 <= i &&  i < 8, "illegal stack offset");
  emit_byte(b1);
  emit_byte(b2 + i);
}


// Now the Assembler instructions (identical for 32/64 bits)

void Assembler::adcl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
  emit_arith_operand(0x81, rdx, dst, imm32);
}

void Assembler::adcl(Address dst, Register src) {
  InstructionMark im(this);
  prefix(dst, src);
  emit_byte(0x11);
  emit_operand(src, dst);
}

void Assembler::adcl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xD0, dst, imm32);
}

void Assembler::adcl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x13);
  emit_operand(dst, src);
}

void Assembler::adcl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x13, 0xC0, dst, src);
}

void Assembler::addl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
  emit_arith_operand(0x81, rax, dst, imm32);
}

void Assembler::addl(Address dst, Register src) {
  InstructionMark im(this);
  prefix(dst, src);
  emit_byte(0x01);
  emit_operand(src, dst);
}

void Assembler::addl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xC0, dst, imm32);
}

void Assembler::addl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x03);
  emit_operand(dst, src);
}

void Assembler::addl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x03, 0xC0, dst, src);
}

void Assembler::addr_nop_4() {
  assert(UseAddressNop, "no CPU support");
  // 4 bytes: NOP DWORD PTR [EAX+0]
  emit_byte(0x0F);
  emit_byte(0x1F);
  emit_byte(0x40); // emit_rm(cbuf, 0x1, EAX_enc, EAX_enc);
  emit_byte(0);    // 8-bits offset (1 byte)
}

void Assembler::addr_nop_5() {
  assert(UseAddressNop, "no CPU support");
  // 5 bytes: NOP DWORD PTR [EAX+EAX*0+0] 8-bits offset
  emit_byte(0x0F);
  emit_byte(0x1F);
  emit_byte(0x44); // emit_rm(cbuf, 0x1, EAX_enc, 0x4);
  emit_byte(0x00); // emit_rm(cbuf, 0x0, EAX_enc, EAX_enc);
  emit_byte(0);    // 8-bits offset (1 byte)
}

void Assembler::addr_nop_7() {
  assert(UseAddressNop, "no CPU support");
  // 7 bytes: NOP DWORD PTR [EAX+0] 32-bits offset
  emit_byte(0x0F);
  emit_byte(0x1F);
  emit_byte(0x80); // emit_rm(cbuf, 0x2, EAX_enc, EAX_enc);
  emit_long(0);    // 32-bits offset (4 bytes)
}

void Assembler::addr_nop_8() {
  assert(UseAddressNop, "no CPU support");
  // 8 bytes: NOP DWORD PTR [EAX+EAX*0+0] 32-bits offset
  emit_byte(0x0F);
  emit_byte(0x1F);
  emit_byte(0x84); // emit_rm(cbuf, 0x2, EAX_enc, 0x4);
  emit_byte(0x00); // emit_rm(cbuf, 0x0, EAX_enc, EAX_enc);
  emit_long(0);    // 32-bits offset (4 bytes)
}
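
// Design note (a sketch): these multi-byte address NOPs let padding up to an
// alignment boundary be covered by a single instruction instead of a run of
// one-byte 0x90s; e.g. a 7-byte gap before a loop head becomes one
// addr_nop_7() rather than seven NOPs the decoder must chew through.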

void Assembler::addsd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x58, dst, src, VEX_SIMD_F2);
}

void Assembler::addsd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x58, dst, src, VEX_SIMD_F2);
}

void Assembler::addss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith(0x58, dst, src, VEX_SIMD_F3);
}

void Assembler::addss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith(0x58, dst, src, VEX_SIMD_F3);
}

void Assembler::andl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0x81);
  emit_operand(rsp, dst, 4);
  emit_long(imm32);
}

void Assembler::andl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xE0, dst, imm32);
}

void Assembler::andl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x23);
  emit_operand(dst, src);
}

void Assembler::andl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x23, 0xC0, dst, src);
}

void Assembler::bsfl(Register dst, Register src) {
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBC);
  emit_byte(0xC0 | encode);
}

void Assembler::bsrl(Register dst, Register src) {
  assert(!VM_Version::supports_lzcnt(), "encoding is treated as LZCNT");
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBD);
  emit_byte(0xC0 | encode);
}

void Assembler::bswapl(Register reg) { // bswap
  int encode = prefix_and_encode(reg->encoding());
  emit_byte(0x0F);
  emit_byte(0xC8 | encode);
}

void Assembler::call(Label& L, relocInfo::relocType rtype) {
  // suspect disp32 is always good
  int operand = LP64_ONLY(disp32_operand) NOT_LP64(imm_operand);

  if (L.is_bound()) {
    const int long_size = 5;
    int offs = (int)( target(L) - pc() );
    assert(offs <= 0, "assembler error");
    InstructionMark im(this);
    // 1110 1000 #32-bit disp
    emit_byte(0xE8);
    emit_data(offs - long_size, rtype, operand);
  } else {
    InstructionMark im(this);
    // 1110 1000 #32-bit disp
    L.add_patch_at(code(), locator());

    emit_byte(0xE8);
    emit_data(int(0), rtype, operand);
  }
}

void Assembler::call(Register dst) {
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xFF);
  emit_byte(0xD0 | encode);
}


void Assembler::call(Address adr) {
  InstructionMark im(this);
  prefix(adr);
  emit_byte(0xFF);
  emit_operand(rdx, adr);
}

void Assembler::call_literal(address entry, RelocationHolder const& rspec) {
  assert(entry != NULL, "call most probably wrong");
  InstructionMark im(this);
  emit_byte(0xE8);
  intptr_t disp = entry - (_code_pos + sizeof(int32_t));
  assert(is_simm32(disp), "must be 32bit offset (call2)");
  // Technically, should use call32_operand, but this format is
  // implied by the fact that we're emitting a call instruction.

  int operand = LP64_ONLY(disp32_operand) NOT_LP64(call32_operand);
  emit_data((int) disp, rspec, operand);
}
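
// Illustrative arithmetic (a sketch): if the call starts at 0x1000, then
// after the 0xE8 opcode byte above _code_pos is 0x1001, the disp32 occupies
// 0x1001..0x1004, and the next instruction begins at 0x1005. For entry =
// 0x2000 this gives disp = 0x2000 - 0x1005 = 0x0FFB, the distance the CPU
// adds to the address of the following instruction.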

void Assembler::cdql() {
  emit_byte(0x99);
}

void Assembler::cmovl(Condition cc, Register dst, Register src) {
  NOT_LP64(guarantee(VM_Version::supports_cmov(), "illegal instruction"));
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0x40 | cc);
  emit_byte(0xC0 | encode);
}


void Assembler::cmovl(Condition cc, Register dst, Address src) {
  NOT_LP64(guarantee(VM_Version::supports_cmov(), "illegal instruction"));
  prefix(src, dst);
  emit_byte(0x0F);
  emit_byte(0x40 | cc);
  emit_operand(dst, src);
}

void Assembler::cmpb(Address dst, int imm8) {
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0x80);
  emit_operand(rdi, dst, 1);
  emit_byte(imm8);
}

void Assembler::cmpl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0x81);
  emit_operand(rdi, dst, 4);
  emit_long(imm32);
}

void Assembler::cmpl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xF8, dst, imm32);
}

void Assembler::cmpl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x3B, 0xC0, dst, src);
}


void Assembler::cmpl(Register dst, Address  src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x3B);
  emit_operand(dst, src);
}

void Assembler::cmpw(Address dst, int imm16) {
  InstructionMark im(this);
  assert(!dst.base_needs_rex() && !dst.index_needs_rex(), "no extended registers");
  emit_byte(0x66);
  emit_byte(0x81);
  emit_operand(rdi, dst, 2);
  emit_word(imm16);
}
// The 32-bit cmpxchg compares the value at adr with the contents of rax;
// if they are equal, it stores reg into adr, otherwise it loads the value
// at adr into rax. ZF is set if the compared values were equal, and
// cleared otherwise.
void Assembler::cmpxchgl(Register reg, Address adr) { // cmpxchg
  if (Atomics & 2) {
     // caveat: no instructionmark, so this isn't relocatable.
     // Emit a synthetic, non-atomic, CAS equivalent.
     // Beware.  The synthetic form sets all ICCs, not just ZF.
     // cmpxchg r,[m] is equivalent to rax, = CAS (m, rax, r)
     cmpl(rax, adr);
     movl(rax, adr);
     if (reg != rax) {
        Label L;
        jcc(Assembler::notEqual, L);
        movl(adr, reg);
        bind(L);
     }
  } else {
     InstructionMark im(this);
     prefix(adr, reg);
     emit_byte(0x0F);
     emit_byte(0xB1);
     emit_operand(reg, adr);
  }
}

void Assembler::comisd(XMMRegister dst, Address src) {
  // NOTE: dbx seems to decode this as comiss even though the
  // 0x66 is there. Strangely, ucomisd comes out correct.
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_66);
}

void Assembler::comisd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_66);
}

void Assembler::comiss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_NONE);
}

void Assembler::comiss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_NONE);
}

void Assembler::cvtdq2pd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith_nonds(0xE6, dst, src, VEX_SIMD_F3);
}

void Assembler::cvtdq2ps(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith_nonds(0x5B, dst, src, VEX_SIMD_NONE);
}

void Assembler::cvtsd2ss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x5A, dst, src, VEX_SIMD_F2);
}

void Assembler::cvtsd2ss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x5A, dst, src, VEX_SIMD_F2);
}

void Assembler::cvtsi2sdl(XMMRegister dst, Register src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x2A);
  emit_byte(0xC0 | encode);
}

void Assembler::cvtsi2sdl(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x2A, dst, src, VEX_SIMD_F2);
}

void Assembler::cvtsi2ssl(XMMRegister dst, Register src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x2A);
  emit_byte(0xC0 | encode);
}

void Assembler::cvtsi2ssl(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith(0x2A, dst, src, VEX_SIMD_F3);
}

void Assembler::cvtss2sd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x5A, dst, src, VEX_SIMD_F3);
}

void Assembler::cvtss2sd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x5A, dst, src, VEX_SIMD_F3);
}


void Assembler::cvttsd2sil(Register dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F2);
  emit_byte(0x2C);
  emit_byte(0xC0 | encode);
}

void Assembler::cvttss2sil(Register dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F3);
  emit_byte(0x2C);
  emit_byte(0xC0 | encode);
}

void Assembler::decl(Address dst) {
  // Don't use it directly. Use MacroAssembler::decrement() instead.
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0xFF);
  emit_operand(rcx, dst);
}

void Assembler::divsd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x5E, dst, src, VEX_SIMD_F2);
}

void Assembler::divsd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x5E, dst, src, VEX_SIMD_F2);
}

void Assembler::divss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith(0x5E, dst, src, VEX_SIMD_F3);
}

void Assembler::divss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith(0x5E, dst, src, VEX_SIMD_F3);
}

void Assembler::emms() {
  NOT_LP64(assert(VM_Version::supports_mmx(), ""));
  emit_byte(0x0F);
  emit_byte(0x77);
}

void Assembler::hlt() {
  emit_byte(0xF4);
}

void Assembler::idivl(Register src) {
  int encode = prefix_and_encode(src->encoding());
  emit_byte(0xF7);
  emit_byte(0xF8 | encode);
}

void Assembler::divl(Register src) { // Unsigned
  int encode = prefix_and_encode(src->encoding());
  emit_byte(0xF7);
  emit_byte(0xF0 | encode);
}

void Assembler::imull(Register dst, Register src) {
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xAF);
  emit_byte(0xC0 | encode);
}


void Assembler::imull(Register dst, Register src, int value) {
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  if (is8bit(value)) {
    emit_byte(0x6B);
    emit_byte(0xC0 | encode);
    emit_byte(value & 0xFF);
  } else {
    emit_byte(0x69);
    emit_byte(0xC0 | encode);
    emit_long(value);
  }
}

void Assembler::incl(Address dst) {
  // Don't use it directly. Use MacroAssembler::increment() instead.
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0xFF);
  emit_operand(rax, dst);
}

void Assembler::jcc(Condition cc, Label& L, bool maybe_short) {
  InstructionMark im(this);
  assert((0 <= cc) && (cc < 16), "illegal cc");
  if (L.is_bound()) {
    address dst = target(L);
    assert(dst != NULL, "jcc most probably wrong");

    const int short_size = 2;
    const int long_size = 6;
    intptr_t offs = (intptr_t)dst - (intptr_t)_code_pos;
    if (maybe_short && is8bit(offs - short_size)) {
      // 0111 tttn #8-bit disp
      emit_byte(0x70 | cc);
      emit_byte((offs - short_size) & 0xFF);
    } else {
      // 0000 1111 1000 tttn #32-bit disp
      assert(is_simm32(offs - long_size),
             "must be 32bit offset (call4)");
      emit_byte(0x0F);
      emit_byte(0x80 | cc);
      emit_long(offs - long_size);
    }
  } else {
    // Note: we could eliminate conditional jumps to this jump if the
    //       condition is the same; however, that seems to be a rather
    //       unlikely case.
    // Note: use jccb() if the label to be bound is very close, to get
    //       an 8-bit displacement
    L.add_patch_at(code(), locator());
    emit_byte(0x0F);
    emit_byte(0x80 | cc);
    emit_long(0);
  }
}

void Assembler::jccb(Condition cc, Label& L) {
  if (L.is_bound()) {
    const int short_size = 2;
    address entry = target(L);
#ifdef ASSERT
    intptr_t dist = (intptr_t)entry - ((intptr_t)_code_pos + short_size);
    intptr_t delta = short_branch_delta();
    if (delta != 0) {
      dist += (dist < 0 ? (-delta) :delta);
    }
    assert(is8bit(dist), "Displacement too large for a short jmp");
#endif
    intptr_t offs = (intptr_t)entry - (intptr_t)_code_pos;
    // 0111 tttn #8-bit disp
    emit_byte(0x70 | cc);
    emit_byte((offs - short_size) & 0xFF);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    emit_byte(0x70 | cc);
    emit_byte(0);
  }
}
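
// Example encoding (a sketch; Assembler::equal is condition code 0x4):
// jccb(Assembler::equal, L) with L bound 16 bytes behind the branch emits
//   74 EE
// i.e. 0x70 | 0x4, then disp8 = -16 - 2 = -18 (0xEE), since the displacement
// is counted from the end of the two-byte instruction.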

void Assembler::jmp(Address adr) {
  InstructionMark im(this);
  prefix(adr);
  emit_byte(0xFF);
  emit_operand(rsp, adr);
}

void Assembler::jmp(Label& L, bool maybe_short) {
  if (L.is_bound()) {
    address entry = target(L);
    assert(entry != NULL, "jmp most probably wrong");
    InstructionMark im(this);
    const int short_size = 2;
    const int long_size = 5;
    intptr_t offs = entry - _code_pos;
    if (maybe_short && is8bit(offs - short_size)) {
      emit_byte(0xEB);
      emit_byte((offs - short_size) & 0xFF);
    } else {
      emit_byte(0xE9);
      emit_long(offs - long_size);
    }
  } else {
    // By default, forward jumps are always 32-bit displacements, since
    // we can't yet know where the label will be bound.  If you're sure that
    // the forward jump will not run beyond 256 bytes, use jmpb to
    // force an 8-bit displacement.
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    emit_byte(0xE9);
    emit_long(0);
  }
}

void Assembler::jmp(Register entry) {
  int encode = prefix_and_encode(entry->encoding());
  emit_byte(0xFF);
  emit_byte(0xE0 | encode);
}

void Assembler::jmp_literal(address dest, RelocationHolder const& rspec) {
  InstructionMark im(this);
  emit_byte(0xE9);
  assert(dest != NULL, "must have a target");
  intptr_t disp = dest - (_code_pos + sizeof(int32_t));
  assert(is_simm32(disp), "must be 32bit offset (jmp)");
  emit_data(disp, rspec.reloc(), call32_operand);
}

void Assembler::jmpb(Label& L) {
  if (L.is_bound()) {
    const int short_size = 2;
    address entry = target(L);
    assert(entry != NULL, "jmp most probably wrong");
#ifdef ASSERT
    intptr_t dist = (intptr_t)entry - ((intptr_t)_code_pos + short_size);
    intptr_t delta = short_branch_delta();
    if (delta != 0) {
      dist += (dist < 0 ? (-delta) :delta);
    }
    assert(is8bit(dist), "Displacement too large for a short jmp");
#endif
    intptr_t offs = entry - _code_pos;
    emit_byte(0xEB);
    emit_byte((offs - short_size) & 0xFF);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    emit_byte(0xEB);
    emit_byte(0);
  }
}

void Assembler::ldmxcsr( Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  prefix(src);
  emit_byte(0x0F);
  emit_byte(0xAE);
  emit_operand(as_Register(2), src);
}

void Assembler::leal(Register dst, Address src) {
  InstructionMark im(this);
#ifdef _LP64
  emit_byte(0x67); // addr32
  prefix(src, dst);
#endif // LP64
  emit_byte(0x8D);
  emit_operand(dst, src);
}

void Assembler::lock() {
  if (Atomics & 1) {
     // Emit a NOP (0x90) in place of the lock prefix
     emit_byte(0x90);
  } else {
     emit_byte(0xF0);
  }
}
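
// Typical use (a sketch): an atomic compare-and-swap is emitted as
//   lock();              // F0 (or the NOP form selected by Atomics & 1)
//   cmpxchgl(reg, adr);  // 0F B1 /r
// so the prefix and the cmpxchg together form one atomic read-modify-write.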

void Assembler::lzcntl(Register dst, Register src) {
  assert(VM_Version::supports_lzcnt(), "encoding is treated as BSR");
  emit_byte(0xF3);
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBD);
  emit_byte(0xC0 | encode);
}

// Emit mfence instruction
void Assembler::mfence() {
  NOT_LP64(assert(VM_Version::supports_sse2(), "unsupported");)
  emit_byte( 0x0F );
  emit_byte( 0xAE );
  emit_byte( 0xF0 );
}

void Assembler::mov(Register dst, Register src) {
  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
}

void Assembler::movapd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith_nonds(0x28, dst, src, VEX_SIMD_66);
}

void Assembler::movaps(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith_nonds(0x28, dst, src, VEX_SIMD_NONE);
}

void Assembler::movlhps(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, src, src, VEX_SIMD_NONE);
  emit_byte(0x16);
  emit_byte(0xC0 | encode);
}

void Assembler::movb(Register dst, Address src) {
  NOT_LP64(assert(dst->has_byte_register(), "must have byte register"));
  InstructionMark im(this);
  prefix(src, dst, true);
  emit_byte(0x8A);
  emit_operand(dst, src);
}


void Assembler::movb(Address dst, int imm8) {
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0xC6);
  emit_operand(rax, dst, 1);
  emit_byte(imm8);
}


void Assembler::movb(Address dst, Register src) {
  assert(src->has_byte_register(), "must have byte register");
  InstructionMark im(this);
  prefix(dst, src, true);
  emit_byte(0x88);
  emit_operand(src, dst);
}

void Assembler::movdl(XMMRegister dst, Register src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
  emit_byte(0x6E);
  emit_byte(0xC0 | encode);
}

void Assembler::movdl(Register dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  // swap src/dst to get correct prefix
  int encode = simd_prefix_and_encode(src, dst, VEX_SIMD_66);
  emit_byte(0x7E);
  emit_byte(0xC0 | encode);
}

void Assembler::movdl(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_66);
  emit_byte(0x6E);
  emit_operand(dst, src);
}

void Assembler::movdl(Address dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_66);
  emit_byte(0x7E);
  emit_operand(src, dst);
}

void Assembler::movdqa(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith_nonds(0x6F, dst, src, VEX_SIMD_66);
}

void Assembler::movdqu(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith_nonds(0x6F, dst, src, VEX_SIMD_F3);
}

void Assembler::movdqu(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith_nonds(0x6F, dst, src, VEX_SIMD_F3);
}

void Assembler::movdqu(Address dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_F3);
  emit_byte(0x7F);
  emit_operand(src, dst);
}

// Move Unaligned 256bit Vector
void Assembler::vmovdqu(XMMRegister dst, XMMRegister src) {
  assert(UseAVX, "");
  bool vector256 = true;
  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_F3, vector256);
  emit_byte(0x6F);
  emit_byte(0xC0 | encode);
}

void Assembler::vmovdqu(XMMRegister dst, Address src) {
  assert(UseAVX, "");
  InstructionMark im(this);
  bool vector256 = true;
  vex_prefix(dst, xnoreg, src, VEX_SIMD_F3, vector256);
  emit_byte(0x6F);
  emit_operand(dst, src);
}

void Assembler::vmovdqu(Address dst, XMMRegister src) {
  assert(UseAVX, "");
  InstructionMark im(this);
  bool vector256 = true;
  // swap src<->dst for encoding
  assert(src != xnoreg, "sanity");
  vex_prefix(src, xnoreg, dst, VEX_SIMD_F3, vector256);
  emit_byte(0x7F);
  emit_operand(src, dst);
}

// Uses zero extension on 64bit

void Assembler::movl(Register dst, int32_t imm32) {
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xB8 | encode);
  emit_long(imm32);
}

void Assembler::movl(Register dst, Register src) {
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x8B);
  emit_byte(0xC0 | encode);
}

void Assembler::movl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x8B);
  emit_operand(dst, src);
}

void Assembler::movl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0xC7);
  emit_operand(rax, dst, 4);
  emit_long(imm32);
}

void Assembler::movl(Address dst, Register src) {
  InstructionMark im(this);
  prefix(dst, src);
  emit_byte(0x89);
  emit_operand(src, dst);
}
// Newer CPUs require movsd and movss to avoid a partial register stall
// when loading from memory. But on old Opterons, use movlpd instead of movsd.
// The selection is done in MacroAssembler::movdbl() and movflt().
void Assembler::movlpd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x12, dst, src, VEX_SIMD_66);
}

void Assembler::movq( MMXRegister dst, Address src ) {
  assert( VM_Version::supports_mmx(), "" );
  emit_byte(0x0F);
  emit_byte(0x6F);
  emit_operand(dst, src);
}

void Assembler::movq( Address dst, MMXRegister src ) {
  assert( VM_Version::supports_mmx(), "" );
  emit_byte(0x0F);
  emit_byte(0x7F);
  // workaround for a gcc (3.2.1-7a) bug
  // In that version of gcc, with only an emit_operand(MMX, Address)
  // available, gcc would tail-jump and try to reverse the parameters,
  // completely obliterating dst in the process. Having a version
  // available that doesn't need to swap the args at the tail jump
  // avoids the bug.
  emit_operand(dst, src);
}

void Assembler::movq(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_F3);
  emit_byte(0x7E);
  emit_operand(dst, src);
}

void Assembler::movq(Address dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_66);
  emit_byte(0xD6);
  emit_operand(src, dst);
}

void Assembler::movsbl(Register dst, Address src) { // movsxb
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x0F);
  emit_byte(0xBE);
  emit_operand(dst, src);
}

void Assembler::movsbl(Register dst, Register src) { // movsxb
  NOT_LP64(assert(src->has_byte_register(), "must have byte register"));
  int encode = prefix_and_encode(dst->encoding(), src->encoding(), true);
  emit_byte(0x0F);
  emit_byte(0xBE);
  emit_byte(0xC0 | encode);
}

void Assembler::movsd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x10, dst, src, VEX_SIMD_F2);
}

void Assembler::movsd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith_nonds(0x10, dst, src, VEX_SIMD_F2);
}

void Assembler::movsd(Address dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_F2);
  emit_byte(0x11);
  emit_operand(src, dst);
}

void Assembler::movss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith(0x10, dst, src, VEX_SIMD_F3);
}

void Assembler::movss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith_nonds(0x10, dst, src, VEX_SIMD_F3);
}

void Assembler::movss(Address dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_F3);
  emit_byte(0x11);
  emit_operand(src, dst);
}

void Assembler::movswl(Register dst, Address src) { // movsxw
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x0F);
  emit_byte(0xBF);
  emit_operand(dst, src);
}

void Assembler::movswl(Register dst, Register src) { // movsxw
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBF);
  emit_byte(0xC0 | encode);
}

void Assembler::movw(Address dst, int imm16) {
  InstructionMark im(this);
  emit_byte(0x66); // switch to 16-bit mode
  prefix(dst);
  emit_byte(0xC7);
  emit_operand(rax, dst, 2);
  emit_word(imm16);
}

void Assembler::movw(Register dst, Address src) {
  InstructionMark im(this);
  emit_byte(0x66);
  prefix(src, dst);
  emit_byte(0x8B);
  emit_operand(dst, src);
}

void Assembler::movw(Address dst, Register src) {
  InstructionMark im(this);
  emit_byte(0x66);
  prefix(dst, src);
  emit_byte(0x89);
  emit_operand(src, dst);
}

void Assembler::movzbl(Register dst, Address src) { // movzxb
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x0F);
  emit_byte(0xB6);
  emit_operand(dst, src);
}

void Assembler::movzbl(Register dst, Register src) { // movzxb
  NOT_LP64(assert(src->has_byte_register(), "must have byte register"));
  int encode = prefix_and_encode(dst->encoding(), src->encoding(), true);
  emit_byte(0x0F);
  emit_byte(0xB6);
  emit_byte(0xC0 | encode);
}

void Assembler::movzwl(Register dst, Address src) { // movzxw
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x0F);
  emit_byte(0xB7);
  emit_operand(dst, src);
}

void Assembler::movzwl(Register dst, Register src) { // movzxw
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xB7);
  emit_byte(0xC0 | encode);
}

void Assembler::mull(Address src) {
  InstructionMark im(this);
  prefix(src);
  emit_byte(0xF7);
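  // F7 /4: MUL r/m32 (rsp here only supplies the /4 reg-field encoding)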
  emit_operand(rsp, src);
}

void Assembler::mull(Register src) {
  int encode = prefix_and_encode(src->encoding());
  emit_byte(0xF7);
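  // ModRM byte 0xE0 | encode selects the /4 (MUL) form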
  emit_byte(0xE0 | encode);
}

void Assembler::mulsd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x59, dst, src, VEX_SIMD_F2);
}

void Assembler::mulsd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x59, dst, src, VEX_SIMD_F2);
}

void Assembler::mulss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith(0x59, dst, src, VEX_SIMD_F3);
}

void Assembler::mulss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith(0x59, dst, src, VEX_SIMD_F3);
}

void Assembler::negl(Register dst) {
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xF7);
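  // ModRM byte 0xD8 | encode selects the /3 (NEG) form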
  emit_byte(0xD8 | encode);
}

void Assembler::nop(int i) {
#ifdef ASSERT
  assert(i > 0, " ");
  // The fancy nops aren't currently recognized by debuggers, making it a
  // pain to disassemble code while debugging. If asserts are on, speed is
  // clearly not an issue, so simply use the traditional single-byte nop
  // to do alignment.

  for (; i > 0 ; i--) emit_byte(0x90);
  return;

#endif // ASSERT

  if (UseAddressNop && VM_Version::is_intel()) {
    //
    // Using multi-byte nops "0x0F 0x1F [address]" for Intel
    //  1: 0x90
    //  2: 0x66 0x90
    //  3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding)
    //  4: 0x0F 0x1F 0x40 0x00
    //  5: 0x0F 0x1F 0x44 0x00 0x00
    //  6: 0x66 0x0F 0x1F 0x44 0x00 0x00
    //  7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
    //  8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
    //  9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
    // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
    // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00

    // The remaining encodings are Intel-specific - don't use consecutive address nops

    // 12: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
    // 13: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
    // 14: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
    // 15: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90

    while (i >= 15) {
      // For Intel don't generate consecutive address nops (mix with regular nops)
      i -= 15;
      emit_byte(0x66);   // size prefix
      emit_byte(0x66);   // size prefix
      emit_byte(0x66);   // size prefix
      addr_nop_8();
      emit_byte(0x66);   // size prefix
      emit_byte(0x66);   // size prefix
      emit_byte(0x66);   // size prefix
      emit_byte(0x90);   // nop
    }
    switch (i) {
      case 14:
        emit_byte(0x66); // size prefix
      case 13:
        emit_byte(0x66); // size prefix
      case 12:
        addr_nop_8();
        emit_byte(0x66); // size prefix
        emit_byte(0x66); // size prefix
        emit_byte(0x66); // size prefix
        emit_byte(0x90); // nop
        break;
      case 11:
        emit_byte(0x66); // size prefix
      case 10:
        emit_byte(0x66); // size prefix
      case 9:
        emit_byte(0x66); // size prefix
      case 8:
        addr_nop_8();
        break;
      case 7:
        addr_nop_7();
        break;
      case 6:
        emit_byte(0x66); // size prefix
      case 5:
        addr_nop_5();
        break;
      case 4:
        addr_nop_4();
        break;
      case 3:
        // Don't use "0x0F 0x1F 0x00" - need patching safe padding
        emit_byte(0x66); // size prefix
      case 2:
        emit_byte(0x66); // size prefix
      case 1:
        emit_byte(0x90); // nop
        break;
      default:
        assert(i == 0, " ");
    }
    return;
  }
  if (UseAddressNop && VM_Version::is_amd()) {
    //
    // Using multi-byte nops "0x0F 0x1F [address]" for AMD.
    //  1: 0x90
    //  2: 0x66 0x90
    //  3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding)
    //  4: 0x0F 0x1F 0x40 0x00
    //  5: 0x0F 0x1F 0x44 0x00 0x00
    //  6: 0x66 0x0F 0x1F 0x44 0x00 0x00
    //  7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
    //  8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
    //  9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
    // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
    // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00

    // The remaining encodings are AMD-specific - use consecutive address nops

    // 12: 0x66 0x0F 0x1F 0x44 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00
    // 13: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00
    // 14: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
    // 15: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
    // 16: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
    //     Size prefixes (0x66) are added for larger sizes

    while (i >= 22) {
      i -= 11;
      emit_byte(0x66); // size prefix
      emit_byte(0x66); // size prefix
      emit_byte(0x66); // size prefix
      addr_nop_8();
    }
    // Generate the first nop for sizes 21 down to 12
    switch (i) {
      case 21:
        i -= 1;
        emit_byte(0x66); // size prefix
      case 20:
      case 19:
        i -= 1;
        emit_byte(0x66); // size prefix
      case 18:
      case 17:
        i -= 1;
        emit_byte(0x66); // size prefix
      case 16:
      case 15:
        i -= 8;
        addr_nop_8();
        break;
      case 14:
      case 13:
        i -= 7;
        addr_nop_7();
        break;
      case 12:
        i -= 6;
        emit_byte(0x66); // size prefix
        addr_nop_5();
        break;
      default:
        assert(i < 12, " ");
    }

    // Generate the second nop for sizes 11 down to 1
    switch (i) {
      case 11:
        emit_byte(0x66); // size prefix
      case 10:
        emit_byte(0x66); // size prefix
      case 9:
        emit_byte(0x66); // size prefix
      case 8:
        addr_nop_8();
        break;
      case 7:
        addr_nop_7();
        break;
      case 6:
        emit_byte(0x66); // size prefix
      case 5:
        addr_nop_5();
        break;
      case 4:
        addr_nop_4();
        break;
      case 3:
        // Don't use "0x0F 0x1F 0x00" - need patching safe padding
        emit_byte(0x66); // size prefix
      case 2:
        emit_byte(0x66); // size prefix
      case 1:
        emit_byte(0x90); // nop
        break;
      default:
        assert(i == 0, " ");
    }
    return;
  }

  // Using nops with size prefixes "0x66 0x90".
  // From AMD Optimization Guide:
  //  1: 0x90
  //  2: 0x66 0x90
  //  3: 0x66 0x66 0x90
  //  4: 0x66 0x66 0x66 0x90
  //  5: 0x66 0x66 0x90 0x66 0x90
  //  6: 0x66 0x66 0x90 0x66 0x66 0x90
  //  7: 0x66 0x66 0x66 0x90 0x66 0x66 0x90
  //  8: 0x66 0x66 0x66 0x90 0x66 0x66 0x66 0x90
  //  9: 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90
  // 10: 0x66 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90
  //
  while (i > 12) {
    i -= 4;
    emit_byte(0x66); // size prefix
    emit_byte(0x66);
    emit_byte(0x66);
    emit_byte(0x90); // nop
  }
  // 1 - 12 nops
  if (i > 8) {
    if (i > 9) {
      i -= 1;
      emit_byte(0x66);
    }
    i -= 3;
    emit_byte(0x66);
    emit_byte(0x66);
    emit_byte(0x90);
  }
  // 1 - 8 nops
  if (i > 4) {
    if (i > 6) {
      i -= 1;
      emit_byte(0x66);
    }
    i -= 3;
    emit_byte(0x66);
    emit_byte(0x66);
    emit_byte(0x90);
  }
  switch (i) {
    case 4:
      emit_byte(0x66);
    case 3:
      emit_byte(0x66);
    case 2:
      emit_byte(0x66);
    case 1:
      emit_byte(0x90);
      break;
    default:
      assert(i == 0, " ");
  }
}

void Assembler::notl(Register dst) {
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xF7);
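  // ModRM byte 0xD0 | encode selects the /2 (NOT) form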
  emit_byte(0xD0 | encode);
}

void Assembler::orl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
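  // 81 /1 id: OR r/m32, imm32 (rcx supplies the /1 reg field)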
  emit_arith_operand(0x81, rcx, dst, imm32);
}

void Assembler::orl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xC8, dst, imm32);
}

void Assembler::orl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x0B);
  emit_operand(dst, src);
}

void Assembler::orl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x0B, 0xC0, dst, src);
}

void Assembler::packuswb(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
  emit_simd_arith(0x67, dst, src, VEX_SIMD_66);
}

void Assembler::packuswb(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x67, dst, src, VEX_SIMD_66);
}

void Assembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
  assert(VM_Version::supports_sse4_2(), "");
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A);
  emit_byte(0x61);
  emit_operand(dst, src);
  emit_byte(imm8);
}

void Assembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
  assert(VM_Version::supports_sse4_2(), "");
  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_3A);
  emit_byte(0x61);
  emit_byte(0xC0 | encode);
  emit_byte(imm8);
}

void Assembler::pmovzxbw(XMMRegister dst, Address src) {
  assert(VM_Version::supports_sse4_1(), "");
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
  emit_byte(0x30);
  emit_operand(dst, src);
}

void Assembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
  assert(VM_Version::supports_sse4_1(), "");
  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
  emit_byte(0x30);
  emit_byte(0xC0 | encode);
}

// generic
void Assembler::pop(Register dst) {
  int encode = prefix_and_encode(dst->encoding());
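  // 58 +rd: POP r32/r64 (register number is encoded in the opcode byte)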
  emit_byte(0x58 | encode);
}

void Assembler::popcntl(Register dst, Address src) {
  assert(VM_Version::supports_popcnt(), "must support");
  InstructionMark im(this);
  emit_byte(0xF3);
  prefix(src, dst);
  emit_byte(0x0F);
  emit_byte(0xB8);
  emit_operand(dst, src);
}

void Assembler::popcntl(Register dst, Register src) {
  assert(VM_Version::supports_popcnt(), "must support");
  emit_byte(0xF3);
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xB8);
  emit_byte(0xC0 | encode);
}

void Assembler::popf() {
  emit_byte(0x9D);
}

#ifndef _LP64 // no 32-bit push/pop on amd64
void Assembler::popl(Address dst) {
  // NOTE: on 64-bit this would adjust the stack by 8 bytes
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0x8F);
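  // 8F /0: POP r/m32 (rax supplies the /0 reg field)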
  emit_operand(rax, dst);
}
#endif

void Assembler::prefetch_prefix(Address src) {
  prefix(src);
  emit_byte(0x0F);
}

void Assembler::prefetchnta(Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), "must support"));
  InstructionMark im(this);
  prefetch_prefix(src);
  emit_byte(0x18);
  emit_operand(rax, src); // 0, src
}

void Assembler::prefetchr(Address src) {
  assert(VM_Version::supports_3dnow_prefetch(), "must support");
  InstructionMark im(this);
  prefetch_prefix(src);
  emit_byte(0x0D);
  emit_operand(rax, src); // 0, src
}

void Assembler::prefetcht0(Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), "must support"));
  InstructionMark im(this);
  prefetch_prefix(src);
  emit_byte(0x18);
  emit_operand(rcx, src); // 1, src
}

void Assembler::prefetcht1(Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), "must support"));
  InstructionMark im(this);
  prefetch_prefix(src);
  emit_byte(0x18);
  emit_operand(rdx, src); // 2, src
}

void Assembler::prefetcht2(Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), "must support"));
  InstructionMark im(this);
  prefetch_prefix(src);
  emit_byte(0x18);
  emit_operand(rbx, src); // 3, src
}

void Assembler::prefetchw(Address src) {
  assert(VM_Version::supports_3dnow_prefetch(), "must support");
  InstructionMark im(this);
  prefetch_prefix(src);
  emit_byte(0x0D);
  emit_operand(rcx, src); // 1, src
}

void Assembler::prefix(Prefix p) {
  a_byte(p);
}

void Assembler::pshufd(XMMRegister dst, XMMRegister src, int mode) {
  assert(isByte(mode), "invalid value");
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith_nonds(0x70, dst, src, VEX_SIMD_66);
  emit_byte(mode & 0xFF);
}

void Assembler::pshufd(XMMRegister dst, Address src, int mode) {
  assert(isByte(mode), "invalid value");
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_66);
  emit_byte(0x70);
  emit_operand(dst, src);
  emit_byte(mode & 0xFF);
}

void Assembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
  assert(isByte(mode), "invalid value");
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith_nonds(0x70, dst, src, VEX_SIMD_F2);
  emit_byte(mode & 0xFF);
}

void Assembler::pshuflw(XMMRegister dst, Address src, int mode) {
  assert(isByte(mode), "invalid value");
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_F2);
  emit_byte(0x70);
  emit_operand(dst, src);
  emit_byte(mode & 0xFF);
}

void Assembler::psrldq(XMMRegister dst, int shift) {
  // Shifts the 128-bit value in the xmm register by a number of bytes.
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
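  // XMM3 is for /3 encoding: 66 0F 73 /3 ib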
  int encode = simd_prefix_and_encode(xmm3, dst, dst, VEX_SIMD_66);
  emit_byte(0x73);
  emit_byte(0xC0 | encode);
  emit_byte(shift);
}

void Assembler::ptest(XMMRegister dst, Address src) {
  assert(VM_Version::supports_sse4_1(), "");
  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
  emit_byte(0x17);
  emit_operand(dst, src);
}

void Assembler::ptest(XMMRegister dst, XMMRegister src) {
  assert(VM_Version::supports_sse4_1(), "");
  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
  emit_byte(0x17);
  emit_byte(0xC0 | encode);
}

void Assembler::punpcklbw(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
  emit_simd_arith(0x60, dst, src, VEX_SIMD_66);
}

void Assembler::punpcklbw(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x60, dst, src, VEX_SIMD_66);
}

void Assembler::punpckldq(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
  emit_simd_arith(0x62, dst, src, VEX_SIMD_66);
}

void Assembler::punpckldq(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x62, dst, src, VEX_SIMD_66);
}

void Assembler::punpcklqdq(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x6C, dst, src, VEX_SIMD_66);
}

void Assembler::push(int32_t imm32) {
  // in 64-bit mode we push 64 bits onto the stack but only
  // take a 32-bit immediate
  emit_byte(0x68);
  emit_long(imm32);
}

void Assembler::push(Register src) {
  int encode = prefix_and_encode(src->encoding());
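  // 50 +rd: PUSH r32/r64 (register number is encoded in the opcode byte)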
  emit_byte(0x50 | encode);
}

void Assembler::pushf() {
  emit_byte(0x9C);
}

#ifndef _LP64 // no 32-bit push/pop on amd64
void Assembler::pushl(Address src) {
  // Note: on 64-bit this would push 64 bits
  InstructionMark im(this);
  prefix(src);
  emit_byte(0xFF);
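  // FF /6: PUSH r/m32 (rsi supplies the /6 reg field)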
  emit_operand(rsi, src);
}
#endif

void Assembler::rcll(Register dst, int imm8) {
  assert(isShiftCount(imm8), "illegal shift count");
  int encode = prefix_and_encode(dst->encoding());
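  // D1 /2 (shift by one) or C1 /2 ib: RCL r/m32; 0xD0 | encode supplies the /2 form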
  if (imm8 == 1) {
    emit_byte(0xD1);
    emit_byte(0xD0 | encode);
  } else {
    emit_byte(0xC1);
    emit_byte(0xD0 | encode);
    emit_byte(imm8);
  }
}

// copies rcx pointer-sized words from [esi] to [edi]
// generic
void Assembler::rep_mov() {
  emit_byte(0xF3);
  // MOVSQ
  LP64_ONLY(prefix(REX_W));
  emit_byte(0xA5);
}

// stores rax into rcx pointer-sized words starting at [edi]
// generic
void Assembler::rep_set() { // rep_set
  emit_byte(0xF3);
  // STOSQ
  LP64_ONLY(prefix(REX_W));
  emit_byte(0xAB);
}

// scans rcx pointer-sized words at [edi] for an occurrence of rax
// generic
void Assembler::repne_scan() { // repne_scan
  emit_byte(0xF2);
  // SCASQ
  LP64_ONLY(prefix(REX_W));
  emit_byte(0xAF);
}

#ifdef _LP64
// scans rcx 4-byte words at [edi] for an occurrence of rax
// generic
void Assembler::repne_scanl() { // repne_scan
  emit_byte(0xF2);
  // SCASL
  emit_byte(0xAF);
}
#endif

void Assembler::ret(int imm16) {
  if (imm16 == 0) {
    emit_byte(0xC3);
  } else {
    emit_byte(0xC2);
    emit_word(imm16);
  }
}

void Assembler::sahf() {
#ifdef _LP64
  // Not supported in 64-bit mode
  ShouldNotReachHere();
#endif
  emit_byte(0x9E);
}

void Assembler::sarl(Register dst, int imm8) {
  int encode = prefix_and_encode(dst->encoding());
  assert(isShiftCount(imm8), "illegal shift count");
  if (imm8 == 1) {
    emit_byte(0xD1);
    emit_byte(0xF8 | encode);
  } else {
    emit_byte(0xC1);
    emit_byte(0xF8 | encode);
    emit_byte(imm8);
  }
}

void Assembler::sarl(Register dst) {
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xD3);
  emit_byte(0xF8 | encode);
}

void Assembler::sbbl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
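  // 81 /3 id: SBB r/m32, imm32 (rbx supplies the /3 reg field)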
  emit_arith_operand(0x81, rbx, dst, imm32);
}

void Assembler::sbbl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xD8, dst, imm32);
}

void Assembler::sbbl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x1B);
  emit_operand(dst, src);
}

void Assembler::sbbl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x1B, 0xC0, dst, src);
}

void Assembler::setb(Condition cc, Register dst) {
  assert(0 <= cc && cc < 16, "illegal cc");
  int encode = prefix_and_encode(dst->encoding(), true);
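  // 0F (90 | cc): SETcc r/m8 (condition is encoded in the opcode byte)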
  emit_byte(0x0F);
  emit_byte(0x90 | cc);
  emit_byte(0xC0 | encode);
}

void Assembler::shll(Register dst, int imm8) {
  assert(isShiftCount(imm8), "illegal shift count");
  int encode = prefix_and_encode(dst->encoding());
  if (imm8 == 1) {
    emit_byte(0xD1);
    emit_byte(0xE0 | encode);
  } else {
    emit_byte(0xC1);
    emit_byte(0xE0 | encode);
    emit_byte(imm8);
  }
}

void Assembler::shll(Register dst) {
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xD3);
  emit_byte(0xE0 | encode);
}

void Assembler::shrl(Register dst, int imm8) {
  assert(isShiftCount(imm8), "illegal shift count");
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xC1);
  emit_byte(0xE8 | encode);
  emit_byte(imm8);
}

void Assembler::shrl(Register dst) {
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xD3);
  emit_byte(0xE8 | encode);
}

// copies a single word from [esi] to [edi]
void Assembler::smovl() {
  emit_byte(0xA5);
}

void Assembler::sqrtsd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x51, dst, src, VEX_SIMD_F2);
}

void Assembler::sqrtsd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x51, dst, src, VEX_SIMD_F2);
}

void Assembler::sqrtss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith(0x51, dst, src, VEX_SIMD_F3);
}

void Assembler::sqrtss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith(0x51, dst, src, VEX_SIMD_F3);
}

void Assembler::stmxcsr(Address dst) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0x0F);
  emit_byte(0xAE);
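  // 0F AE /3: STMXCSR m32 (register 3 supplies the /3 reg field)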
  emit_operand(as_Register(3), dst);
}

void Assembler::subl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
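  // 81 /5 id: SUB r/m32, imm32 (rbp supplies the /5 reg field)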
  emit_arith_operand(0x81, rbp, dst, imm32);
}

void Assembler::subl(Address dst, Register src) {
  InstructionMark im(this);
  prefix(dst, src);
  emit_byte(0x29);
  emit_operand(src, dst);
}

void Assembler::subl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xE8, dst, imm32);
}

// Force generation of a 4-byte immediate value even if it fits into 8 bits
void Assembler::subl_imm32(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith_imm32(0x81, 0xE8, dst, imm32);
}

void Assembler::subl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x2B);
  emit_operand(dst, src);
}

void Assembler::subl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x2B, 0xC0, dst, src);
}

void Assembler::subsd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x5C, dst, src, VEX_SIMD_F2);
}

void Assembler::subsd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x5C, dst, src, VEX_SIMD_F2);
}

void Assembler::subss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith(0x5C, dst, src, VEX_SIMD_F3);
}

void Assembler::subss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith(0x5C, dst, src, VEX_SIMD_F3);
}

void Assembler::testb(Register dst, int imm8) {
  NOT_LP64(assert(dst->has_byte_register(), "must have byte register"));
  (void) prefix_and_encode(dst->encoding(), true);
  emit_arith_b(0xF6, 0xC0, dst, imm8);
}

void Assembler::testl(Register dst, int32_t imm32) {
  // not using emit_arith because test
  // doesn't support sign-extension of
  // 8-bit operands
  int encode = dst->encoding();
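  // rax/eax has the short-form encoding A9 id (TEST EAX, imm32)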
  if (encode == 0) {
    emit_byte(0xA9);
  } else {
    encode = prefix_and_encode(encode);
    emit_byte(0xF7);
    emit_byte(0xC0 | encode);
  }
  emit_long(imm32);
}

void Assembler::testl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x85, 0xC0, dst, src);
}

void Assembler::testl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x85);
  emit_operand(dst, src);
}

void Assembler::ucomisd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_66);
}

void Assembler::ucomisd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_66);
}

void Assembler::ucomiss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_NONE);
}

void Assembler::ucomiss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_NONE);
}

void Assembler::xaddl(Address dst, Register src) {
  InstructionMark im(this);
  prefix(dst, src);
  emit_byte(0x0F);
  emit_byte(0xC1);
  emit_operand(src, dst);
}

void Assembler::xchgl(Register dst, Address src) { // xchg
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x87);
  emit_operand(dst, src);
}

void Assembler::xchgl(Register dst, Register src) {
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x87);
  emit_byte(0xC0 | encode);
}

void Assembler::xorl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xF0, dst, imm32);
}

void Assembler::xorl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x33);
  emit_operand(dst, src);
}

void Assembler::xorl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x33, 0xC0, dst, src);
}

// AVX 3-operand scalar floating-point arithmetic instructions

void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
}

void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
}

void Assembler::vaddss(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
}

void Assembler::vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
}

void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
}

void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
}

void Assembler::vdivss(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
}

void Assembler::vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
}

void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
}

void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
}

void Assembler::vmulss(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
}

void Assembler::vmulss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
}

void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
}

void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
}

void Assembler::vsubss(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
}

void Assembler::vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
}

//====================VECTOR ARITHMETIC=====================================

// Floating-point vector arithmetic

void Assembler::addpd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x58, dst, src, VEX_SIMD_66);
}

void Assembler::addps(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x58, dst, src, VEX_SIMD_NONE);
}

void Assembler::vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_NONE, vector256);
}

void Assembler::vaddpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::vaddps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_NONE, vector256);
}

void Assembler::subpd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x5C, dst, src, VEX_SIMD_66);
}

void Assembler::subps(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x5C, dst, src, VEX_SIMD_NONE);
}

void Assembler::vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_NONE, vector256);
}

void Assembler::vsubpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::vsubps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_NONE, vector256);
}

void Assembler::mulpd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x59, dst, src, VEX_SIMD_66);
}

void Assembler::mulps(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x59, dst, src, VEX_SIMD_NONE);
}

void Assembler::vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_NONE, vector256);
}

void Assembler::vmulpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::vmulps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_NONE, vector256);
}

void Assembler::divpd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x5E, dst, src, VEX_SIMD_66);
}

void Assembler::divps(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x5E, dst, src, VEX_SIMD_NONE);
}

void Assembler::vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_NONE, vector256);
}

void Assembler::vdivpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::vdivps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_NONE, vector256);
}

void Assembler::andpd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x54, dst, src, VEX_SIMD_66);
}

void Assembler::andps(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith(0x54, dst, src, VEX_SIMD_NONE);
}

void Assembler::andps(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith(0x54, dst, src, VEX_SIMD_NONE);
}

void Assembler::andpd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x54, dst, src, VEX_SIMD_66);
}

void Assembler::vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_NONE, vector256);
}

void Assembler::vandpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::vandps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_NONE, vector256);
}

void Assembler::xorpd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x57, dst, src, VEX_SIMD_66);
}

void Assembler::xorps(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith(0x57, dst, src, VEX_SIMD_NONE);
}

void Assembler::xorpd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x57, dst, src, VEX_SIMD_66);
}

void Assembler::xorps(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith(0x57, dst, src, VEX_SIMD_NONE);
}

void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_NONE, vector256);
}

void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::vxorps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_NONE, vector256);
}

// Integer vector arithmetic
void Assembler::paddb(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0xFC, dst, src, VEX_SIMD_66);
}

void Assembler::paddw(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0xFD, dst, src, VEX_SIMD_66);
}

void Assembler::paddd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0xFE, dst, src, VEX_SIMD_66);
}

void Assembler::paddq(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0xD4, dst, src, VEX_SIMD_66);
}

void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
  emit_vex_arith(0xFC, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
  emit_vex_arith(0xFD, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
  emit_vex_arith(0xFE, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
  emit_vex_arith(0xD4, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
  emit_vex_arith(0xFC, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
  emit_vex_arith(0xFD, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::vpaddd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
  emit_vex_arith(0xFE, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::vpaddq(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
  emit_vex_arith(0xD4, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::psubb(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0xF8, dst, src, VEX_SIMD_66);
}

void Assembler::psubw(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0xF9, dst, src, VEX_SIMD_66);
}

void Assembler::psubd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0xFA, dst, src, VEX_SIMD_66);
}

void Assembler::psubq(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0xFB, dst, src, VEX_SIMD_66);
}

void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
  emit_vex_arith(0xF8, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
  emit_vex_arith(0xF9, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::vpsubd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
  emit_vex_arith(0xFA, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
  emit_vex_arith(0xFB, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
  emit_vex_arith(0xF8, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
  emit_vex_arith(0xF9, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::vpsubd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
  emit_vex_arith(0xFA, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::vpsubq(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
  emit_vex_arith(0xFB, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::pmullw(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0xD5, dst, src, VEX_SIMD_66);
}

void Assembler::pmulld(XMMRegister dst, XMMRegister src) {
  assert(VM_Version::supports_sse4_1(), "");
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
  emit_byte(0x40);
  emit_byte(0xC0 | encode);
}

void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
  emit_vex_arith(0xD5, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_38);
  emit_byte(0x40);
  emit_byte(0xC0 | encode);
}

void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
  emit_vex_arith(0xD5, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::vpmulld(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
  InstructionMark im(this);
  int dst_enc = dst->encoding();
  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
  vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector256);
  emit_byte(0x40);
  emit_operand(dst, src);
}

// Shift packed integers left by specified number of bits.
void Assembler::psllw(XMMRegister dst, int shift) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  // XMM6 is for /6 encoding: 66 0F 71 /6 ib
  int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66);
  emit_byte(0x71);
  emit_byte(0xC0 | encode);
  emit_byte(shift & 0xFF);
}

void Assembler::pslld(XMMRegister dst, int shift) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  // XMM6 is for /6 encoding: 66 0F 72 /6 ib
  int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66);
  emit_byte(0x72);
  emit_byte(0xC0 | encode);
  emit_byte(shift & 0xFF);
}

void Assembler::psllq(XMMRegister dst, int shift) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  // XMM6 is for /6 encoding: 66 0F 73 /6 ib
  int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66);
  emit_byte(0x73);
  emit_byte(0xC0 | encode);
  emit_byte(shift & 0xFF);
}

void Assembler::psllw(XMMRegister dst, XMMRegister shift) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0xF1, dst, shift, VEX_SIMD_66);
}

void Assembler::pslld(XMMRegister dst, XMMRegister shift) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0xF2, dst, shift, VEX_SIMD_66);
}

void Assembler::psllq(XMMRegister dst, XMMRegister shift) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0xF3, dst, shift, VEX_SIMD_66);
}

void Assembler::vpsllw(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
  // XMM6 is for /6 encoding: 66 0F 71 /6 ib
  emit_vex_arith(0x71, xmm6, dst, src, VEX_SIMD_66, vector256);
  emit_byte(shift & 0xFF);
}

void Assembler::vpslld(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
  // XMM6 is for /6 encoding: 66 0F 72 /6 ib
  emit_vex_arith(0x72, xmm6, dst, src, VEX_SIMD_66, vector256);
  emit_byte(shift & 0xFF);
}

void Assembler::vpsllq(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
  // XMM6 is for /6 encoding: 66 0F 73 /6 ib
  emit_vex_arith(0x73, xmm6, dst, src, VEX_SIMD_66, vector256);
  emit_byte(shift & 0xFF);
}

void Assembler::vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
  emit_vex_arith(0xF1, dst, src, shift, VEX_SIMD_66, vector256);
}

void Assembler::vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
  emit_vex_arith(0xF2, dst, src, shift, VEX_SIMD_66, vector256);
}

void Assembler::vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
  emit_vex_arith(0xF3, dst, src, shift, VEX_SIMD_66, vector256);
}

// Shift packed integers logically right by specified number of bits.
void Assembler::psrlw(XMMRegister dst, int shift) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  // XMM2 is for /2 encoding: 66 0F 71 /2 ib
  int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66);
  emit_byte(0x71);
  emit_byte(0xC0 | encode);
  emit_byte(shift & 0xFF);
}

void Assembler::psrld(XMMRegister dst, int shift) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  // XMM2 is for /2 encoding: 66 0F 72 /2 ib
  int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66);
  emit_byte(0x72);
  emit_byte(0xC0 | encode);
  emit_byte(shift & 0xFF);
}

3325void Assembler::psrlq(XMMRegister dst, int shift) {
3326  // Do not confuse it with the psrldq SSE2 instruction, which
3327  // shifts the 128-bit value in an xmm register by a number of bytes.
3328  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3329  // XMM2 is for /2 encoding: 66 0F 73 /2 ib
3330  int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66);
3331  emit_byte(0x73);
3332  emit_byte(0xC0 | encode);
3333  emit_byte(shift & 0xFF);
3334}
3335
3336void Assembler::psrlw(XMMRegister dst, XMMRegister shift) {
3337  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3338  emit_simd_arith(0xD1, dst, shift, VEX_SIMD_66);
3339}
3340
3341void Assembler::psrld(XMMRegister dst, XMMRegister shift) {
3342  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3343  emit_simd_arith(0xD2, dst, shift, VEX_SIMD_66);
3344}
3345
3346void Assembler::psrlq(XMMRegister dst, XMMRegister shift) {
3347  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3348  emit_simd_arith(0xD3, dst, shift, VEX_SIMD_66);
3349}
3350
3351void Assembler::vpsrlw(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
3352  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3353  // XMM2 is for /2 encoding: 66 0F 71 /2 ib
3354  emit_vex_arith(0x71, xmm2, dst, src, VEX_SIMD_66, vector256);
3355  emit_byte(shift & 0xFF);
3356}
3357
3358void Assembler::vpsrld(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
3359  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3360  // XMM2 is for /2 encoding: 66 0F 72 /2 ib
3361  emit_vex_arith(0x72, xmm2, dst, src, VEX_SIMD_66, vector256);
3362  emit_byte(shift & 0xFF);
3363}
3364
3365void Assembler::vpsrlq(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
3366  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3367  // XMM2 is for /2 encoding: 66 0F 73 /2 ib
3368  emit_vex_arith(0x73, xmm2, dst, src, VEX_SIMD_66, vector256);
3369  emit_byte(shift & 0xFF);
3370}
3371
3372void Assembler::vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
3373  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3374  emit_vex_arith(0xD1, dst, src, shift, VEX_SIMD_66, vector256);
3375}
3376
3377void Assembler::vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
3378  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3379  emit_vex_arith(0xD2, dst, src, shift, VEX_SIMD_66, vector256);
3380}
3381
3382void Assembler::vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
3383  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3384  emit_vex_arith(0xD3, dst, src, shift, VEX_SIMD_66, vector256);
3385}
3386
3387// Shift packed integers arithmetically right by the specified number of bits.
3388void Assembler::psraw(XMMRegister dst, int shift) {
3389  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3390  // XMM4 is for /4 encoding: 66 0F 71 /4 ib
3391  int encode = simd_prefix_and_encode(xmm4, dst, dst, VEX_SIMD_66);
3392  emit_byte(0x71);
3393  emit_byte(0xC0 | encode);
3394  emit_byte(shift & 0xFF);
3395}
3396
3397void Assembler::psrad(XMMRegister dst, int shift) {
3398  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3399  // XMM4 is for /4 encoding: 66 0F 72 /4 ib
3400  int encode = simd_prefix_and_encode(xmm4, dst, dst, VEX_SIMD_66);
3401  emit_byte(0x72);
3402  emit_byte(0xC0 | encode);
3403  emit_byte(shift & 0xFF);
3404}
3405
3406void Assembler::psraw(XMMRegister dst, XMMRegister shift) {
3407  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3408  emit_simd_arith(0xE1, dst, shift, VEX_SIMD_66);
3409}
3410
3411void Assembler::psrad(XMMRegister dst, XMMRegister shift) {
3412  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3413  emit_simd_arith(0xE2, dst, shift, VEX_SIMD_66);
3414}
3415
3416void Assembler::vpsraw(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
3417  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3418  // XMM4 is for /4 encoding: 66 0F 71 /4 ib
3419  emit_vex_arith(0x71, xmm4, dst, src, VEX_SIMD_66, vector256);
3420  emit_byte(shift & 0xFF);
3421}
3422
3423void Assembler::vpsrad(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
3424  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3425  // XMM4 is for /4 encoding: 66 0F 72 /4 ib
3426  emit_vex_arith(0x72, xmm4, dst, src, VEX_SIMD_66, vector256);
3427  emit_byte(shift & 0xFF);
3428}
3429
3430void Assembler::vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
3431  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3432  emit_vex_arith(0xE1, dst, src, shift, VEX_SIMD_66, vector256);
3433}
3434
3435void Assembler::vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
3436  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3437  emit_vex_arith(0xE2, dst, src, shift, VEX_SIMD_66, vector256);
3438}
3439
3440
3441// AND packed integers
3442void Assembler::pand(XMMRegister dst, XMMRegister src) {
3443  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3444  emit_simd_arith(0xDB, dst, src, VEX_SIMD_66);
3445}
3446
3447void Assembler::vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
3448  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3449  emit_vex_arith(0xDB, dst, nds, src, VEX_SIMD_66, vector256);
3450}
3451
3452void Assembler::vpand(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
3453  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3454  emit_vex_arith(0xDB, dst, nds, src, VEX_SIMD_66, vector256);
3455}
3456
3457void Assembler::por(XMMRegister dst, XMMRegister src) {
3458  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3459  emit_simd_arith(0xEB, dst, src, VEX_SIMD_66);
3460}
3461
3462void Assembler::vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
3463  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3464  emit_vex_arith(0xEB, dst, nds, src, VEX_SIMD_66, vector256);
3465}
3466
3467void Assembler::vpor(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
3468  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3469  emit_vex_arith(0xEB, dst, nds, src, VEX_SIMD_66, vector256);
3470}
3471
3472void Assembler::pxor(XMMRegister dst, XMMRegister src) {
3473  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3474  emit_simd_arith(0xEF, dst, src, VEX_SIMD_66);
3475}
3476
3477void Assembler::vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
3478  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3479  emit_vex_arith(0xEF, dst, nds, src, VEX_SIMD_66, vector256);
3480}
3481
3482void Assembler::vpxor(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
3483  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3484  emit_vex_arith(0xEF, dst, nds, src, VEX_SIMD_66, vector256);
3485}
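
// Worked example, assuming the legacy SSE encoding: pxor(xmm0, xmm1)
// assembles to 66 0F EF C1, with ModRM 0xC0 | (0 << 3) | 1 encoding the
// classic two-operand destructive form (dst = xmm0, src = xmm1).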
3486
3487
3488void Assembler::vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src) {
3489  assert(VM_Version::supports_avx(), "");
3490  bool vector256 = true;
3491  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_3A);
3492  emit_byte(0x18);
3493  emit_byte(0xC0 | encode);
3494  // 0x00 - insert into lower 128 bits
3495  // 0x01 - insert into upper 128 bits
3496  emit_byte(0x01);
3497}
3498
3499void Assembler::vinsertf128h(XMMRegister dst, Address src) {
3500  assert(VM_Version::supports_avx(), "");
3501  InstructionMark im(this);
3502  bool vector256 = true;
3503  assert(dst != xnoreg, "sanity");
3504  int dst_enc = dst->encoding();
3505  // swap src<->dst for encoding
3506  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector256);
3507  emit_byte(0x18);
3508  emit_operand(dst, src);
3509  // 0x01 - insert into upper 128 bits
3510  emit_byte(0x01);
3511}
3512
3513void Assembler::vextractf128h(Address dst, XMMRegister src) {
3514  assert(VM_Version::supports_avx(), "");
3515  InstructionMark im(this);
3516  bool vector256 = true;
3517  assert(src != xnoreg, "sanity");
3518  int src_enc = src->encoding();
3519  vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector256);
3520  emit_byte(0x19);
3521  emit_operand(src, dst);
3522  // 0x01 - extract from upper 128 bits
3523  emit_byte(0x01);
3524}
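
// Worked example: vinsertf128h(xmm0, xmm0, xmm1) assembles to
// C4 E3 7D 18 C1 01, assuming all register encodings below 8:
//   C4 E3 7D - three-byte VEX (required for the 0F 3A opcode map):
//              ~RXB=111, mmmmm=00011, W=0, ~vvvv=1111 (vvvv = xmm0), L=1, pp=01
//   18 C1    - opcode plus ModRM 0xC0 | (0 << 3) | 1
//   01       - imm8 selecting the upper 128-bit lane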
3525
3526void Assembler::vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src) {
3527  assert(VM_Version::supports_avx2(), "");
3528  bool vector256 = true;
3529  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_3A);
3530  emit_byte(0x38);
3531  emit_byte(0xC0 | encode);
3532  // 0x00 - insert into lower 128 bits
3533  // 0x01 - insert into upper 128 bits
3534  emit_byte(0x01);
3535}
3536
3537void Assembler::vinserti128h(XMMRegister dst, Address src) {
3538  assert(VM_Version::supports_avx2(), "");
3539  InstructionMark im(this);
3540  bool vector256 = true;
3541  assert(dst != xnoreg, "sanity");
3542  int dst_enc = dst->encoding();
3543  // swap src<->dst for encoding
3544  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector256);
3545  emit_byte(0x38);
3546  emit_operand(dst, src);
3547  // 0x01 - insert into upper 128 bits
3548  emit_byte(0x01);
3549}
3550
3551void Assembler::vextracti128h(Address dst, XMMRegister src) {
3552  assert(VM_Version::supports_avx2(), "");
3553  InstructionMark im(this);
3554  bool vector256 = true;
3555  assert(src != xnoreg, "sanity");
3556  int src_enc = src->encoding();
3557  vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector256);
3558  emit_byte(0x39);
3559  emit_operand(src, dst);
3560  // 0x01 - extract from upper 128 bits
3561  emit_byte(0x01);
3562}
3563
3564void Assembler::vzeroupper() {
3565  assert(VM_Version::supports_avx(), "");
3566  (void)vex_prefix_and_encode(xmm0, xmm0, xmm0, VEX_SIMD_NONE);
3567  emit_byte(0x77);
3568}
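
// For reference, the sequence above assembles to C5 F8 77, the canonical
// VZEROUPPER encoding; it is produced through the generic VEX machinery
// rather than emitted as literal bytes.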
3569
3570
3571#ifndef _LP64
3572// 32bit only pieces of the assembler
3573
3574void Assembler::cmp_literal32(Register src1, int32_t imm32, RelocationHolder const& rspec) {
3575  // NO PREFIX AS NEVER 64BIT
3576  InstructionMark im(this);
3577  emit_byte(0x81);
3578  emit_byte(0xF8 | src1->encoding());
3579  emit_data(imm32, rspec, 0);
3580}
3581
3582void Assembler::cmp_literal32(Address src1, int32_t imm32, RelocationHolder const& rspec) {
3583// NO PREFIX AS NEVER 64BIT (not even 32bit versions of 64bit regs)
3584  InstructionMark im(this);
3585  emit_byte(0x81);
3586  emit_operand(rdi, src1);
3587  emit_data(imm32, rspec, 0);
3588}
3589
3590// The 64-bit cmpxchg (on a 32-bit platform) compares the value at adr with the contents
3591// of rdx:rax and, if they are equal, stores rcx:rbx into adr; otherwise, the value at adr
3592// is loaded into rdx:rax.  The ZF is set if the compared values were equal, and cleared otherwise.
3593void Assembler::cmpxchg8(Address adr) {
3594  InstructionMark im(this);
3595  emit_byte(0x0F);
3596  emit_byte(0xC7);
3597  emit_operand(rcx, adr);
3598}
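
// A minimal usage sketch (obj/lo/hi are placeholder names, not code in
// this file):
//   movl(rax, Address(obj, lo));   // rdx:rax = expected value
//   movl(rdx, Address(obj, hi));
//   // rcx:rbx = new value, set up by the caller
//   lock();                        // F0 prefix makes the exchange atomic
//   cmpxchg8(Address(obj, lo));    // ZF set iff the 64-bit swap succeeded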
3599
3600void Assembler::decl(Register dst) {
3601  // Don't use it directly. Use MacroAssembler::decrementl() instead.
3602  emit_byte(0x48 | dst->encoding());
3603}
3604
3605#endif // _LP64
3606
3607// 64bit typically doesn't use the x87, but still needs it for the trig functions
3608
3609void Assembler::fabs() {
3610  emit_byte(0xD9);
3611  emit_byte(0xE1);
3612}
3613
3614void Assembler::fadd(int i) {
3615  emit_farith(0xD8, 0xC0, i);
3616}
3617
3618void Assembler::fadd_d(Address src) {
3619  InstructionMark im(this);
3620  emit_byte(0xDC);
3621  emit_operand32(rax, src);
3622}
3623
3624void Assembler::fadd_s(Address src) {
3625  InstructionMark im(this);
3626  emit_byte(0xD8);
3627  emit_operand32(rax, src);
3628}
3629
3630void Assembler::fadda(int i) {
3631  emit_farith(0xDC, 0xC0, i);
3632}
3633
3634void Assembler::faddp(int i) {
3635  emit_farith(0xDE, 0xC0, i);
3636}
3637
3638void Assembler::fchs() {
3639  emit_byte(0xD9);
3640  emit_byte(0xE0);
3641}
3642
3643void Assembler::fcom(int i) {
3644  emit_farith(0xD8, 0xD0, i);
3645}
3646
3647void Assembler::fcomp(int i) {
3648  emit_farith(0xD8, 0xD8, i);
3649}
3650
3651void Assembler::fcomp_d(Address src) {
3652  InstructionMark im(this);
3653  emit_byte(0xDC);
3654  emit_operand32(rbx, src);
3655}
3656
3657void Assembler::fcomp_s(Address src) {
3658  InstructionMark im(this);
3659  emit_byte(0xD8);
3660  emit_operand32(rbx, src);
3661}
3662
3663void Assembler::fcompp() {
3664  emit_byte(0xDE);
3665  emit_byte(0xD9);
3666}
3667
3668void Assembler::fcos() {
3669  emit_byte(0xD9);
3670  emit_byte(0xFF);
3671}
3672
3673void Assembler::fdecstp() {
3674  emit_byte(0xD9);
3675  emit_byte(0xF6);
3676}
3677
3678void Assembler::fdiv(int i) {
3679  emit_farith(0xD8, 0xF0, i);
3680}
3681
3682void Assembler::fdiv_d(Address src) {
3683  InstructionMark im(this);
3684  emit_byte(0xDC);
3685  emit_operand32(rsi, src);
3686}
3687
3688void Assembler::fdiv_s(Address src) {
3689  InstructionMark im(this);
3690  emit_byte(0xD8);
3691  emit_operand32(rsi, src);
3692}
3693
3694void Assembler::fdiva(int i) {
3695  emit_farith(0xDC, 0xF8, i);
3696}
3697
3698// Note: The Intel manual (Pentium Processor User's Manual, Vol.3, 1994)
3699//       is erroneous for some of the floating-point instructions below.
3700
3701void Assembler::fdivp(int i) {
3702  emit_farith(0xDE, 0xF8, i);                    // ST(0) <- ST(0) / ST(1) and pop (Intel manual wrong)
3703}
3704
3705void Assembler::fdivr(int i) {
3706  emit_farith(0xD8, 0xF8, i);
3707}
3708
3709void Assembler::fdivr_d(Address src) {
3710  InstructionMark im(this);
3711  emit_byte(0xDC);
3712  emit_operand32(rdi, src);
3713}
3714
3715void Assembler::fdivr_s(Address src) {
3716  InstructionMark im(this);
3717  emit_byte(0xD8);
3718  emit_operand32(rdi, src);
3719}
3720
3721void Assembler::fdivra(int i) {
3722  emit_farith(0xDC, 0xF0, i);
3723}
3724
3725void Assembler::fdivrp(int i) {
3726  emit_farith(0xDE, 0xF0, i);                    // ST(0) <- ST(1) / ST(0) and pop (Intel manual wrong)
3727}
3728
3729void Assembler::ffree(int i) {
3730  emit_farith(0xDD, 0xC0, i);
3731}
3732
3733void Assembler::fild_d(Address adr) {
3734  InstructionMark im(this);
3735  emit_byte(0xDF);
3736  emit_operand32(rbp, adr);
3737}
3738
3739void Assembler::fild_s(Address adr) {
3740  InstructionMark im(this);
3741  emit_byte(0xDB);
3742  emit_operand32(rax, adr);
3743}
3744
3745void Assembler::fincstp() {
3746  emit_byte(0xD9);
3747  emit_byte(0xF7);
3748}
3749
3750void Assembler::finit() {
3751  emit_byte(0x9B);
3752  emit_byte(0xDB);
3753  emit_byte(0xE3);
3754}
3755
3756void Assembler::fist_s(Address adr) {
3757  InstructionMark im(this);
3758  emit_byte(0xDB);
3759  emit_operand32(rdx, adr);
3760}
3761
3762void Assembler::fistp_d(Address adr) {
3763  InstructionMark im(this);
3764  emit_byte(0xDF);
3765  emit_operand32(rdi, adr);
3766}
3767
3768void Assembler::fistp_s(Address adr) {
3769  InstructionMark im(this);
3770  emit_byte(0xDB);
3771  emit_operand32(rbx, adr);
3772}
3773
3774void Assembler::fld1() {
3775  emit_byte(0xD9);
3776  emit_byte(0xE8);
3777}
3778
3779void Assembler::fld_d(Address adr) {
3780  InstructionMark im(this);
3781  emit_byte(0xDD);
3782  emit_operand32(rax, adr);
3783}
3784
3785void Assembler::fld_s(Address adr) {
3786  InstructionMark im(this);
3787  emit_byte(0xD9);
3788  emit_operand32(rax, adr);
3789}
3790
3791
3792void Assembler::fld_s(int index) {
3793  emit_farith(0xD9, 0xC0, index);
3794}
3795
3796void Assembler::fld_x(Address adr) {
3797  InstructionMark im(this);
3798  emit_byte(0xDB);
3799  emit_operand32(rbp, adr);
3800}
3801
3802void Assembler::fldcw(Address src) {
3803  InstructionMark im(this);
3804  emit_byte(0xD9);
3805  emit_operand32(rbp, src);
3806}
3807
3808void Assembler::fldenv(Address src) {
3809  InstructionMark im(this);
3810  emit_byte(0xD9);
3811  emit_operand32(rsp, src);
3812}
3813
3814void Assembler::fldlg2() {
3815  emit_byte(0xD9);
3816  emit_byte(0xEC);
3817}
3818
3819void Assembler::fldln2() {
3820  emit_byte(0xD9);
3821  emit_byte(0xED);
3822}
3823
3824void Assembler::fldz() {
3825  emit_byte(0xD9);
3826  emit_byte(0xEE);
3827}
3828
3829void Assembler::flog() {
3830  fldln2();
3831  fxch();
3832  fyl2x();
3833}
3834
3835void Assembler::flog10() {
3836  fldlg2();
3837  fxch();
3838  fyl2x();
3839}
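
// Both expansions rely on the x87 identity log_b(x) = log_b(2) * log2(x):
// fldln2/fldlg2 push the constant, fxch brings x back to ST(0), and fyl2x
// computes ST(1) * log2(ST(0)) and pops, leaving ln(x) resp. log10(x).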
3840
3841void Assembler::fmul(int i) {
3842  emit_farith(0xD8, 0xC8, i);
3843}
3844
3845void Assembler::fmul_d(Address src) {
3846  InstructionMark im(this);
3847  emit_byte(0xDC);
3848  emit_operand32(rcx, src);
3849}
3850
3851void Assembler::fmul_s(Address src) {
3852  InstructionMark im(this);
3853  emit_byte(0xD8);
3854  emit_operand32(rcx, src);
3855}
3856
3857void Assembler::fmula(int i) {
3858  emit_farith(0xDC, 0xC8, i);
3859}
3860
3861void Assembler::fmulp(int i) {
3862  emit_farith(0xDE, 0xC8, i);
3863}
3864
3865void Assembler::fnsave(Address dst) {
3866  InstructionMark im(this);
3867  emit_byte(0xDD);
3868  emit_operand32(rsi, dst);
3869}
3870
3871void Assembler::fnstcw(Address src) {
3872  InstructionMark im(this);
3873  emit_byte(0x9B);
3874  emit_byte(0xD9);
3875  emit_operand32(rdi, src);
3876}
3877
3878void Assembler::fnstsw_ax() {
3879  emit_byte(0xDF);
3880  emit_byte(0xE0);
3881}
3882
3883void Assembler::fprem() {
3884  emit_byte(0xD9);
3885  emit_byte(0xF8);
3886}
3887
3888void Assembler::fprem1() {
3889  emit_byte(0xD9);
3890  emit_byte(0xF5);
3891}
3892
3893void Assembler::frstor(Address src) {
3894  InstructionMark im(this);
3895  emit_byte(0xDD);
3896  emit_operand32(rsp, src);
3897}
3898
3899void Assembler::fsin() {
3900  emit_byte(0xD9);
3901  emit_byte(0xFE);
3902}
3903
3904void Assembler::fsqrt() {
3905  emit_byte(0xD9);
3906  emit_byte(0xFA);
3907}
3908
3909void Assembler::fst_d(Address adr) {
3910  InstructionMark im(this);
3911  emit_byte(0xDD);
3912  emit_operand32(rdx, adr);
3913}
3914
3915void Assembler::fst_s(Address adr) {
3916  InstructionMark im(this);
3917  emit_byte(0xD9);
3918  emit_operand32(rdx, adr);
3919}
3920
3921void Assembler::fstp_d(Address adr) {
3922  InstructionMark im(this);
3923  emit_byte(0xDD);
3924  emit_operand32(rbx, adr);
3925}
3926
3927void Assembler::fstp_d(int index) {
3928  emit_farith(0xDD, 0xD8, index);
3929}
3930
3931void Assembler::fstp_s(Address adr) {
3932  InstructionMark im(this);
3933  emit_byte(0xD9);
3934  emit_operand32(rbx, adr);
3935}
3936
3937void Assembler::fstp_x(Address adr) {
3938  InstructionMark im(this);
3939  emit_byte(0xDB);
3940  emit_operand32(rdi, adr);
3941}
3942
3943void Assembler::fsub(int i) {
3944  emit_farith(0xD8, 0xE0, i);
3945}
3946
3947void Assembler::fsub_d(Address src) {
3948  InstructionMark im(this);
3949  emit_byte(0xDC);
3950  emit_operand32(rsp, src);
3951}
3952
3953void Assembler::fsub_s(Address src) {
3954  InstructionMark im(this);
3955  emit_byte(0xD8);
3956  emit_operand32(rsp, src);
3957}
3958
3959void Assembler::fsuba(int i) {
3960  emit_farith(0xDC, 0xE8, i);
3961}
3962
3963void Assembler::fsubp(int i) {
3964  emit_farith(0xDE, 0xE8, i);                    // ST(0) <- ST(0) - ST(1) and pop (Intel manual wrong)
3965}
3966
3967void Assembler::fsubr(int i) {
3968  emit_farith(0xD8, 0xE8, i);
3969}
3970
3971void Assembler::fsubr_d(Address src) {
3972  InstructionMark im(this);
3973  emit_byte(0xDC);
3974  emit_operand32(rbp, src);
3975}
3976
3977void Assembler::fsubr_s(Address src) {
3978  InstructionMark im(this);
3979  emit_byte(0xD8);
3980  emit_operand32(rbp, src);
3981}
3982
3983void Assembler::fsubra(int i) {
3984  emit_farith(0xDC, 0xE0, i);
3985}
3986
3987void Assembler::fsubrp(int i) {
3988  emit_farith(0xDE, 0xE0, i);                    // ST(0) <- ST(1) - ST(0) and pop (Intel manual wrong)
3989}
3990
3991void Assembler::ftan() {
3992  emit_byte(0xD9);
3993  emit_byte(0xF2);                               // fptan: tan(ST0), then pushes 1.0
3994  emit_byte(0xDD);
3995  emit_byte(0xD8);                               // fstp st(0): pop the 1.0 again
3996}
3997
3998void Assembler::ftst() {
3999  emit_byte(0xD9);
4000  emit_byte(0xE4);
4001}
4002
4003void Assembler::fucomi(int i) {
4004  // make sure the instruction is supported (introduced for P6, together with cmov)
4005  guarantee(VM_Version::supports_cmov(), "illegal instruction");
4006  emit_farith(0xDB, 0xE8, i);
4007}
4008
4009void Assembler::fucomip(int i) {
4010  // make sure the instruction is supported (introduced for P6, together with cmov)
4011  guarantee(VM_Version::supports_cmov(), "illegal instruction");
4012  emit_farith(0xDF, 0xE8, i);
4013}
4014
4015void Assembler::fwait() {
4016  emit_byte(0x9B);
4017}
4018
4019void Assembler::fxch(int i) {
4020  emit_farith(0xD9, 0xC8, i);
4021}
4022
4023void Assembler::fyl2x() {
4024  emit_byte(0xD9);
4025  emit_byte(0xF1);
4026}
4027
4028void Assembler::frndint() {
4029  emit_byte(0xD9);
4030  emit_byte(0xFC);
4031}
4032
4033void Assembler::f2xm1() {
4034  emit_byte(0xD9);
4035  emit_byte(0xF0);
4036}
4037
4038void Assembler::fldl2e() {
4039  emit_byte(0xD9);
4040  emit_byte(0xEA);
4041}
4042
4043// SSE SIMD prefix byte values corresponding to VexSimdPrefix encoding.
4044static int simd_pre[4] = { 0, 0x66, 0xF3, 0xF2 };
4045// SSE opcode second byte values (first is 0x0F) corresponding to VexOpcode encoding.
4046static int simd_opc[4] = { 0,    0, 0x38, 0x3A };
4047
4048// Generate SSE legacy REX prefix and SIMD opcode based on VEX encoding.
4049void Assembler::rex_prefix(Address adr, XMMRegister xreg, VexSimdPrefix pre, VexOpcode opc, bool rex_w) {
4050  if (pre > 0) {
4051    emit_byte(simd_pre[pre]);
4052  }
4053  if (rex_w) {
4054    prefixq(adr, xreg);
4055  } else {
4056    prefix(adr, xreg);
4057  }
4058  if (opc > 0) {
4059    emit_byte(0x0F);
4060    int opc2 = simd_opc[opc];
4061    if (opc2 > 0) {
4062      emit_byte(opc2);
4063    }
4064  }
4065}
4066
4067int Assembler::rex_prefix_and_encode(int dst_enc, int src_enc, VexSimdPrefix pre, VexOpcode opc, bool rex_w) {
4068  if (pre > 0) {
4069    emit_byte(simd_pre[pre]);
4070  }
4071  int encode = (rex_w) ? prefixq_and_encode(dst_enc, src_enc) :
4072                          prefix_and_encode(dst_enc, src_enc);
4073  if (opc > 0) {
4074    emit_byte(0x0F);
4075    int opc2 = simd_opc[opc];
4076    if (opc2 > 0) {
4077      emit_byte(opc2);
4078    }
4079  }
4080  return encode;
4081}
4082
4083
4084void Assembler::vex_prefix(bool vex_r, bool vex_b, bool vex_x, bool vex_w, int nds_enc, VexSimdPrefix pre, VexOpcode opc, bool vector256) {
4085  if (vex_b || vex_x || vex_w || (opc == VEX_OPCODE_0F_38) || (opc == VEX_OPCODE_0F_3A)) {
4086    prefix(VEX_3bytes);
4087
4088    int byte1 = (vex_r ? VEX_R : 0) | (vex_x ? VEX_X : 0) | (vex_b ? VEX_B : 0);
4089    byte1 = (~byte1) & 0xE0;
4090    byte1 |= opc;
4091    emit_byte(byte1);
4092
4093    int byte2 = ((~nds_enc) & 0xf) << 3;
4094    byte2 |= (vex_w ? VEX_W : 0) | (vector256 ? 4 : 0) | pre;
4095    emit_byte(byte2);
4096  } else {
4097    prefix(VEX_2bytes);
4098
4099    int byte1 = vex_r ? VEX_R : 0;
4100    byte1 = (~byte1) & 0x80;
4101    byte1 |= ((~nds_enc) & 0xf) << 3;
4102    byte1 |= (vector256 ? 4 : 0) | pre;
4103    emit_byte(byte1);
4104  }
4105}
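
// For reference, a sketch of the two prefix forms built above:
//   3-byte: C4 | ~R ~X ~B mmmmm | W ~vvvv L pp
//   2-byte: C5 | ~R ~vvvv L pp
// e.g. nds_enc = 1, pre = VEX_SIMD_66, 128-bit, and no REX-extended operands
// selects the 2-byte form with byte1 = 0x80 | (0xE << 3) | 0x01 = 0xF1.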
4106
4107void Assembler::vex_prefix(Address adr, int nds_enc, int xreg_enc, VexSimdPrefix pre, VexOpcode opc, bool vex_w, bool vector256){
4108  bool vex_r = (xreg_enc >= 8);
4109  bool vex_b = adr.base_needs_rex();
4110  bool vex_x = adr.index_needs_rex();
4111  vex_prefix(vex_r, vex_b, vex_x, vex_w, nds_enc, pre, opc, vector256);
4112}
4113
4114int Assembler::vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc, VexSimdPrefix pre, VexOpcode opc, bool vex_w, bool vector256) {
4115  bool vex_r = (dst_enc >= 8);
4116  bool vex_b = (src_enc >= 8);
4117  bool vex_x = false;
4118  vex_prefix(vex_r, vex_b, vex_x, vex_w, nds_enc, pre, opc, vector256);
4119  return (((dst_enc & 7) << 3) | (src_enc & 7));
4120}
4121
4122
4123void Assembler::simd_prefix(XMMRegister xreg, XMMRegister nds, Address adr, VexSimdPrefix pre, VexOpcode opc, bool rex_w, bool vector256) {
4124  if (UseAVX > 0) {
4125    int xreg_enc = xreg->encoding();
4126    int  nds_enc = nds->is_valid() ? nds->encoding() : 0;
4127    vex_prefix(adr, nds_enc, xreg_enc, pre, opc, rex_w, vector256);
4128  } else {
4129    assert((nds == xreg) || (nds == xnoreg), "wrong sse encoding");
4130    rex_prefix(adr, xreg, pre, opc, rex_w);
4131  }
4132}
4133
4134int Assembler::simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src, VexSimdPrefix pre, VexOpcode opc, bool rex_w, bool vector256) {
4135  int dst_enc = dst->encoding();
4136  int src_enc = src->encoding();
4137  if (UseAVX > 0) {
4138    int nds_enc = nds->is_valid() ? nds->encoding() : 0;
4139    return vex_prefix_and_encode(dst_enc, nds_enc, src_enc, pre, opc, rex_w, vector256);
4140  } else {
4141    assert((nds == dst) || (nds == src) || (nds == xnoreg), "wrong sse encoding");
4142    return rex_prefix_and_encode(dst_enc, src_enc, pre, opc, rex_w);
4143  }
4144}
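
// Note that this is where legacy SSE instructions are transparently promoted
// to their VEX-encoded forms when UseAVX > 0: pxor(xmm0, xmm1), for example,
// then assembles to C5 F9 EF C1 (VPXOR xmm0, xmm0, xmm1) instead of
// 66 0F EF C1.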
4145
4146void Assembler::emit_simd_arith(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre) {
4147  InstructionMark im(this);
4148  simd_prefix(dst, dst, src, pre);
4149  emit_byte(opcode);
4150  emit_operand(dst, src);
4151}
4152
4153void Assembler::emit_simd_arith(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre) {
4154  int encode = simd_prefix_and_encode(dst, dst, src, pre);
4155  emit_byte(opcode);
4156  emit_byte(0xC0 | encode);
4157}
4158
4159// Versions with no second source register (non-destructive source).
4160void Assembler::emit_simd_arith_nonds(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre) {
4161  InstructionMark im(this);
4162  simd_prefix(dst, xnoreg, src, pre);
4163  emit_byte(opcode);
4164  emit_operand(dst, src);
4165}
4166
4167void Assembler::emit_simd_arith_nonds(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre) {
4168  int encode = simd_prefix_and_encode(dst, xnoreg, src, pre);
4169  emit_byte(opcode);
4170  emit_byte(0xC0 | encode);
4171}
4172
4173// 3-operands AVX instructions
4174void Assembler::emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds,
4175                               Address src, VexSimdPrefix pre, bool vector256) {
4176  InstructionMark im(this);
4177  vex_prefix(dst, nds, src, pre, vector256);
4178  emit_byte(opcode);
4179  emit_operand(dst, src);
4180}
4181
4182void Assembler::emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds,
4183                               XMMRegister src, VexSimdPrefix pre, bool vector256) {
4184  int encode = vex_prefix_and_encode(dst, nds, src, pre, vector256);
4185  emit_byte(opcode);
4186  emit_byte(0xC0 | encode);
4187}
4188
4189#ifndef _LP64
4190
4191void Assembler::incl(Register dst) {
4192  // Don't use it directly. Use MacroAssembler::incrementl() instead.
4193  emit_byte(0x40 | dst->encoding());
4194}
4195
4196void Assembler::lea(Register dst, Address src) {
4197  leal(dst, src);
4198}
4199
4200void Assembler::mov_literal32(Address dst, int32_t imm32,  RelocationHolder const& rspec) {
4201  InstructionMark im(this);
4202  emit_byte(0xC7);
4203  emit_operand(rax, dst);
4204  emit_data((int)imm32, rspec, 0);
4205}
4206
4207void Assembler::mov_literal32(Register dst, int32_t imm32, RelocationHolder const& rspec) {
4208  InstructionMark im(this);
4209  int encode = prefix_and_encode(dst->encoding());
4210  emit_byte(0xB8 | encode);
4211  emit_data((int)imm32, rspec, 0);
4212}
4213
4214void Assembler::popa() { // 32bit
4215  emit_byte(0x61);
4216}
4217
4218void Assembler::push_literal32(int32_t imm32, RelocationHolder const& rspec) {
4219  InstructionMark im(this);
4220  emit_byte(0x68);
4221  emit_data(imm32, rspec, 0);
4222}
4223
4224void Assembler::pusha() { // 32bit
4225  emit_byte(0x60);
4226}
4227
4228void Assembler::set_byte_if_not_zero(Register dst) {
4229  emit_byte(0x0F);
4230  emit_byte(0x95);
4231  emit_byte(0xE0 | dst->encoding());
4232}
4233
4234void Assembler::shldl(Register dst, Register src) {
4235  emit_byte(0x0F);
4236  emit_byte(0xA5);
4237  emit_byte(0xC0 | src->encoding() << 3 | dst->encoding());
4238}
4239
4240void Assembler::shrdl(Register dst, Register src) {
4241  emit_byte(0x0F);
4242  emit_byte(0xAD);
4243  emit_byte(0xC0 | src->encoding() << 3 | dst->encoding());
4244}
4245
4246#else // LP64
4247
4248void Assembler::set_byte_if_not_zero(Register dst) {
4249  int enc = prefix_and_encode(dst->encoding(), true);
4250  emit_byte(0x0F);
4251  emit_byte(0x95);
4252  emit_byte(0xE0 | enc);
4253}
4254
4255// 64bit only pieces of the assembler
4256// This should only be used by 64bit instructions that can use rip-relative
4257// addressing; it cannot be used by instructions that want an immediate value.
4258
4259bool Assembler::reachable(AddressLiteral adr) {
4260  int64_t disp;
4261  // None will force a 64bit literal to the code stream. Likely a placeholder
4262  // for something that will be patched later, and we need to be certain it
4263  // will always be reachable.
4264  if (adr.reloc() == relocInfo::none) {
4265    return false;
4266  }
4267  if (adr.reloc() == relocInfo::internal_word_type) {
4268    // This should be rip relative and easily reachable.
4269    return true;
4270  }
4271  if (adr.reloc() == relocInfo::virtual_call_type ||
4272      adr.reloc() == relocInfo::opt_virtual_call_type ||
4273      adr.reloc() == relocInfo::static_call_type ||
4274      adr.reloc() == relocInfo::static_stub_type ) {
4275    // This should be rip relative within the code cache and easily
4276    // reachable until we get huge code caches. (At which point
4277    // ic code is going to have issues).
4278    return true;
4279  }
4280  if (adr.reloc() != relocInfo::external_word_type &&
4281      adr.reloc() != relocInfo::poll_return_type &&  // these are really external_word but need special
4282      adr.reloc() != relocInfo::poll_type &&         // relocs to identify them
4283      adr.reloc() != relocInfo::runtime_call_type ) {
4284    return false;
4285  }
4286
4287  // Stress the correction code
4288  if (ForceUnreachable) {
4289    // Must be a runtime_call reloc; see if it is in the code cache
4290    // Flipping stuff in the codecache to be unreachable causes issues
4291    // with things like inline caches where the additional instructions
4292    // are not handled.
4293    if (CodeCache::find_blob(adr._target) == NULL) {
4294      return false;
4295    }
4296  }
4297  // For external_word_type/runtime_call_type if it is reachable from where we
4298  // are now (possibly a temp buffer) and where we might end up
4299  // anywhere in the codeCache then we are always reachable.
4300  // This would have to change if we ever save/restore shared code
4301  // to be more pessimistic.
4302  disp = (int64_t)adr._target - ((int64_t)CodeCache::low_bound() + sizeof(int));
4303  if (!is_simm32(disp)) return false;
4304  disp = (int64_t)adr._target - ((int64_t)CodeCache::high_bound() + sizeof(int));
4305  if (!is_simm32(disp)) return false;
4306
4307  disp = (int64_t)adr._target - ((int64_t)_code_pos + sizeof(int));
4308
4309  // Because rip relative is a disp + address_of_next_instruction and we
4310  // don't know the value of address_of_next_instruction we apply a fudge factor
4311  // to make sure we will be ok no matter the size of the instruction we get placed into.
4312  // We don't have to fudge the checks above here because they are already worst case.
4313
4314  // 12 == override/rex byte, opcode byte, rm byte, sib byte, a 4-byte disp, 4-byte literal
4315  // + 4 because better safe than sorry.
4316  const int fudge = 12 + 4;
4317  if (disp < 0) {
4318    disp -= fudge;
4319  } else {
4320    disp += fudge;
4321  }
4322  return is_simm32(disp);
4323}
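
// Illustrative arithmetic with made-up addresses: if the code cache spans
// [0x7f0000000000, 0x7f0004000000) and adr._target is 0x7f0040000000, the
// two bound checks above compute displacements of roughly 2^30 and below,
// both simm32, so the target is rip-reachable even before the fudge factor.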
4324
4325// Returns true if the polling page is not reachable from the code cache
4326// using rip-relative addressing.
4327bool Assembler::is_polling_page_far() {
4328  intptr_t addr = (intptr_t)os::get_polling_page();
4329  return ForceUnreachable ||
4330         !is_simm32(addr - (intptr_t)CodeCache::low_bound()) ||
4331         !is_simm32(addr - (intptr_t)CodeCache::high_bound());
4332}
4333
4334void Assembler::emit_data64(jlong data,
4335                            relocInfo::relocType rtype,
4336                            int format) {
4337  if (rtype == relocInfo::none) {
4338    emit_long64(data);
4339  } else {
4340    emit_data64(data, Relocation::spec_simple(rtype), format);
4341  }
4342}
4343
4344void Assembler::emit_data64(jlong data,
4345                            RelocationHolder const& rspec,
4346                            int format) {
4347  assert(imm_operand == 0, "default format must be immediate in this file");
4348  assert(imm_operand == format, "must be immediate");
4349  assert(inst_mark() != NULL, "must be inside InstructionMark");
4350  // Do not use AbstractAssembler::relocate, which is not intended for
4351  // embedded words.  Instead, relocate to the enclosing instruction.
4352  code_section()->relocate(inst_mark(), rspec, format);
4353#ifdef ASSERT
4354  check_relocation(rspec, format);
4355#endif
4356  emit_long64(data);
4357}
4358
4359int Assembler::prefix_and_encode(int reg_enc, bool byteinst) {
4360  if (reg_enc >= 8) {
4361    prefix(REX_B);
4362    reg_enc -= 8;
4363  } else if (byteinst && reg_enc >= 4) {
4364    prefix(REX);
4365  }
4366  return reg_enc;
4367}
4368
4369int Assembler::prefixq_and_encode(int reg_enc) {
4370  if (reg_enc < 8) {
4371    prefix(REX_W);
4372  } else {
4373    prefix(REX_WB);
4374    reg_enc -= 8;
4375  }
4376  return reg_enc;
4377}
4378
4379int Assembler::prefix_and_encode(int dst_enc, int src_enc, bool byteinst) {
4380  if (dst_enc < 8) {
4381    if (src_enc >= 8) {
4382      prefix(REX_B);
4383      src_enc -= 8;
4384    } else if (byteinst && src_enc >= 4) {
4385      prefix(REX);
4386    }
4387  } else {
4388    if (src_enc < 8) {
4389      prefix(REX_R);
4390    } else {
4391      prefix(REX_RB);
4392      src_enc -= 8;
4393    }
4394    dst_enc -= 8;
4395  }
4396  return dst_enc << 3 | src_enc;
4397}
4398
4399int Assembler::prefixq_and_encode(int dst_enc, int src_enc) {
4400  if (dst_enc < 8) {
4401    if (src_enc < 8) {
4402      prefix(REX_W);
4403    } else {
4404      prefix(REX_WB);
4405      src_enc -= 8;
4406    }
4407  } else {
4408    if (src_enc < 8) {
4409      prefix(REX_WR);
4410    } else {
4411      prefix(REX_WRB);
4412      src_enc -= 8;
4413    }
4414    dst_enc -= 8;
4415  }
4416  return dst_enc << 3 | src_enc;
4417}
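
// Worked example: movq(r10, rax) reaches this routine with dst_enc = 10 and
// src_enc = 0, emits REX_WR (0x4C), and returns encode = 0x10, so the final
// instruction bytes are 4C 8B D0.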
4418
4419void Assembler::prefix(Register reg) {
4420  if (reg->encoding() >= 8) {
4421    prefix(REX_B);
4422  }
4423}
4424
4425void Assembler::prefix(Address adr) {
4426  if (adr.base_needs_rex()) {
4427    if (adr.index_needs_rex()) {
4428      prefix(REX_XB);
4429    } else {
4430      prefix(REX_B);
4431    }
4432  } else {
4433    if (adr.index_needs_rex()) {
4434      prefix(REX_X);
4435    }
4436  }
4437}
4438
4439void Assembler::prefixq(Address adr) {
4440  if (adr.base_needs_rex()) {
4441    if (adr.index_needs_rex()) {
4442      prefix(REX_WXB);
4443    } else {
4444      prefix(REX_WB);
4445    }
4446  } else {
4447    if (adr.index_needs_rex()) {
4448      prefix(REX_WX);
4449    } else {
4450      prefix(REX_W);
4451    }
4452  }
4453}
4454
4455
4456void Assembler::prefix(Address adr, Register reg, bool byteinst) {
4457  if (reg->encoding() < 8) {
4458    if (adr.base_needs_rex()) {
4459      if (adr.index_needs_rex()) {
4460        prefix(REX_XB);
4461      } else {
4462        prefix(REX_B);
4463      }
4464    } else {
4465      if (adr.index_needs_rex()) {
4466        prefix(REX_X);
4467      } else if (byteinst && reg->encoding() >= 4 ) {
4468        prefix(REX);
4469      }
4470    }
4471  } else {
4472    if (adr.base_needs_rex()) {
4473      if (adr.index_needs_rex()) {
4474        prefix(REX_RXB);
4475      } else {
4476        prefix(REX_RB);
4477      }
4478    } else {
4479      if (adr.index_needs_rex()) {
4480        prefix(REX_RX);
4481      } else {
4482        prefix(REX_R);
4483      }
4484    }
4485  }
4486}
4487
4488void Assembler::prefixq(Address adr, Register src) {
4489  if (src->encoding() < 8) {
4490    if (adr.base_needs_rex()) {
4491      if (adr.index_needs_rex()) {
4492        prefix(REX_WXB);
4493      } else {
4494        prefix(REX_WB);
4495      }
4496    } else {
4497      if (adr.index_needs_rex()) {
4498        prefix(REX_WX);
4499      } else {
4500        prefix(REX_W);
4501      }
4502    }
4503  } else {
4504    if (adr.base_needs_rex()) {
4505      if (adr.index_needs_rex()) {
4506        prefix(REX_WRXB);
4507      } else {
4508        prefix(REX_WRB);
4509      }
4510    } else {
4511      if (adr.index_needs_rex()) {
4512        prefix(REX_WRX);
4513      } else {
4514        prefix(REX_WR);
4515      }
4516    }
4517  }
4518}
4519
4520void Assembler::prefix(Address adr, XMMRegister reg) {
4521  if (reg->encoding() < 8) {
4522    if (adr.base_needs_rex()) {
4523      if (adr.index_needs_rex()) {
4524        prefix(REX_XB);
4525      } else {
4526        prefix(REX_B);
4527      }
4528    } else {
4529      if (adr.index_needs_rex()) {
4530        prefix(REX_X);
4531      }
4532    }
4533  } else {
4534    if (adr.base_needs_rex()) {
4535      if (adr.index_needs_rex()) {
4536        prefix(REX_RXB);
4537      } else {
4538        prefix(REX_RB);
4539      }
4540    } else {
4541      if (adr.index_needs_rex()) {
4542        prefix(REX_RX);
4543      } else {
4544        prefix(REX_R);
4545      }
4546    }
4547  }
4548}
4549
4550void Assembler::prefixq(Address adr, XMMRegister src) {
4551  if (src->encoding() < 8) {
4552    if (adr.base_needs_rex()) {
4553      if (adr.index_needs_rex()) {
4554        prefix(REX_WXB);
4555      } else {
4556        prefix(REX_WB);
4557      }
4558    } else {
4559      if (adr.index_needs_rex()) {
4560        prefix(REX_WX);
4561      } else {
4562        prefix(REX_W);
4563      }
4564    }
4565  } else {
4566    if (adr.base_needs_rex()) {
4567      if (adr.index_needs_rex()) {
4568        prefix(REX_WRXB);
4569      } else {
4570        prefix(REX_WRB);
4571      }
4572    } else {
4573      if (adr.index_needs_rex()) {
4574        prefix(REX_WRX);
4575      } else {
4576        prefix(REX_WR);
4577      }
4578    }
4579  }
4580}
4581
4582void Assembler::adcq(Register dst, int32_t imm32) {
4583  (void) prefixq_and_encode(dst->encoding());
4584  emit_arith(0x81, 0xD0, dst, imm32);
4585}
4586
4587void Assembler::adcq(Register dst, Address src) {
4588  InstructionMark im(this);
4589  prefixq(src, dst);
4590  emit_byte(0x13);
4591  emit_operand(dst, src);
4592}
4593
4594void Assembler::adcq(Register dst, Register src) {
4595  (void) prefixq_and_encode(dst->encoding(), src->encoding());
4596  emit_arith(0x13, 0xC0, dst, src);
4597}
4598
4599void Assembler::addq(Address dst, int32_t imm32) {
4600  InstructionMark im(this);
4601  prefixq(dst);
4602  emit_arith_operand(0x81, rax, dst,imm32);
4603}
4604
4605void Assembler::addq(Address dst, Register src) {
4606  InstructionMark im(this);
4607  prefixq(dst, src);
4608  emit_byte(0x01);
4609  emit_operand(src, dst);
4610}
4611
4612void Assembler::addq(Register dst, int32_t imm32) {
4613  (void) prefixq_and_encode(dst->encoding());
4614  emit_arith(0x81, 0xC0, dst, imm32);
4615}
4616
4617void Assembler::addq(Register dst, Address src) {
4618  InstructionMark im(this);
4619  prefixq(src, dst);
4620  emit_byte(0x03);
4621  emit_operand(dst, src);
4622}
4623
4624void Assembler::addq(Register dst, Register src) {
4625  (void) prefixq_and_encode(dst->encoding(), src->encoding());
4626  emit_arith(0x03, 0xC0, dst, src);
4627}
4628
4629void Assembler::andq(Address dst, int32_t imm32) {
4630  InstructionMark im(this);
4631  prefixq(dst);
4632  emit_byte(0x81);
4633  emit_operand(rsp, dst, 4);
4634  emit_long(imm32);
4635}
4636
4637void Assembler::andq(Register dst, int32_t imm32) {
4638  (void) prefixq_and_encode(dst->encoding());
4639  emit_arith(0x81, 0xE0, dst, imm32);
4640}
4641
4642void Assembler::andq(Register dst, Address src) {
4643  InstructionMark im(this);
4644  prefixq(src, dst);
4645  emit_byte(0x23);
4646  emit_operand(dst, src);
4647}
4648
4649void Assembler::andq(Register dst, Register src) {
4650  (void) prefixq_and_encode(dst->encoding(), src->encoding());
4651  emit_arith(0x23, 0xC0, dst, src);
4652}
4653
4654void Assembler::bsfq(Register dst, Register src) {
4655  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
4656  emit_byte(0x0F);
4657  emit_byte(0xBC);
4658  emit_byte(0xC0 | encode);
4659}
4660
4661void Assembler::bsrq(Register dst, Register src) {
4662  assert(!VM_Version::supports_lzcnt(), "encoding is treated as LZCNT");
4663  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
4664  emit_byte(0x0F);
4665  emit_byte(0xBD);
4666  emit_byte(0xC0 | encode);
4667}
4668
4669void Assembler::bswapq(Register reg) {
4670  int encode = prefixq_and_encode(reg->encoding());
4671  emit_byte(0x0F);
4672  emit_byte(0xC8 | encode);
4673}
4674
4675void Assembler::cdqq() {
4676  prefix(REX_W);
4677  emit_byte(0x99);
4678}
4679
4680void Assembler::clflush(Address adr) {
4681  prefix(adr);
4682  emit_byte(0x0F);
4683  emit_byte(0xAE);
4684  emit_operand(rdi, adr);
4685}
4686
4687void Assembler::cmovq(Condition cc, Register dst, Register src) {
4688  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
4689  emit_byte(0x0F);
4690  emit_byte(0x40 | cc);
4691  emit_byte(0xC0 | encode);
4692}
4693
4694void Assembler::cmovq(Condition cc, Register dst, Address src) {
4695  InstructionMark im(this);
4696  prefixq(src, dst);
4697  emit_byte(0x0F);
4698  emit_byte(0x40 | cc);
4699  emit_operand(dst, src);
4700}
4701
4702void Assembler::cmpq(Address dst, int32_t imm32) {
4703  InstructionMark im(this);
4704  prefixq(dst);
4705  emit_byte(0x81);
4706  emit_operand(rdi, dst, 4);
4707  emit_long(imm32);
4708}
4709
4710void Assembler::cmpq(Register dst, int32_t imm32) {
4711  (void) prefixq_and_encode(dst->encoding());
4712  emit_arith(0x81, 0xF8, dst, imm32);
4713}
4714
4715void Assembler::cmpq(Address dst, Register src) {
4716  InstructionMark im(this);
4717  prefixq(dst, src);
4718  emit_byte(0x3B);
4719  emit_operand(src, dst);
4720}
4721
4722void Assembler::cmpq(Register dst, Register src) {
4723  (void) prefixq_and_encode(dst->encoding(), src->encoding());
4724  emit_arith(0x3B, 0xC0, dst, src);
4725}
4726
4727void Assembler::cmpq(Register dst, Address  src) {
4728  InstructionMark im(this);
4729  prefixq(src, dst);
4730  emit_byte(0x3B);
4731  emit_operand(dst, src);
4732}
4733
4734void Assembler::cmpxchgq(Register reg, Address adr) {
4735  InstructionMark im(this);
4736  prefixq(adr, reg);
4737  emit_byte(0x0F);
4738  emit_byte(0xB1);
4739  emit_operand(reg, adr);
4740}
4741
4742void Assembler::cvtsi2sdq(XMMRegister dst, Register src) {
4743  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
4744  int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F2);
4745  emit_byte(0x2A);
4746  emit_byte(0xC0 | encode);
4747}
4748
4749void Assembler::cvtsi2sdq(XMMRegister dst, Address src) {
4750  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
4751  InstructionMark im(this);
4752  simd_prefix_q(dst, dst, src, VEX_SIMD_F2);
4753  emit_byte(0x2A);
4754  emit_operand(dst, src);
4755}
4756
4757void Assembler::cvtsi2ssq(XMMRegister dst, Register src) {
4758  NOT_LP64(assert(VM_Version::supports_sse(), ""));
4759  int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F3);
4760  emit_byte(0x2A);
4761  emit_byte(0xC0 | encode);
4762}
4763
4764void Assembler::cvtsi2ssq(XMMRegister dst, Address src) {
4765  NOT_LP64(assert(VM_Version::supports_sse(), ""));
4766  InstructionMark im(this);
4767  simd_prefix_q(dst, dst, src, VEX_SIMD_F3);
4768  emit_byte(0x2A);
4769  emit_operand(dst, src);
4770}
4771
4772void Assembler::cvttsd2siq(Register dst, XMMRegister src) {
4773  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
4774  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_F2);
4775  emit_byte(0x2C);
4776  emit_byte(0xC0 | encode);
4777}
4778
4779void Assembler::cvttss2siq(Register dst, XMMRegister src) {
4780  NOT_LP64(assert(VM_Version::supports_sse(), ""));
4781  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_F3);
4782  emit_byte(0x2C);
4783  emit_byte(0xC0 | encode);
4784}
4785
4786void Assembler::decl(Register dst) {
4787  // Don't use it directly. Use MacroAssembler::decrementl() instead.
4788  // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
4789  int encode = prefix_and_encode(dst->encoding());
4790  emit_byte(0xFF);
4791  emit_byte(0xC8 | encode);
4792}
4793
4794void Assembler::decq(Register dst) {
4795  // Don't use it directly. Use MacroAssembler::decrementq() instead.
4796  // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
4797  int encode = prefixq_and_encode(dst->encoding());
4798  emit_byte(0xFF);
4799  emit_byte(0xC8 | encode);
4800}
4801
4802void Assembler::decq(Address dst) {
4803  // Don't use it directly. Use MacroAssembler::decrementq() instead.
4804  InstructionMark im(this);
4805  prefixq(dst);
4806  emit_byte(0xFF);
4807  emit_operand(rcx, dst);
4808}
4809
4810void Assembler::fxrstor(Address src) {
4811  prefixq(src);
4812  emit_byte(0x0F);
4813  emit_byte(0xAE);
4814  emit_operand(as_Register(1), src);
4815}
4816
4817void Assembler::fxsave(Address dst) {
4818  prefixq(dst);
4819  emit_byte(0x0F);
4820  emit_byte(0xAE);
4821  emit_operand(as_Register(0), dst);
4822}
4823
4824void Assembler::idivq(Register src) {
4825  int encode = prefixq_and_encode(src->encoding());
4826  emit_byte(0xF7);
4827  emit_byte(0xF8 | encode);
4828}
4829
4830void Assembler::imulq(Register dst, Register src) {
4831  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
4832  emit_byte(0x0F);
4833  emit_byte(0xAF);
4834  emit_byte(0xC0 | encode);
4835}
4836
4837void Assembler::imulq(Register dst, Register src, int value) {
4838  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
4839  if (is8bit(value)) {
4840    emit_byte(0x6B);
4841    emit_byte(0xC0 | encode);
4842    emit_byte(value & 0xFF);
4843  } else {
4844    emit_byte(0x69);
4845    emit_byte(0xC0 | encode);
4846    emit_long(value);
4847  }
4848}
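
// Worked example: imulq(rax, rbx, 16) takes the is8bit branch and emits
// 48 6B C3 10, while imulq(rax, rbx, 1000) emits 48 69 C3 E8 03 00 00 with
// a full 32-bit immediate.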
4849
4850void Assembler::incl(Register dst) {
4851  // Don't use it directly. Use MacroAssembler::incrementl() instead.
4852  // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
4853  int encode = prefix_and_encode(dst->encoding());
4854  emit_byte(0xFF);
4855  emit_byte(0xC0 | encode);
4856}
4857
4858void Assembler::incq(Register dst) {
4859  // Don't use it directly. Use MacroAssembler::incrementq() instead.
4860  // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
4861  int encode = prefixq_and_encode(dst->encoding());
4862  emit_byte(0xFF);
4863  emit_byte(0xC0 | encode);
4864}
4865
4866void Assembler::incq(Address dst) {
4867  // Don't use it directly. Use MacroAssembler::incrementq() instead.
4868  InstructionMark im(this);
4869  prefixq(dst);
4870  emit_byte(0xFF);
4871  emit_operand(rax, dst);
4872}
4873
4874void Assembler::lea(Register dst, Address src) {
4875  leaq(dst, src);
4876}
4877
4878void Assembler::leaq(Register dst, Address src) {
4879  InstructionMark im(this);
4880  prefixq(src, dst);
4881  emit_byte(0x8D);
4882  emit_operand(dst, src);
4883}
4884
4885void Assembler::mov64(Register dst, int64_t imm64) {
4886  InstructionMark im(this);
4887  int encode = prefixq_and_encode(dst->encoding());
4888  emit_byte(0xB8 | encode);
4889  emit_long64(imm64);
4890}
4891
4892void Assembler::mov_literal64(Register dst, intptr_t imm64, RelocationHolder const& rspec) {
4893  InstructionMark im(this);
4894  int encode = prefixq_and_encode(dst->encoding());
4895  emit_byte(0xB8 | encode);
4896  emit_data64(imm64, rspec);
4897}
4898
4899void Assembler::mov_narrow_oop(Register dst, int32_t imm32, RelocationHolder const& rspec) {
4900  InstructionMark im(this);
4901  int encode = prefix_and_encode(dst->encoding());
4902  emit_byte(0xB8 | encode);
4903  emit_data((int)imm32, rspec, narrow_oop_operand);
4904}
4905
4906void Assembler::mov_narrow_oop(Address dst, int32_t imm32,  RelocationHolder const& rspec) {
4907  InstructionMark im(this);
4908  prefix(dst);
4909  emit_byte(0xC7);
4910  emit_operand(rax, dst, 4);
4911  emit_data((int)imm32, rspec, narrow_oop_operand);
4912}
4913
4914void Assembler::cmp_narrow_oop(Register src1, int32_t imm32, RelocationHolder const& rspec) {
4915  InstructionMark im(this);
4916  int encode = prefix_and_encode(src1->encoding());
4917  emit_byte(0x81);
4918  emit_byte(0xF8 | encode);
4919  emit_data((int)imm32, rspec, narrow_oop_operand);
4920}
4921
4922void Assembler::cmp_narrow_oop(Address src1, int32_t imm32, RelocationHolder const& rspec) {
4923  InstructionMark im(this);
4924  prefix(src1);
4925  emit_byte(0x81);
4926  emit_operand(rax, src1, 4);
4927  emit_data((int)imm32, rspec, narrow_oop_operand);
4928}
4929
4930void Assembler::lzcntq(Register dst, Register src) {
4931  assert(VM_Version::supports_lzcnt(), "encoding is treated as BSR");
4932  emit_byte(0xF3);
4933  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
4934  emit_byte(0x0F);
4935  emit_byte(0xBD);
4936  emit_byte(0xC0 | encode);
4937}
4938
4939void Assembler::movdq(XMMRegister dst, Register src) {
4940  // table D-1 says MMX/SSE2
4941  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
4942  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_66);
4943  emit_byte(0x6E);
4944  emit_byte(0xC0 | encode);
4945}
4946
4947void Assembler::movdq(Register dst, XMMRegister src) {
4948  // table D-1 says MMX/SSE2
4949  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
4950  // swap src/dst to get correct prefix
4951  int encode = simd_prefix_and_encode_q(src, dst, VEX_SIMD_66);
4952  emit_byte(0x7E);
4953  emit_byte(0xC0 | encode);
4954}
4955
4956void Assembler::movq(Register dst, Register src) {
4957  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
4958  emit_byte(0x8B);
4959  emit_byte(0xC0 | encode);
4960}
4961
4962void Assembler::movq(Register dst, Address src) {
4963  InstructionMark im(this);
4964  prefixq(src, dst);
4965  emit_byte(0x8B);
4966  emit_operand(dst, src);
4967}
4968
4969void Assembler::movq(Address dst, Register src) {
4970  InstructionMark im(this);
4971  prefixq(dst, src);
4972  emit_byte(0x89);
4973  emit_operand(src, dst);
4974}
4975
4976void Assembler::movsbq(Register dst, Address src) {
4977  InstructionMark im(this);
4978  prefixq(src, dst);
4979  emit_byte(0x0F);
4980  emit_byte(0xBE);
4981  emit_operand(dst, src);
4982}
4983
4984void Assembler::movsbq(Register dst, Register src) {
4985  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
4986  emit_byte(0x0F);
4987  emit_byte(0xBE);
4988  emit_byte(0xC0 | encode);
4989}
4990
4991void Assembler::movslq(Register dst, int32_t imm32) {
4992  // dbx shows movslq(rcx, 3) as movq     $0x0000000049000000,(%rbx)
4993  // and movslq(r8, 3); as movl     $0x0000000048000000,(%rbx)
4994  // as a result we shouldn't use it until it has been tested at runtime...
4995  ShouldNotReachHere();
4996  InstructionMark im(this);
4997  int encode = prefixq_and_encode(dst->encoding());
4998  emit_byte(0xC7 | encode);
4999  emit_long(imm32);
5000}
5001
5002void Assembler::movslq(Address dst, int32_t imm32) {
5003  assert(is_simm32(imm32), "lost bits");
5004  InstructionMark im(this);
5005  prefixq(dst);
5006  emit_byte(0xC7);
5007  emit_operand(rax, dst, 4);
5008  emit_long(imm32);
5009}
5010
5011void Assembler::movslq(Register dst, Address src) {
5012  InstructionMark im(this);
5013  prefixq(src, dst);
5014  emit_byte(0x63);
5015  emit_operand(dst, src);
5016}
5017
5018void Assembler::movslq(Register dst, Register src) {
5019  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
5020  emit_byte(0x63);
5021  emit_byte(0xC0 | encode);
5022}
5023
5024void Assembler::movswq(Register dst, Address src) {
5025  InstructionMark im(this);
5026  prefixq(src, dst);
5027  emit_byte(0x0F);
5028  emit_byte(0xBF);
5029  emit_operand(dst, src);
5030}
5031
5032void Assembler::movswq(Register dst, Register src) {
5033  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
5034  emit_byte(0x0F);
5035  emit_byte(0xBF);
5036  emit_byte(0xC0 | encode);
5037}
5038
5039void Assembler::movzbq(Register dst, Address src) {
5040  InstructionMark im(this);
5041  prefixq(src, dst);
5042  emit_byte(0x0F);
5043  emit_byte(0xB6);
5044  emit_operand(dst, src);
5045}
5046
5047void Assembler::movzbq(Register dst, Register src) {
5048  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
5049  emit_byte(0x0F);
5050  emit_byte(0xB6);
5051  emit_byte(0xC0 | encode);
5052}
5053
5054void Assembler::movzwq(Register dst, Address src) {
5055  InstructionMark im(this);
5056  prefixq(src, dst);
5057  emit_byte(0x0F);
5058  emit_byte(0xB7);
5059  emit_operand(dst, src);
5060}
5061
5062void Assembler::movzwq(Register dst, Register src) {
5063  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
5064  emit_byte(0x0F);
5065  emit_byte(0xB7);
5066  emit_byte(0xC0 | encode);
5067}
5068
5069void Assembler::negq(Register dst) {
5070  int encode = prefixq_and_encode(dst->encoding());
5071  emit_byte(0xF7);
5072  emit_byte(0xD8 | encode);
5073}
5074
5075void Assembler::notq(Register dst) {
5076  int encode = prefixq_and_encode(dst->encoding());
5077  emit_byte(0xF7);
5078  emit_byte(0xD0 | encode);
5079}
5080
5081void Assembler::orq(Address dst, int32_t imm32) {
5082  InstructionMark im(this);
5083  prefixq(dst);
5084  emit_byte(0x81);
5085  emit_operand(rcx, dst, 4);
5086  emit_long(imm32);
5087}
5088
5089void Assembler::orq(Register dst, int32_t imm32) {
5090  (void) prefixq_and_encode(dst->encoding());
5091  emit_arith(0x81, 0xC8, dst, imm32);
5092}
5093
5094void Assembler::orq(Register dst, Address src) {
5095  InstructionMark im(this);
5096  prefixq(src, dst);
5097  emit_byte(0x0B);
5098  emit_operand(dst, src);
5099}
5100
5101void Assembler::orq(Register dst, Register src) {
5102  (void) prefixq_and_encode(dst->encoding(), src->encoding());
5103  emit_arith(0x0B, 0xC0, dst, src);
5104}
5105
5106void Assembler::popa() { // 64bit
5107  movq(r15, Address(rsp, 0));
5108  movq(r14, Address(rsp, wordSize));
5109  movq(r13, Address(rsp, 2 * wordSize));
5110  movq(r12, Address(rsp, 3 * wordSize));
5111  movq(r11, Address(rsp, 4 * wordSize));
5112  movq(r10, Address(rsp, 5 * wordSize));
5113  movq(r9,  Address(rsp, 6 * wordSize));
5114  movq(r8,  Address(rsp, 7 * wordSize));
5115  movq(rdi, Address(rsp, 8 * wordSize));
5116  movq(rsi, Address(rsp, 9 * wordSize));
5117  movq(rbp, Address(rsp, 10 * wordSize));
5118  // skip rsp
5119  movq(rbx, Address(rsp, 12 * wordSize));
5120  movq(rdx, Address(rsp, 13 * wordSize));
5121  movq(rcx, Address(rsp, 14 * wordSize));
5122  movq(rax, Address(rsp, 15 * wordSize));
5123
5124  addq(rsp, 16 * wordSize);
5125}
5126
5127void Assembler::popcntq(Register dst, Address src) {
5128  assert(VM_Version::supports_popcnt(), "must support");
5129  InstructionMark im(this);
5130  emit_byte(0xF3);
5131  prefixq(src, dst);
5132  emit_byte(0x0F);
5133  emit_byte(0xB8);
5134  emit_operand(dst, src);
5135}
5136
5137void Assembler::popcntq(Register dst, Register src) {
5138  assert(VM_Version::supports_popcnt(), "must support");
5139  emit_byte(0xF3);
5140  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
5141  emit_byte(0x0F);
5142  emit_byte(0xB8);
5143  emit_byte(0xC0 | encode);
5144}
5145
void Assembler::popq(Address dst) {
  InstructionMark im(this);
  prefixq(dst);
  emit_byte(0x8F);
  emit_operand(rax, dst);
}

void Assembler::pusha() { // 64bit
  // we have to store original rsp.  ABI says that 128 bytes
  // below rsp are local scratch.
  movq(Address(rsp, -5 * wordSize), rsp);

  subq(rsp, 16 * wordSize);

  movq(Address(rsp, 15 * wordSize), rax);
  movq(Address(rsp, 14 * wordSize), rcx);
  movq(Address(rsp, 13 * wordSize), rdx);
  movq(Address(rsp, 12 * wordSize), rbx);
  // skip rsp
  movq(Address(rsp, 10 * wordSize), rbp);
  movq(Address(rsp, 9 * wordSize), rsi);
  movq(Address(rsp, 8 * wordSize), rdi);
  movq(Address(rsp, 7 * wordSize), r8);
  movq(Address(rsp, 6 * wordSize), r9);
  movq(Address(rsp, 5 * wordSize), r10);
  movq(Address(rsp, 4 * wordSize), r11);
  movq(Address(rsp, 3 * wordSize), r12);
  movq(Address(rsp, 2 * wordSize), r13);
  movq(Address(rsp, wordSize), r14);
  movq(Address(rsp, 0), r15);
}
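
// Frame sketch for pusha()/popa() above (illustration; word offsets are
// relative to rsp after the subq):
//   [rsp +  0] r15  ...  [rsp + 7] r8  [rsp + 8] rdi  ...  [rsp + 10] rbp
//   [rsp + 11] original rsp (written into the red zone before the subq)
//   [rsp + 12] rbx  [rsp + 13] rdx  [rsp + 14] rcx  [rsp + 15] rax
// debug64() below indexes its regs[] argument with exactly this layout.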

void Assembler::pushq(Address src) {
  InstructionMark im(this);
  prefixq(src);
  emit_byte(0xFF);
  emit_operand(rsi, src);
}

void Assembler::rclq(Register dst, int imm8) {
  assert(isShiftCount(imm8 >> 1), "illegal shift count");
  int encode = prefixq_and_encode(dst->encoding());
  if (imm8 == 1) {
    emit_byte(0xD1);
    emit_byte(0xD0 | encode);
  } else {
    emit_byte(0xC1);
    emit_byte(0xD0 | encode);
    emit_byte(imm8);
  }
}

void Assembler::sarq(Register dst, int imm8) {
  assert(isShiftCount(imm8 >> 1), "illegal shift count");
  int encode = prefixq_and_encode(dst->encoding());
  if (imm8 == 1) {
    emit_byte(0xD1);
    emit_byte(0xF8 | encode);
  } else {
    emit_byte(0xC1);
    emit_byte(0xF8 | encode);
    emit_byte(imm8);
  }
}

void Assembler::sarq(Register dst) {
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xD3);
  emit_byte(0xF8 | encode);
}

void Assembler::sbbq(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefixq(dst);
  emit_arith_operand(0x81, rbx, dst, imm32);
}

void Assembler::sbbq(Register dst, int32_t imm32) {
  (void) prefixq_and_encode(dst->encoding());
  emit_arith(0x81, 0xD8, dst, imm32);
}

void Assembler::sbbq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x1B);
  emit_operand(dst, src);
}

void Assembler::sbbq(Register dst, Register src) {
  (void) prefixq_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x1B, 0xC0, dst, src);
}

void Assembler::shlq(Register dst, int imm8) {
  assert(isShiftCount(imm8 >> 1), "illegal shift count");
  int encode = prefixq_and_encode(dst->encoding());
  if (imm8 == 1) {
    emit_byte(0xD1);
    emit_byte(0xE0 | encode);
  } else {
    emit_byte(0xC1);
    emit_byte(0xE0 | encode);
    emit_byte(imm8);
  }
}

void Assembler::shlq(Register dst) {
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xD3);
  emit_byte(0xE0 | encode);
}

void Assembler::shrq(Register dst, int imm8) {
  assert(isShiftCount(imm8 >> 1), "illegal shift count");
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xC1);
  emit_byte(0xE8 | encode);
  emit_byte(imm8);
}

void Assembler::shrq(Register dst) {
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xD3);
  emit_byte(0xE8 | encode);
}

void Assembler::subq(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefixq(dst);
  emit_arith_operand(0x81, rbp, dst, imm32);
}

void Assembler::subq(Address dst, Register src) {
  InstructionMark im(this);
  prefixq(dst, src);
  emit_byte(0x29);
  emit_operand(src, dst);
}

void Assembler::subq(Register dst, int32_t imm32) {
  (void) prefixq_and_encode(dst->encoding());
  emit_arith(0x81, 0xE8, dst, imm32);
}

// Force generation of a 4-byte immediate value even if it fits into 8 bits
void Assembler::subq_imm32(Register dst, int32_t imm32) {
  (void) prefixq_and_encode(dst->encoding());
  emit_arith_imm32(0x81, 0xE8, dst, imm32);
}

void Assembler::subq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x2B);
  emit_operand(dst, src);
}

void Assembler::subq(Register dst, Register src) {
  (void) prefixq_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x2B, 0xC0, dst, src);
}

void Assembler::testq(Register dst, int32_t imm32) {
  // not using emit_arith because test
  // doesn't support sign-extension of
  // 8-bit operands
  int encode = dst->encoding();
  if (encode == 0) {
    prefix(REX_W);
    emit_byte(0xA9);
  } else {
    encode = prefixq_and_encode(encode);
    emit_byte(0xF7);
    emit_byte(0xC0 | encode);
  }
  emit_long(imm32);
}

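// Encoding sketch for testq(Register, int32_t) above (illustration): TEST has
// no sign-extended imm8 form, so a full 4-byte immediate always follows, and
// rax alone gets the short opcode:
//   testq(rax, imm32)  =>  48 A9 imm32
//   testq(rcx, imm32)  =>  48 F7 C1 imm32
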
void Assembler::testq(Register dst, Register src) {
  (void) prefixq_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x85, 0xC0, dst, src);
}

void Assembler::xaddq(Address dst, Register src) {
  InstructionMark im(this);
  prefixq(dst, src);
  emit_byte(0x0F);
  emit_byte(0xC1);
  emit_operand(src, dst);
}

void Assembler::xchgq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x87);
  emit_operand(dst, src);
}

void Assembler::xchgq(Register dst, Register src) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x87);
  emit_byte(0xC0 | encode);
}

void Assembler::xorq(Register dst, Register src) {
  (void) prefixq_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x33, 0xC0, dst, src);
}

void Assembler::xorq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x33);
  emit_operand(dst, src);
}

#endif // _LP64

static Assembler::Condition reverse[] = {
    Assembler::noOverflow     /* overflow      = 0x0 */ ,
    Assembler::overflow       /* noOverflow    = 0x1 */ ,
    Assembler::aboveEqual     /* carrySet      = 0x2, below         = 0x2 */ ,
    Assembler::below          /* aboveEqual    = 0x3, carryClear    = 0x3 */ ,
    Assembler::notZero        /* zero          = 0x4, equal         = 0x4 */ ,
    Assembler::zero           /* notZero       = 0x5, notEqual      = 0x5 */ ,
    Assembler::above          /* belowEqual    = 0x6 */ ,
    Assembler::belowEqual     /* above         = 0x7 */ ,
    Assembler::positive       /* negative      = 0x8 */ ,
    Assembler::negative       /* positive      = 0x9 */ ,
    Assembler::noParity       /* parity        = 0xa */ ,
    Assembler::parity         /* noParity      = 0xb */ ,
    Assembler::greaterEqual   /* less          = 0xc */ ,
    Assembler::less           /* greaterEqual  = 0xd */ ,
    Assembler::greater        /* lessEqual     = 0xe */ ,
    Assembler::lessEqual      /* greater       = 0xf */
};
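
// Usage sketch (illustration): reverse[] negates a condition by flipping the
// low bit of its x86 condition code, e.g.
//   reverse[Assembler::less] == Assembler::greaterEqual   // 0xc -> 0xd
//   reverse[Assembler::zero] == Assembler::notZero        // 0x4 -> 0x5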

// Implementation of MacroAssembler

// First, all the versions that differ between 32-bit and 64-bit,
// unless the difference is trivial (a line or so).

#ifndef _LP64

// 32bit versions

Address MacroAssembler::as_Address(AddressLiteral adr) {
  return Address(adr.target(), adr.rspec());
}

Address MacroAssembler::as_Address(ArrayAddress adr) {
  return Address::make_array(adr);
}

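// For reference while reading biased_locking_enter() below, this is the
// 32-bit mark word layout the code relies on (sketch, per markOop.hpp):
//   unlocked object:  hash:25 | age:4 | biased_lock:1 | lock:2
//   biased object:    JavaThread*:23 | epoch:2 | age:4 | 1 | 01
// biased_lock_pattern is the low three bits being 101 (0x5).
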
int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert(swap_reg == rax, "swap_reg must be rax for cmpxchg");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  bool need_tmp_reg = false;
  if (tmp_reg == noreg) {
    need_tmp_reg = true;
    tmp_reg = lock_reg;
  } else {
    assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
  }
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    movl(swap_reg, mark_addr);
  }
  if (need_tmp_reg) {
    push(tmp_reg);
  }
  movl(tmp_reg, swap_reg);
  andl(tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cmpl(tmp_reg, markOopDesc::biased_lock_pattern);
  if (need_tmp_reg) {
    pop(tmp_reg);
  }
  jcc(Assembler::notEqual, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  // Note that because there is no current thread register on x86 we
  // need to store off the mark word we read out of the object to
  // avoid reloading it and needing to recheck invariants below. This
  // store is unfortunate but it makes the overall code shorter and
  // simpler.
  movl(saved_mark_addr, swap_reg);
  if (need_tmp_reg) {
    push(tmp_reg);
  }
  get_thread(tmp_reg);
  xorl(swap_reg, tmp_reg);
  if (swap_reg_contains_mark) {
    null_check_offset = offset();
  }
  movl(tmp_reg, klass_addr);
  xorl(swap_reg, Address(tmp_reg, Klass::prototype_header_offset()));
  andl(swap_reg, ~((int) markOopDesc::age_mask_in_place));
  if (need_tmp_reg) {
    pop(tmp_reg);
  }
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address)counters->biased_lock_entry_count_addr()));
  }
  jcc(Assembler::equal, done);

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  testl(swap_reg, markOopDesc::biased_lock_mask_in_place);
  jcc(Assembler::notZero, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  testl(swap_reg, markOopDesc::epoch_mask_in_place);
  jcc(Assembler::notZero, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go into the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  movl(swap_reg, saved_mark_addr);
  andl(swap_reg,
       markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
  if (need_tmp_reg) {
    push(tmp_reg);
  }
  get_thread(tmp_reg);
  orl(tmp_reg, swap_reg);
  if (os::is_MP()) {
    lock();
  }
  cmpxchgptr(tmp_reg, Address(obj_reg, 0));
  if (need_tmp_reg) {
    pop(tmp_reg);
  }
  // If the biasing toward our thread failed, this means that
  // another thread succeeded in biasing it toward itself and we
  // need to revoke that bias. The revocation will occur in the
  // interpreter runtime in the slow case.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address)counters->anonymously_biased_lock_entry_count_addr()));
  }
  if (slow_case != NULL) {
    jcc(Assembler::notZero, *slow_case);
  }
  jmp(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  if (need_tmp_reg) {
    push(tmp_reg);
  }
  get_thread(tmp_reg);
  movl(swap_reg, klass_addr);
  orl(tmp_reg, Address(swap_reg, Klass::prototype_header_offset()));
  movl(swap_reg, saved_mark_addr);
  if (os::is_MP()) {
    lock();
  }
  cmpxchgptr(tmp_reg, Address(obj_reg, 0));
  if (need_tmp_reg) {
    pop(tmp_reg);
  }
  // If the biasing toward our thread failed, then another thread
  // succeeded in biasing it toward itself and we need to revoke that
  // bias. The revocation will occur in the runtime in the slow case.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address)counters->rebiased_lock_entry_count_addr()));
  }
  if (slow_case != NULL) {
    jcc(Assembler::notZero, *slow_case);
  }
  jmp(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  movl(swap_reg, saved_mark_addr);
  if (need_tmp_reg) {
    push(tmp_reg);
  }
  movl(tmp_reg, klass_addr);
  movl(tmp_reg, Address(tmp_reg, Klass::prototype_header_offset()));
  if (os::is_MP()) {
    lock();
  }
  cmpxchgptr(tmp_reg, Address(obj_reg, 0));
  if (need_tmp_reg) {
    pop(tmp_reg);
  }
  // Fall through to the normal CAS-based lock, because no matter what
  // the result of the above CAS, some thread must have succeeded in
  // removing the bias bit from the object's header.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address)counters->revoked_lock_entry_count_addr()));
  }

  bind(cas_label);

  return null_check_offset;
}

void MacroAssembler::call_VM_leaf_base(address entry_point,
                                       int number_of_arguments) {
  call(RuntimeAddress(entry_point));
  increment(rsp, number_of_arguments * wordSize);
}

void MacroAssembler::cmpklass(Address src1, Metadata* obj) {
  cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::cmpklass(Register src1, Metadata* obj) {
  cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::cmpoop(Address src1, jobject obj) {
  cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::cmpoop(Register src1, jobject obj) {
  cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::extend_sign(Register hi, Register lo) {
  // According to Intel Doc. AP-526, "Integer Divide", p.18.
  if (VM_Version::is_P6() && hi == rdx && lo == rax) {
    cdql();
  } else {
    movl(hi, lo);
    sarl(hi, 31);
  }
}
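
// Worked example for extend_sign() (illustration): with lo = 0x80000000 the
// pair must become hi:lo = 0xFFFFFFFF:0x80000000. cdql produces that directly
// when the operands happen to be rdx:rax; otherwise the mov/sar pair copies
// lo into hi and replicates its sign bit across all 32 bits.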

void MacroAssembler::jC2(Register tmp, Label& L) {
  // set parity bit if FPU flag C2 is set (via rax)
  save_rax(tmp);
  fwait(); fnstsw_ax();
  sahf();
  restore_rax(tmp);
  // branch
  jcc(Assembler::parity, L);
}

void MacroAssembler::jnC2(Register tmp, Label& L) {
  // set parity bit if FPU flag C2 is set (via rax)
  save_rax(tmp);
  fwait(); fnstsw_ax();
  sahf();
  restore_rax(tmp);
  // branch
  jcc(Assembler::noParity, L);
}

// 32bit can do a case table jump in one instruction but we no longer allow the base
// to be installed in the Address class
void MacroAssembler::jump(ArrayAddress entry) {
  jmp(as_Address(entry));
}

// Note: y_lo will be destroyed
void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
  // Long compare for Java (semantics as described in JVM spec.)
  Label high, low, done;

  cmpl(x_hi, y_hi);
  jcc(Assembler::less, low);
  jcc(Assembler::greater, high);
  // x_hi is the return register
  xorl(x_hi, x_hi);
  cmpl(x_lo, y_lo);
  jcc(Assembler::below, low);
  jcc(Assembler::equal, done);

  bind(high);
  xorl(x_hi, x_hi);
  increment(x_hi);
  jmp(done);

  bind(low);
  xorl(x_hi, x_hi);
  decrementl(x_hi);

  bind(done);
}
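
// Worked example for lcmp2int() (illustration), which leaves -1, 0 or 1 in
// x_hi: for x = 0x00000001_00000000 and y = 0x00000000_FFFFFFFF the high
// words differ (1 > 0), so the high path returns 1; comparing only the low
// words (0 vs 0xFFFFFFFF, unsigned) would have produced the wrong sign,
// which is why the unsigned low-word compare runs only when the highs match.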

void MacroAssembler::lea(Register dst, AddressLiteral src) {
  mov_literal32(dst, (int32_t)src.target(), src.rspec());
}

void MacroAssembler::lea(Address dst, AddressLiteral adr) {
  // leal(dst, as_Address(adr));
  // see note in movl as to why we must use a move
  mov_literal32(dst, (int32_t) adr.target(), adr.rspec());
}

void MacroAssembler::leave() {
  mov(rsp, rbp);
  pop(rbp);
}

void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
  // Multiplication of two Java long values stored on the stack
  // as illustrated below. Result is in rdx:rax.
  //
  // rsp ---> [  ??  ] \               \
  //            ....    | y_rsp_offset  |
  //          [ y_lo ] /  (in bytes)    | x_rsp_offset
  //          [ y_hi ]                  | (in bytes)
  //            ....                    |
  //          [ x_lo ]                 /
  //          [ x_hi ]
  //            ....
  //
  // Basic idea: lo(result) = lo(x_lo * y_lo)
  //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
  Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
  Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
  Label quick;
  // load x_hi, y_hi and check if quick
  // multiplication is possible
  movl(rbx, x_hi);
  movl(rcx, y_hi);
  movl(rax, rbx);
  orl(rbx, rcx);                                 // rbx = 0 <=> x_hi = 0 and y_hi = 0
  jcc(Assembler::zero, quick);                   // if rbx = 0 do quick multiply
  // do full multiplication
  // 1st step
  mull(y_lo);                                    // x_hi * y_lo
  movl(rbx, rax);                                // save lo(x_hi * y_lo) in rbx
  // 2nd step
  movl(rax, x_lo);
  mull(rcx);                                     // x_lo * y_hi
  addl(rbx, rax);                                // add lo(x_lo * y_hi) to rbx
  // 3rd step
  bind(quick);                                   // note: rbx = 0 if quick multiply!
  movl(rax, x_lo);
  mull(y_lo);                                    // x_lo * y_lo
  addl(rdx, rbx);                                // correct hi(x_lo * y_lo)
}
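
// Worked example for lmul() (illustration): x = 0x00000001_00000002 and
// y = 0x00000003_00000004 give the full product 3*2^64 + 10*2^32 + 8; the
// 2^64 term falls away, so the 64-bit result is hi:lo = 10:8. The code gets
// there via x_lo*y_lo = 0:8, then adds lo(x_hi*y_lo) = 4 and
// lo(x_lo*y_hi) = 6 into the high word, leaving rdx:rax = 10:8.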

void MacroAssembler::lneg(Register hi, Register lo) {
  negl(lo);
  adcl(hi, 0);
  negl(hi);
}

void MacroAssembler::lshl(Register hi, Register lo) {
  // Java shift left long support (semantics as described in JVM spec., p.305)
  // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
  // shift value is in rcx !
  assert(hi != rcx, "must not use rcx");
  assert(lo != rcx, "must not use rcx");
  const Register s = rcx;                        // shift count
  const int      n = BitsPerWord;
  Label L;
  andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
  cmpl(s, n);                                    // if (s < n)
  jcc(Assembler::less, L);                       // else (s >= n)
  movl(hi, lo);                                  // x := x << n
  xorl(lo, lo);
  // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
  bind(L);                                       // s (mod n) < n
  shldl(hi, lo);                                 // x := x << s
  shll(lo);
}
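
// Worked example for lshl() (illustration): for a shift count of 40 the code
// first moves lo into hi and zeroes lo (the "x << 32" step), then shld/shl
// shift by 40 mod 32 = 8, so the pair ends up shifted left by 40 with no
// second branch.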

void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
  // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
  // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
  assert(hi != rcx, "must not use rcx");
  assert(lo != rcx, "must not use rcx");
  const Register s = rcx;                        // shift count
  const int      n = BitsPerWord;
  Label L;
  andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
  cmpl(s, n);                                    // if (s < n)
  jcc(Assembler::less, L);                       // else (s >= n)
  movl(lo, hi);                                  // x := x >> n
  if (sign_extension) sarl(hi, 31);
  else                xorl(hi, hi);
  // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
  bind(L);                                       // s (mod n) < n
  shrdl(lo, hi);                                 // x := x >> s
  if (sign_extension) sarl(hi);
  else                shrl(hi);
}

void MacroAssembler::movoop(Register dst, jobject obj) {
  mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::movoop(Address dst, jobject obj) {
  mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
  mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
  mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::movptr(Register dst, AddressLiteral src) {
  if (src.is_lval()) {
    mov_literal32(dst, (intptr_t)src.target(), src.rspec());
  } else {
    movl(dst, as_Address(src));
  }
}

void MacroAssembler::movptr(ArrayAddress dst, Register src) {
  movl(as_Address(dst), src);
}

void MacroAssembler::movptr(Register dst, ArrayAddress src) {
  movl(dst, as_Address(src));
}

// src should NEVER be a real pointer. Use AddressLiteral for true pointers
void MacroAssembler::movptr(Address dst, intptr_t src) {
  movl(dst, src);
}

void MacroAssembler::pop_callee_saved_registers() {
  pop(rcx);
  pop(rdx);
  pop(rdi);
  pop(rsi);
}

void MacroAssembler::pop_fTOS() {
  fld_d(Address(rsp, 0));
  addl(rsp, 2 * wordSize);
}

void MacroAssembler::push_callee_saved_registers() {
  push(rsi);
  push(rdi);
  push(rdx);
  push(rcx);
}

void MacroAssembler::push_fTOS() {
  subl(rsp, 2 * wordSize);
  fstp_d(Address(rsp, 0));
}

void MacroAssembler::pushoop(jobject obj) {
  push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::pushklass(Metadata* obj) {
  push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::pushptr(AddressLiteral src) {
  if (src.is_lval()) {
    push_literal32((int32_t)src.target(), src.rspec());
  } else {
    pushl(as_Address(src));
  }
}

void MacroAssembler::set_word_if_not_zero(Register dst) {
  xorl(dst, dst);
  set_byte_if_not_zero(dst);
}

static void pass_arg0(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

#ifndef PRODUCT
extern "C" void findpc(intptr_t x);
#endif

void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
  // In order to get locks to work, we need to fake an in_VM state
  JavaThread* thread = JavaThread::current();
  JavaThreadState saved_state = thread->thread_state();
  thread->set_thread_state(_thread_in_vm);
  if (ShowMessageBoxOnError) {
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
    // To see where a verify_oop failed, get $ebx+40/X for this frame.
    // This is the value of eip which points to where verify_oop will return.
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip);
      BREAKPOINT;
    }
  } else {
    ttyLocker ttyl;
    ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", msg);
  }
  // Don't assert holding the ttyLock
  assert(false, err_msg("DEBUG MESSAGE: %s", msg));
  ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
}

void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) {
  ttyLocker ttyl;
  FlagSetting fs(Debugging, true);
  tty->print_cr("eip = 0x%08x", eip);
#ifndef PRODUCT
  if ((WizardMode || Verbose) && PrintMiscellaneous) {
    tty->cr();
    findpc(eip);
    tty->cr();
  }
#endif
#define PRINT_REG(rax) \
  { tty->print("%s = ", #rax); os::print_location(tty, rax); }
  PRINT_REG(rax);
  PRINT_REG(rbx);
  PRINT_REG(rcx);
  PRINT_REG(rdx);
  PRINT_REG(rdi);
  PRINT_REG(rsi);
  PRINT_REG(rbp);
  PRINT_REG(rsp);
#undef PRINT_REG
  // Print some words near the top of the stack.
  int* dump_sp = (int*) rsp;
  for (int col1 = 0; col1 < 8; col1++) {
    tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
    os::print_location(tty, *dump_sp++);
  }
  for (int row = 0; row < 16; row++) {
    tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
    for (int col = 0; col < 8; col++) {
      tty->print(" 0x%08x", *dump_sp++);
    }
    tty->cr();
  }
  // Print some instructions around pc:
  Disassembler::decode((address)eip-64, (address)eip);
  tty->print_cr("--------");
  Disassembler::decode((address)eip, (address)eip+32);
}

void MacroAssembler::stop(const char* msg) {
  ExternalAddress message((address)msg);
  // push address of message
  pushptr(message.addr());
  { Label L; call(L, relocInfo::none); bind(L); }     // push eip
  pusha();                                            // push registers
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
  hlt();
}

void MacroAssembler::warn(const char* msg) {
  push_CPU_state();

  ExternalAddress message((address) msg);
  // push address of message
  pushptr(message.addr());

  call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
  addl(rsp, wordSize);       // discard argument
  pop_CPU_state();
}

void MacroAssembler::print_state() {
  { Label L; call(L, relocInfo::none); bind(L); }     // push eip
  pusha();                                            // push registers

  push_CPU_state();
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32)));
  pop_CPU_state();

  popa();
  addl(rsp, wordSize);
}

#else // _LP64

// 64 bit versions

Address MacroAssembler::as_Address(AddressLiteral adr) {
  // amd64 always does this as a pc-rel;
  // we can be absolute or disp based on the instruction type:
  // jmp/call are displacements, others are absolute
  assert(!adr.is_lval(), "must be rval");
  assert(reachable(adr), "must be");
  return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc());
}

Address MacroAssembler::as_Address(ArrayAddress adr) {
  AddressLiteral base = adr.base();
  lea(rscratch1, base);
  Address index = adr.index();
  assert(index._disp == 0, "must not have disp"); // maybe it can?
  Address array(rscratch1, index._index, index._scale, index._disp);
  return array;
}

int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
  assert(tmp_reg != noreg, "tmp_reg must be supplied");
  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    movq(swap_reg, mark_addr);
  }
  movq(tmp_reg, swap_reg);
  andq(tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cmpq(tmp_reg, markOopDesc::biased_lock_pattern);
  jcc(Assembler::notEqual, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  load_prototype_header(tmp_reg, obj_reg);
  orq(tmp_reg, r15_thread);
  xorq(tmp_reg, swap_reg);
  andq(tmp_reg, ~((int) markOopDesc::age_mask_in_place));
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->biased_lock_entry_count_addr()));
  }
  jcc(Assembler::equal, done);

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  testq(tmp_reg, markOopDesc::biased_lock_mask_in_place);
  jcc(Assembler::notZero, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  testq(tmp_reg, markOopDesc::epoch_mask_in_place);
  jcc(Assembler::notZero, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go into the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  andq(swap_reg,
       markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
  movq(tmp_reg, swap_reg);
  orq(tmp_reg, r15_thread);
  if (os::is_MP()) {
    lock();
  }
  cmpxchgq(tmp_reg, Address(obj_reg, 0));
  // If the biasing toward our thread failed, this means that
  // another thread succeeded in biasing it toward itself and we
  // need to revoke that bias. The revocation will occur in the
  // interpreter runtime in the slow case.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
  }
  if (slow_case != NULL) {
    jcc(Assembler::notZero, *slow_case);
  }
  jmp(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  load_prototype_header(tmp_reg, obj_reg);
  orq(tmp_reg, r15_thread);
  if (os::is_MP()) {
    lock();
  }
  cmpxchgq(tmp_reg, Address(obj_reg, 0));
  // If the biasing toward our thread failed, then another thread
  // succeeded in biasing it toward itself and we need to revoke that
  // bias. The revocation will occur in the runtime in the slow case.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
  }
  if (slow_case != NULL) {
    jcc(Assembler::notZero, *slow_case);
  }
  jmp(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  load_prototype_header(tmp_reg, obj_reg);
  if (os::is_MP()) {
    lock();
  }
  cmpxchgq(tmp_reg, Address(obj_reg, 0));
  // Fall through to the normal CAS-based lock, because no matter what
  // the result of the above CAS, some thread must have succeeded in
  // removing the bias bit from the object's header.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
  }

  bind(cas_label);

  return null_check_offset;
}

void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
  Label L, E;

#ifdef _WIN64
  // Windows always allocates space for its register args
  assert(num_args <= 4, "only register arguments supported");
  subq(rsp, frame::arg_reg_save_area_bytes);
#endif

  // Align stack if necessary
  testl(rsp, 15);
  jcc(Assembler::zero, L);

  subq(rsp, 8);
  {
    call(RuntimeAddress(entry_point));
  }
  addq(rsp, 8);
  jmp(E);

  bind(L);
  {
    call(RuntimeAddress(entry_point));
  }

  bind(E);

#ifdef _WIN64
  // restore stack pointer
  addq(rsp, frame::arg_reg_save_area_bytes);
#endif
}
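
// Alignment sketch for call_VM_leaf_base() above (illustration): on entry
// rsp is either 16-byte aligned or misaligned by exactly one pushed word, so
// testl(rsp, 15) only ever observes 0 or 8 and a single subq(rsp, 8) is
// enough to realign before the call.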

void MacroAssembler::cmp64(Register src1, AddressLiteral src2) {
  assert(!src2.is_lval(), "should use cmpptr");

  if (reachable(src2)) {
    cmpq(src1, as_Address(src2));
  } else {
    lea(rscratch1, src2);
    Assembler::cmpq(src1, Address(rscratch1, 0));
  }
}

int MacroAssembler::corrected_idivq(Register reg) {
  // Full implementation of Java ldiv and lrem; checks for special
  // case as described in JVM spec., p.243 & p.271.  The function
  // returns the (pc) offset of the idivq instruction - may be needed
  // for implicit exceptions.
  //
  //         normal case                           special case
  //
  // input : rax: dividend                         min_long
  //         reg: divisor   (may not be rax/rdx)   -1
  //
  // output: rax: quotient  (= rax idiv reg)       min_long
  //         rdx: remainder (= rax irem reg)       0
  assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
  static const int64_t min_long = 0x8000000000000000;
  Label normal_case, special_case;

  // check for special case
  cmp64(rax, ExternalAddress((address) &min_long));
  jcc(Assembler::notEqual, normal_case);
  xorl(rdx, rdx); // prepare rdx for possible special case (where
                  // remainder = 0)
  cmpq(reg, -1);
  jcc(Assembler::equal, special_case);

  // handle normal case
  bind(normal_case);
  cdqq();
  int idivq_offset = offset();
  idivq(reg);

  // normal and special case exit
  bind(special_case);

  return idivq_offset;
}
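
// Why corrected_idivq() needs the special case (illustration): idivq raises
// a divide error when the quotient overflows, and min_long / -1 = 2^63 does
// not fit in a signed 64-bit register. Java instead defines
//   min_long / -1 == min_long   and   min_long % -1 == 0,
// which is exactly what branching to special_case with rax untouched and
// rdx zeroed produces.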

void MacroAssembler::decrementq(Register reg, int value) {
  if (value == min_jint) { subq(reg, value); return; }
  if (value <  0) { incrementq(reg, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { decq(reg) ; return; }
  /* else */      { subq(reg, value)       ; return; }
}

void MacroAssembler::decrementq(Address dst, int value) {
  if (value == min_jint) { subq(dst, value); return; }
  if (value <  0) { incrementq(dst, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { decq(dst) ; return; }
  /* else */      { subq(dst, value)       ; return; }
}

void MacroAssembler::incrementq(Register reg, int value) {
  if (value == min_jint) { addq(reg, value); return; }
  if (value <  0) { decrementq(reg, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { incq(reg) ; return; }
  /* else */      { addq(reg, value)       ; return; }
}

void MacroAssembler::incrementq(Address dst, int value) {
  if (value == min_jint) { addq(dst, value); return; }
  if (value <  0) { decrementq(dst, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { incq(dst) ; return; }
  /* else */      { addq(dst, value)       ; return; }
}

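// Dispatch sketch for the incrementq()/decrementq() family above
// (illustration):
//   incrementq(reg, -3)  ->  decrementq(reg, 3)  ->  subq(reg, 3)
//   incrementq(reg,  1)  ->  incq(reg) when UseIncDec, else addq(reg, 1)
// min_jint is checked first because negating it would overflow an int.
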
// 32bit can do a case table jump in one instruction but we no longer allow the base
// to be installed in the Address class
void MacroAssembler::jump(ArrayAddress entry) {
  lea(rscratch1, entry.base());
  Address dispatch = entry.index();
  assert(dispatch._base == noreg, "must be");
  dispatch._base = rscratch1;
  jmp(dispatch);
}

void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
  ShouldNotReachHere(); // 64bit doesn't use two regs
  cmpq(x_lo, y_lo);
}

void MacroAssembler::lea(Register dst, AddressLiteral src) {
  mov_literal64(dst, (intptr_t)src.target(), src.rspec());
}

void MacroAssembler::lea(Address dst, AddressLiteral adr) {
  mov_literal64(rscratch1, (intptr_t)adr.target(), adr.rspec());
  movptr(dst, rscratch1);
}

void MacroAssembler::leave() {
  // %%% is this really better? Why not on 32bit too?
  emit_byte(0xC9); // LEAVE
}

void MacroAssembler::lneg(Register hi, Register lo) {
  ShouldNotReachHere(); // 64bit doesn't use two regs
  negq(lo);
}

void MacroAssembler::movoop(Register dst, jobject obj) {
  mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::movoop(Address dst, jobject obj) {
  mov_literal64(rscratch1, (intptr_t)obj, oop_Relocation::spec_for_immediate());
  movq(dst, rscratch1);
}

void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
  mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
  mov_literal64(rscratch1, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
  movq(dst, rscratch1);
}

void MacroAssembler::movptr(Register dst, AddressLiteral src) {
  if (src.is_lval()) {
    mov_literal64(dst, (intptr_t)src.target(), src.rspec());
  } else {
    if (reachable(src)) {
      movq(dst, as_Address(src));
    } else {
      lea(rscratch1, src);
      movq(dst, Address(rscratch1, 0));
    }
  }
}

void MacroAssembler::movptr(ArrayAddress dst, Register src) {
  movq(as_Address(dst), src);
}

void MacroAssembler::movptr(Register dst, ArrayAddress src) {
  movq(dst, as_Address(src));
}

// src should NEVER be a real pointer. Use AddressLiteral for true pointers
void MacroAssembler::movptr(Address dst, intptr_t src) {
  mov64(rscratch1, src);
  movq(dst, rscratch1);
}

// These are mostly for initializing NULL
void MacroAssembler::movptr(Address dst, int32_t src) {
  movslq(dst, src);
}

void MacroAssembler::movptr(Register dst, int32_t src) {
  mov64(dst, (intptr_t)src);
}

void MacroAssembler::pushoop(jobject obj) {
  movoop(rscratch1, obj);
  push(rscratch1);
}

void MacroAssembler::pushklass(Metadata* obj) {
  mov_metadata(rscratch1, obj);
  push(rscratch1);
}

void MacroAssembler::pushptr(AddressLiteral src) {
  lea(rscratch1, src);
  if (src.is_lval()) {
    push(rscratch1);
  } else {
    pushq(Address(rscratch1, 0));
  }
}

void MacroAssembler::reset_last_Java_frame(bool clear_fp,
                                           bool clear_pc) {
  // we must set sp to zero to clear frame
  movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
  }

  if (clear_pc) {
    movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
  }
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc) {
  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = rsp;
  }

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()),
           last_java_fp);
  }

  // last_java_pc is optional
  if (last_java_pc != NULL) {
    Address java_pc(r15_thread,
                    JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
    lea(rscratch1, InternalAddress(last_java_pc));
    movptr(java_pc, rscratch1);
  }

  movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
}

static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg) {
    masm->mov(c_rarg3, arg);
  }
}

void MacroAssembler::stop(const char* msg) {
  address rip = pc();
  pusha(); // get regs on stack
  lea(c_rarg0, ExternalAddress((address) msg));
  lea(c_rarg1, InternalAddress(rip));
  movq(c_rarg2, rsp); // pass pointer to regs array
  andq(rsp, -16); // align stack as required by ABI
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
  hlt();
}

void MacroAssembler::warn(const char* msg) {
  push(rbp);
  movq(rbp, rsp);
  andq(rsp, -16);     // align stack as required by push_CPU_state and call
  push_CPU_state();   // keeps alignment at 16 bytes
  lea(c_rarg0, ExternalAddress((address) msg));
  call_VM_leaf(CAST_FROM_FN_PTR(address, warning), c_rarg0);
  pop_CPU_state();
  mov(rsp, rbp);
  pop(rbp);
}

void MacroAssembler::print_state() {
  address rip = pc();
  pusha();            // get regs on stack
  push(rbp);
  movq(rbp, rsp);
  andq(rsp, -16);     // align stack as required by push_CPU_state and call
  push_CPU_state();   // keeps alignment at 16 bytes

  lea(c_rarg0, InternalAddress(rip));
  lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
  call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);

  pop_CPU_state();
  mov(rsp, rbp);
  pop(rbp);
  popa();
}

#ifndef PRODUCT
extern "C" void findpc(intptr_t x);
#endif

void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
  // In order to get locks to work, we need to fake an in_VM state
  if (ShowMessageBoxOnError) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
#ifndef PRODUCT
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
#endif
    // To see where a verify_oop failed, get $ebx+40/X for this frame.
    // XXX correct this offset for amd64
    // This is the value of eip which points to where verify_oop will return.
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      print_state64(pc, regs);
      BREAKPOINT;
      assert(false, "start up GDB");
    }
    ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
  } else {
    ttyLocker ttyl;
    ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
                    msg);
    assert(false, err_msg("DEBUG MESSAGE: %s", msg));
  }
}

void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
  ttyLocker ttyl;
  FlagSetting fs(Debugging, true);
  tty->print_cr("rip = 0x%016lx", pc);
#ifndef PRODUCT
  tty->cr();
  findpc(pc);
  tty->cr();
#endif
#define PRINT_REG(rax, value) \
  { tty->print("%s = ", #rax); os::print_location(tty, value); }
  PRINT_REG(rax, regs[15]);
  PRINT_REG(rbx, regs[12]);
  PRINT_REG(rcx, regs[14]);
  PRINT_REG(rdx, regs[13]);
  PRINT_REG(rdi, regs[8]);
  PRINT_REG(rsi, regs[9]);
  PRINT_REG(rbp, regs[10]);
  PRINT_REG(rsp, regs[11]);
  PRINT_REG(r8 , regs[7]);
  PRINT_REG(r9 , regs[6]);
  PRINT_REG(r10, regs[5]);
  PRINT_REG(r11, regs[4]);
  PRINT_REG(r12, regs[3]);
  PRINT_REG(r13, regs[2]);
  PRINT_REG(r14, regs[1]);
  PRINT_REG(r15, regs[0]);
#undef PRINT_REG
  // Print some words near the top of the stack.
  int64_t* rsp = (int64_t*) regs[11];
  int64_t* dump_sp = rsp;
  for (int col1 = 0; col1 < 8; col1++) {
    tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (int64_t)dump_sp);
    os::print_location(tty, *dump_sp++);
  }
  for (int row = 0; row < 25; row++) {
    tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (int64_t)dump_sp);
    for (int col = 0; col < 4; col++) {
      tty->print(" 0x%016lx", *dump_sp++);
    }
    tty->cr();
  }
  // Print some instructions around pc:
  Disassembler::decode((address)pc-64, (address)pc);
  tty->print_cr("--------");
  Disassembler::decode((address)pc, (address)pc+32);
}

#endif // _LP64

// Now versions that are common to 32/64 bit

void MacroAssembler::addptr(Register dst, int32_t imm32) {
  LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
}

void MacroAssembler::addptr(Register dst, Register src) {
  LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
}

void MacroAssembler::addptr(Address dst, Register src) {
  LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
}

void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::addsd(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::addsd(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::addss(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    addss(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    addss(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::align(int modulus) {
  if (offset() % modulus != 0) {
    nop(modulus - (offset() % modulus));
  }
}

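// Worked example for align() (illustration): at offset() == 13, align(8)
// emits nop(3), padding the current code position to the next multiple of 8.
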
6636void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src) {
6637  // Used in sign-masking with aligned address.
6638  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
6639  if (reachable(src)) {
6640    Assembler::andpd(dst, as_Address(src));
6641  } else {
6642    lea(rscratch1, src);
6643    Assembler::andpd(dst, Address(rscratch1, 0));
6644  }
6645}
6646
6647void MacroAssembler::andps(XMMRegister dst, AddressLiteral src) {
6648  // Used in sign-masking with aligned address.
6649  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
6650  if (reachable(src)) {
6651    Assembler::andps(dst, as_Address(src));
6652  } else {
6653    lea(rscratch1, src);
6654    Assembler::andps(dst, Address(rscratch1, 0));
6655  }
6656}
6657
6658void MacroAssembler::andptr(Register dst, int32_t imm32) {
6659  LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
6660}
6661
6662void MacroAssembler::atomic_incl(AddressLiteral counter_addr) {
6663  pushf();
6664  if (os::is_MP())
6665    lock();
6666  incrementl(counter_addr);
6667  popf();
6668}
6669
6670// Writes to stack successive pages until offset reached to check for
6671// stack overflow + shadow pages.  This clobbers tmp.
6672void MacroAssembler::bang_stack_size(Register size, Register tmp) {
6673  movptr(tmp, rsp);
6674  // Bang stack for total size given plus shadow page size.
6675  // Bang one page at a time because large size can bang beyond yellow and
6676  // red zones.
6677  Label loop;
6678  bind(loop);
6679  movl(Address(tmp, (-os::vm_page_size())), size );
6680  subptr(tmp, os::vm_page_size());
6681  subl(size, os::vm_page_size());
6682  jcc(Assembler::greater, loop);
6683
6684  // Bang down shadow pages too.
6685  // The -1 because we already subtracted 1 page.
6686  for (int i = 0; i< StackShadowPages-1; i++) {
6687    // this could be any sized move but this is can be a debugging crumb
6688    // so the bigger the better.
6689    movptr(Address(tmp, (-i*os::vm_page_size())), size );
6690  }
6691}
6692
6693void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
6694  assert(UseBiasedLocking, "why call this otherwise?");
6695
6696  // Check for biased locking unlock case, which is a no-op
6697  // Note: we do not have to check the thread ID for two reasons.
6698  // First, the interpreter checks for IllegalMonitorStateException at
6699  // a higher level. Second, if the bias was revoked while we held the
6700  // lock, the object could not be rebiased toward another thread, so
6701  // the bias bit would be clear.
6702  movptr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
6703  andptr(temp_reg, markOopDesc::biased_lock_mask_in_place);
6704  cmpptr(temp_reg, markOopDesc::biased_lock_pattern);
6705  jcc(Assembler::equal, done);
6706}
6707
6708void MacroAssembler::c2bool(Register x) {
6709  // implements x == 0 ? 0 : 1
6710  // note: must only look at least-significant byte of x
6711  //       since C-style booleans are stored in one byte
6712  //       only! (was bug)
6713  andl(x, 0xFF);
6714  setb(Assembler::notZero, x);
6715}
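
// Worked example for c2bool: for x = 0x1FF00 the andl leaves 0
// (0x1FF00 & 0xFF == 0), so setb(notZero) stores 0 -- garbage above the
// one-byte C bool is correctly ignored; for x = 0x01 the result is 1.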
6716
6717// Wouldn't need if AddressLiteral version had new name
6718void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
6719  Assembler::call(L, rtype);
6720}
6721
6722void MacroAssembler::call(Register entry) {
6723  Assembler::call(entry);
6724}
6725
6726void MacroAssembler::call(AddressLiteral entry) {
6727  if (reachable(entry)) {
6728    Assembler::call_literal(entry.target(), entry.rspec());
6729  } else {
6730    lea(rscratch1, entry);
6731    Assembler::call(rscratch1);
6732  }
6733}
6734
6735void MacroAssembler::ic_call(address entry) {
6736  RelocationHolder rh = virtual_call_Relocation::spec(pc());
6737  movptr(rax, (intptr_t)Universe::non_oop_word());
6738  call(AddressLiteral(entry, rh));
6739}
6740
6741// Implementation of call_VM versions
6742
6743void MacroAssembler::call_VM(Register oop_result,
6744                             address entry_point,
6745                             bool check_exceptions) {
6746  Label C, E;
6747  call(C, relocInfo::none);
6748  jmp(E);
6749
6750  bind(C);
6751  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
6752  ret(0);
6753
6754  bind(E);
6755}
6756
6757void MacroAssembler::call_VM(Register oop_result,
6758                             address entry_point,
6759                             Register arg_1,
6760                             bool check_exceptions) {
6761  Label C, E;
6762  call(C, relocInfo::none);
6763  jmp(E);
6764
6765  bind(C);
6766  pass_arg1(this, arg_1);
6767  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
6768  ret(0);
6769
6770  bind(E);
6771}
6772
6773void MacroAssembler::call_VM(Register oop_result,
6774                             address entry_point,
6775                             Register arg_1,
6776                             Register arg_2,
6777                             bool check_exceptions) {
6778  Label C, E;
6779  call(C, relocInfo::none);
6780  jmp(E);
6781
6782  bind(C);
6783
6784  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
6785
6786  pass_arg2(this, arg_2);
6787  pass_arg1(this, arg_1);
6788  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
6789  ret(0);
6790
6791  bind(E);
6792}
6793
6794void MacroAssembler::call_VM(Register oop_result,
6795                             address entry_point,
6796                             Register arg_1,
6797                             Register arg_2,
6798                             Register arg_3,
6799                             bool check_exceptions) {
6800  Label C, E;
6801  call(C, relocInfo::none);
6802  jmp(E);
6803
6804  bind(C);
6805
6806  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
6807  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
6808  pass_arg3(this, arg_3);
6809
6810  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
6811  pass_arg2(this, arg_2);
6812
6813  pass_arg1(this, arg_1);
6814  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
6815  ret(0);
6816
6817  bind(E);
6818}
6819
6820void MacroAssembler::call_VM(Register oop_result,
6821                             Register last_java_sp,
6822                             address entry_point,
6823                             int number_of_arguments,
6824                             bool check_exceptions) {
6825  Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
6826  call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
6827}
6828
6829void MacroAssembler::call_VM(Register oop_result,
6830                             Register last_java_sp,
6831                             address entry_point,
6832                             Register arg_1,
6833                             bool check_exceptions) {
6834  pass_arg1(this, arg_1);
6835  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
6836}
6837
6838void MacroAssembler::call_VM(Register oop_result,
6839                             Register last_java_sp,
6840                             address entry_point,
6841                             Register arg_1,
6842                             Register arg_2,
6843                             bool check_exceptions) {
6844
6845  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
6846  pass_arg2(this, arg_2);
6847  pass_arg1(this, arg_1);
6848  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
6849}
6850
6851void MacroAssembler::call_VM(Register oop_result,
6852                             Register last_java_sp,
6853                             address entry_point,
6854                             Register arg_1,
6855                             Register arg_2,
6856                             Register arg_3,
6857                             bool check_exceptions) {
6858  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
6859  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
6860  pass_arg3(this, arg_3);
6861  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
6862  pass_arg2(this, arg_2);
6863  pass_arg1(this, arg_1);
6864  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
6865}
6866
6867void MacroAssembler::super_call_VM(Register oop_result,
6868                                   Register last_java_sp,
6869                                   address entry_point,
6870                                   int number_of_arguments,
6871                                   bool check_exceptions) {
6872  Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
6873  MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
6874}
6875
6876void MacroAssembler::super_call_VM(Register oop_result,
6877                                   Register last_java_sp,
6878                                   address entry_point,
6879                                   Register arg_1,
6880                                   bool check_exceptions) {
6881  pass_arg1(this, arg_1);
6882  super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
6883}
6884
6885void MacroAssembler::super_call_VM(Register oop_result,
6886                                   Register last_java_sp,
6887                                   address entry_point,
6888                                   Register arg_1,
6889                                   Register arg_2,
6890                                   bool check_exceptions) {
6891
6892  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
6893  pass_arg2(this, arg_2);
6894  pass_arg1(this, arg_1);
6895  super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
6896}
6897
6898void MacroAssembler::super_call_VM(Register oop_result,
6899                                   Register last_java_sp,
6900                                   address entry_point,
6901                                   Register arg_1,
6902                                   Register arg_2,
6903                                   Register arg_3,
6904                                   bool check_exceptions) {
6905  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
6906  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
6907  pass_arg3(this, arg_3);
6908  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
6909  pass_arg2(this, arg_2);
6910  pass_arg1(this, arg_1);
6911  super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
6912}
6913
6914void MacroAssembler::call_VM_base(Register oop_result,
6915                                  Register java_thread,
6916                                  Register last_java_sp,
6917                                  address  entry_point,
6918                                  int      number_of_arguments,
6919                                  bool     check_exceptions) {
6920  // determine java_thread register
6921  if (!java_thread->is_valid()) {
6922#ifdef _LP64
6923    java_thread = r15_thread;
6924#else
6925    java_thread = rdi;
6926    get_thread(java_thread);
6927#endif // LP64
6928  }
6929  // determine last_java_sp register
6930  if (!last_java_sp->is_valid()) {
6931    last_java_sp = rsp;
6932  }
6933  // debugging support
6934  assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
6935  LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
6936#ifdef ASSERT
6937  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
6938  // r12 is the heapbase.
6939  LP64_ONLY(if ((UseCompressedOops || UseCompressedKlassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");)
6940#endif // ASSERT
6941
6942  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
6943  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
6944
6945  // push java thread (becomes first argument of C function)
6946
6947  NOT_LP64(push(java_thread); number_of_arguments++);
6948  LP64_ONLY(mov(c_rarg0, r15_thread));
6949
6950  // set last Java frame before call
6951  assert(last_java_sp != rbp, "can't use ebp/rbp");
6952
6953  // Only interpreter should have to set fp
6954  set_last_Java_frame(java_thread, last_java_sp, rbp, NULL);
6955
6956  // do the call, remove parameters
6957  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);
6958
6959  // restore the thread (cannot use the pushed argument since arguments
6960  // may be overwritten by C code generated by an optimizing compiler);
6961  // however can use the register value directly if it is callee saved.
6962  if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
6963    // rdi & rsi (also r15) are callee saved -> nothing to do
6964#ifdef ASSERT
6965    guarantee(java_thread != rax, "change this code");
6966    push(rax);
6967    { Label L;
6968      get_thread(rax);
6969      cmpptr(java_thread, rax);
6970      jcc(Assembler::equal, L);
6971      STOP("MacroAssembler::call_VM_base: rdi not callee saved?");
6972      bind(L);
6973    }
6974    pop(rax);
6975#endif
6976  } else {
6977    get_thread(java_thread);
6978  }
6979  // reset last Java frame
6980  // Only interpreter should have to clear fp
6981  reset_last_Java_frame(java_thread, true, false);
6982
6983#ifndef CC_INTERP
6984   // C++ interp handles this in the interpreter
6985  check_and_handle_popframe(java_thread);
6986  check_and_handle_earlyret(java_thread);
6987#endif /* CC_INTERP */
6988
6989  if (check_exceptions) {
6990    // check for pending exceptions (java_thread is set upon return)
6991    cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t) NULL_WORD);
6992#ifndef _LP64
6993    jump_cc(Assembler::notEqual,
6994            RuntimeAddress(StubRoutines::forward_exception_entry()));
6995#else
6996    // This used to conditionally jump to forward_exception however it is
6997    // possible if we relocate that the branch will not reach. So we must jump
6998    // around so we can always reach
6999
7000    Label ok;
7001    jcc(Assembler::equal, ok);
7002    jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
7003    bind(ok);
7004#endif // LP64
7005  }
7006
7007  // get oop result if there is one and reset the value in the thread
7008  if (oop_result->is_valid()) {
7009    get_vm_result(oop_result, java_thread);
7010  }
7011}
7012
7013void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
7014
7015  // Calculate the value for last_Java_sp
7016  // somewhat subtle. call_VM does an intermediate call
7017  // which places a return address on the stack just under the
7018  // stack pointer as the user finsihed with it. This allows
7019  // use to retrieve last_Java_pc from last_Java_sp[-1].
7020  // On 32bit we then have to push additional args on the stack to accomplish
7021  // the actual requested call. On 64bit call_VM only can use register args
7022  // so the only extra space is the return address that call_VM created.
7023  // This hopefully explains the calculations here.
7024
7025#ifdef _LP64
7026  // We've pushed one address, correct last_Java_sp
7027  lea(rax, Address(rsp, wordSize));
7028#else
7029  lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
7030#endif // LP64
7031
7032  call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);
7033
7034}
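
// A concrete sketch of the last_Java_sp arithmetic above: on 32-bit with
// number_of_arguments == 2, rax = rsp + 3*wordSize skips the two pushed
// argument slots plus the intermediate call's return-address slot,
// recovering the sp the caller had before call_VM; on 64-bit the arguments
// travel in registers, so only the single return-address slot is skipped.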
7035
7036void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
7037  call_VM_leaf_base(entry_point, number_of_arguments);
7038}
7039
7040void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
7041  pass_arg0(this, arg_0);
7042  call_VM_leaf(entry_point, 1);
7043}
7044
7045void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
7046
7047  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
7048  pass_arg1(this, arg_1);
7049  pass_arg0(this, arg_0);
7050  call_VM_leaf(entry_point, 2);
7051}
7052
7053void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
7054  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
7055  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
7056  pass_arg2(this, arg_2);
7057  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
7058  pass_arg1(this, arg_1);
7059  pass_arg0(this, arg_0);
7060  call_VM_leaf(entry_point, 3);
7061}
7062
7063void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
7064  pass_arg0(this, arg_0);
7065  MacroAssembler::call_VM_leaf_base(entry_point, 1);
7066}
7067
7068void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
7069
7070  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
7071  pass_arg1(this, arg_1);
7072  pass_arg0(this, arg_0);
7073  MacroAssembler::call_VM_leaf_base(entry_point, 2);
7074}
7075
7076void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
7077  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
7078  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
7079  pass_arg2(this, arg_2);
7080  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
7081  pass_arg1(this, arg_1);
7082  pass_arg0(this, arg_0);
7083  MacroAssembler::call_VM_leaf_base(entry_point, 3);
7084}
7085
7086void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
7087  LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
7088  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
7089  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
7090  pass_arg3(this, arg_3);
7091  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
7092  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
7093  pass_arg2(this, arg_2);
7094  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
7095  pass_arg1(this, arg_1);
7096  pass_arg0(this, arg_0);
7097  MacroAssembler::call_VM_leaf_base(entry_point, 4);
7098}
7099
7100void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
7101  movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
7102  movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD);
7103  verify_oop(oop_result, "broken oop in call_VM_base");
7104}
7105
7106void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
7107  movptr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
7108  movptr(Address(java_thread, JavaThread::vm_result_2_offset()), NULL_WORD);
7109}
7110
7111void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
7112}
7113
7114void MacroAssembler::check_and_handle_popframe(Register java_thread) {
7115}
7116
7117void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm) {
7118  if (reachable(src1)) {
7119    cmpl(as_Address(src1), imm);
7120  } else {
7121    lea(rscratch1, src1);
7122    cmpl(Address(rscratch1, 0), imm);
7123  }
7124}
7125
7126void MacroAssembler::cmp32(Register src1, AddressLiteral src2) {
7127  assert(!src2.is_lval(), "use cmpptr");
7128  if (reachable(src2)) {
7129    cmpl(src1, as_Address(src2));
7130  } else {
7131    lea(rscratch1, src2);
7132    cmpl(src1, Address(rscratch1, 0));
7133  }
7134}
7135
7136void MacroAssembler::cmp32(Register src1, int32_t imm) {
7137  Assembler::cmpl(src1, imm);
7138}
7139
7140void MacroAssembler::cmp32(Register src1, Address src2) {
7141  Assembler::cmpl(src1, src2);
7142}
7143
7144void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
7145  ucomisd(opr1, opr2);
7146
7147  Label L;
7148  if (unordered_is_less) {
7149    movl(dst, -1);
7150    jcc(Assembler::parity, L);
7151    jcc(Assembler::below , L);
7152    movl(dst, 0);
7153    jcc(Assembler::equal , L);
7154    increment(dst);
7155  } else { // unordered is greater
7156    movl(dst, 1);
7157    jcc(Assembler::parity, L);
7158    jcc(Assembler::above , L);
7159    movl(dst, 0);
7160    jcc(Assembler::equal , L);
7161    decrementl(dst);
7162  }
7163  bind(L);
7164}
7165
7166void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
7167  ucomiss(opr1, opr2);
7168
7169  Label L;
7170  if (unordered_is_less) {
7171    movl(dst, -1);
7172    jcc(Assembler::parity, L);
7173    jcc(Assembler::below , L);
7174    movl(dst, 0);
7175    jcc(Assembler::equal , L);
7176    increment(dst);
7177  } else { // unordered is greater
7178    movl(dst, 1);
7179    jcc(Assembler::parity, L);
7180    jcc(Assembler::above , L);
7181    movl(dst, 0);
7182    jcc(Assembler::equal , L);
7183    decrementl(dst);
7184  }
7185  bind(L);
7186}
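
// Result mapping shared by cmpsd2int/cmpss2int above (value left in dst):
//   opr1 <  opr2   -> -1
//   opr1 == opr2   ->  0
//   opr1 >  opr2   -> +1
//   unordered(NaN) -> -1 if unordered_is_less, else +1
// ucomisd/ucomiss set PF on an unordered compare, which is what the parity
// branches catch before the below/above tests.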
7187
7188
7189void MacroAssembler::cmp8(AddressLiteral src1, int imm) {
7190  if (reachable(src1)) {
7191    cmpb(as_Address(src1), imm);
7192  } else {
7193    lea(rscratch1, src1);
7194    cmpb(Address(rscratch1, 0), imm);
7195  }
7196}
7197
7198void MacroAssembler::cmpptr(Register src1, AddressLiteral src2) {
7199#ifdef _LP64
7200  if (src2.is_lval()) {
7201    movptr(rscratch1, src2);
7202    Assembler::cmpq(src1, rscratch1);
7203  } else if (reachable(src2)) {
7204    cmpq(src1, as_Address(src2));
7205  } else {
7206    lea(rscratch1, src2);
7207    Assembler::cmpq(src1, Address(rscratch1, 0));
7208  }
7209#else
7210  if (src2.is_lval()) {
7211    cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
7212  } else {
7213    cmpl(src1, as_Address(src2));
7214  }
7215#endif // _LP64
7216}
7217
7218void MacroAssembler::cmpptr(Address src1, AddressLiteral src2) {
7219  assert(src2.is_lval(), "not a mem-mem compare");
7220#ifdef _LP64
7221  // moves src2's literal address
7222  movptr(rscratch1, src2);
7223  Assembler::cmpq(src1, rscratch1);
7224#else
7225  cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
7226#endif // _LP64
7227}
7228
7229void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr) {
7230  if (reachable(adr)) {
7231    if (os::is_MP())
7232      lock();
7233    cmpxchgptr(reg, as_Address(adr));
7234  } else {
7235    lea(rscratch1, adr);
7236    if (os::is_MP())
7237      lock();
7238    cmpxchgptr(reg, Address(rscratch1, 0));
7239  }
7240}
7241
7242void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
7243  LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr));
7244}
7245
7246void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) {
7247  if (reachable(src)) {
7248    Assembler::comisd(dst, as_Address(src));
7249  } else {
7250    lea(rscratch1, src);
7251    Assembler::comisd(dst, Address(rscratch1, 0));
7252  }
7253}
7254
7255void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) {
7256  if (reachable(src)) {
7257    Assembler::comiss(dst, as_Address(src));
7258  } else {
7259    lea(rscratch1, src);
7260    Assembler::comiss(dst, Address(rscratch1, 0));
7261  }
7262}
7263
7264
7265void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr) {
7266  Condition negated_cond = negate_condition(cond);
7267  Label L;
7268  jcc(negated_cond, L);
7269  atomic_incl(counter_addr);
7270  bind(L);
7271}
7272
7273int MacroAssembler::corrected_idivl(Register reg) {
7274  // Full implementation of Java idiv and irem; checks for
7275  // special case as described in JVM spec., p.243 & p.271.
7276  // The function returns the (pc) offset of the idivl
7277  // instruction - may be needed for implicit exceptions.
7278  //
7279  //         normal case                           special case
7280  //
7281  // input : rax,: dividend                         min_int
7282  //         reg: divisor   (may not be rax,/rdx)   -1
7283  //
7284  // output: rax,: quotient  (= rax, idiv reg)       min_int
7285  //         rdx: remainder (= rax, irem reg)       0
7286  assert(reg != rax && reg != rdx, "reg cannot be rax, or rdx register");
7287  const int min_int = 0x80000000;
7288  Label normal_case, special_case;
7289
7290  // check for special case
7291  cmpl(rax, min_int);
7292  jcc(Assembler::notEqual, normal_case);
7293  xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
7294  cmpl(reg, -1);
7295  jcc(Assembler::equal, special_case);
7296
7297  // handle normal case
7298  bind(normal_case);
7299  cdql();
7300  int idivl_offset = offset();
7301  idivl(reg);
7302
7303  // normal and special case exit
7304  bind(special_case);
7305
7306  return idivl_offset;
7307}
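
// Worked example of the special case above: Java requires
//   min_jint / -1 == min_jint   and   min_jint % -1 == 0,
// but idivl would raise #DE since +2^31 is not representable in 32 bits.
// The code therefore branches around idivl with rax = min_int already in
// place and rdx zeroed, producing exactly the required quotient/remainder.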
7308
7309
7310
7311void MacroAssembler::decrementl(Register reg, int value) {
7312  if (value == min_jint) {subl(reg, value) ; return; }
7313  if (value <  0) { incrementl(reg, -value); return; }
7314  if (value == 0) {                        ; return; }
7315  if (value == 1 && UseIncDec) { decl(reg) ; return; }
7316  /* else */      { subl(reg, value)       ; return; }
7317}
7318
7319void MacroAssembler::decrementl(Address dst, int value) {
7320  if (value == min_jint) {subl(dst, value) ; return; }
7321  if (value <  0) { incrementl(dst, -value); return; }
7322  if (value == 0) {                        ; return; }
7323  if (value == 1 && UseIncDec) { decl(dst) ; return; }
7324  /* else */      { subl(dst, value)       ; return; }
7325}
7326
7327void MacroAssembler::division_with_shift (Register reg, int shift_value) {
7328  assert (shift_value > 0, "illegal shift value");
7329  Label _is_positive;
7330  testl (reg, reg);
7331  jcc (Assembler::positive, _is_positive);
7332  int offset = (1 << shift_value) - 1 ;
7333
7334  if (offset == 1) {
7335    incrementl(reg);
7336  } else {
7337    addl(reg, offset);
7338  }
7339
7340  bind (_is_positive);
7341  sarl(reg, shift_value);
7342}
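
// Worked example: for reg = -7 and shift_value = 2 the bias is
// (1 << 2) - 1 = 3, so reg becomes -7 + 3 = -4 and sarl(reg, 2) yields -1,
// matching Java's truncating division -7 / 4 == -1 (an unbiased sarl alone
// would give the floor, -2).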
7343
7344void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src) {
7345  if (reachable(src)) {
7346    Assembler::divsd(dst, as_Address(src));
7347  } else {
7348    lea(rscratch1, src);
7349    Assembler::divsd(dst, Address(rscratch1, 0));
7350  }
7351}
7352
7353void MacroAssembler::divss(XMMRegister dst, AddressLiteral src) {
7354  if (reachable(src)) {
7355    Assembler::divss(dst, as_Address(src));
7356  } else {
7357    lea(rscratch1, src);
7358    Assembler::divss(dst, Address(rscratch1, 0));
7359  }
7360}
7361
7362// !defined(COMPILER2) is because of stupid core builds
7363#if !defined(_LP64) || defined(COMPILER1) || !defined(COMPILER2)
7364void MacroAssembler::empty_FPU_stack() {
7365  if (VM_Version::supports_mmx()) {
7366    emms();
7367  } else {
7368    for (int i = 8; i-- > 0; ) ffree(i);
7369  }
7370}
7371#endif // !LP64 || C1 || !C2
7372
7373
7374// Defines obj, preserves var_size_in_bytes
7375void MacroAssembler::eden_allocate(Register obj,
7376                                   Register var_size_in_bytes,
7377                                   int con_size_in_bytes,
7378                                   Register t1,
7379                                   Label& slow_case) {
7380  assert(obj == rax, "obj must be in rax, for cmpxchg");
7381  assert_different_registers(obj, var_size_in_bytes, t1);
7382  if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) {
7383    jmp(slow_case);
7384  } else {
7385    Register end = t1;
7386    Label retry;
7387    bind(retry);
7388    ExternalAddress heap_top((address) Universe::heap()->top_addr());
7389    movptr(obj, heap_top);
7390    if (var_size_in_bytes == noreg) {
7391      lea(end, Address(obj, con_size_in_bytes));
7392    } else {
7393      lea(end, Address(obj, var_size_in_bytes, Address::times_1));
7394    }
7395    // if end < obj then we wrapped around => object too long => slow case
7396    cmpptr(end, obj);
7397    jcc(Assembler::below, slow_case);
7398    cmpptr(end, ExternalAddress((address) Universe::heap()->end_addr()));
7399    jcc(Assembler::above, slow_case);
7400    // Compare obj with the current top addr; if they are still equal, store the
7401    // new top addr (end) at the top addr pointer. Sets ZF if they were equal, and
7402    // clears it otherwise. Use lock prefix for atomicity on MPs.
7403    locked_cmpxchgptr(end, heap_top);
7404    jcc(Assembler::notEqual, retry);
7405  }
7406}
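
// The body above is the classic inline-allocation CAS loop; in pseudo-code
// (a sketch, not emitted instructions):
//   do {
//     obj = *top_addr;                      // movptr(obj, heap_top)
//     end = obj + size;                     // lea
//     if (end < obj) goto slow_case;        // wrapped around
//     if (end > *end_addr) goto slow_case;  // eden exhausted
//   } while (!CAS(top_addr, expected: obj, new: end));
// cmpxchg requires the expected old value in rax, which is why the obj
// register is constrained to rax by the assert at the top.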
7407
7408void MacroAssembler::enter() {
7409  push(rbp);
7410  mov(rbp, rsp);
7411}
7412
7413// A 5 byte nop that is safe for patching (see patch_verified_entry)
7414void MacroAssembler::fat_nop() {
7415  if (UseAddressNop) {
7416    addr_nop_5();
7417  } else {
7418    emit_byte(0x26); // es:
7419    emit_byte(0x2e); // cs:
7420    emit_byte(0x64); // fs:
7421    emit_byte(0x65); // gs:
7422    emit_byte(0x90);
7423  }
7424}
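
// The five bytes on the fallback path decode as one instruction: 0x26,
// 0x2e, 0x64, 0x65 are es:/cs:/fs:/gs: segment-override prefixes in front
// of the single-byte 0x90 nop. That keeps the sequence atomic for the
// decoder while still occupying the 5 bytes a later jmp-rel32 patch of the
// verified entry point needs.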
7425
7426void MacroAssembler::fcmp(Register tmp) {
7427  fcmp(tmp, 1, true, true);
7428}
7429
7430void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) {
7431  assert(!pop_right || pop_left, "usage error");
7432  if (VM_Version::supports_cmov()) {
7433    assert(tmp == noreg, "unneeded temp");
7434    if (pop_left) {
7435      fucomip(index);
7436    } else {
7437      fucomi(index);
7438    }
7439    if (pop_right) {
7440      fpop();
7441    }
7442  } else {
7443    assert(tmp != noreg, "need temp");
7444    if (pop_left) {
7445      if (pop_right) {
7446        fcompp();
7447      } else {
7448        fcomp(index);
7449      }
7450    } else {
7451      fcom(index);
7452    }
7453    // convert FPU condition into eflags condition via rax,
7454    save_rax(tmp);
7455    fwait(); fnstsw_ax();
7456    sahf();
7457    restore_rax(tmp);
7458  }
7459  // condition codes set as follows:
7460  //
7461  // CF (corresponds to C0) if x < y
7462  // PF (corresponds to C2) if unordered
7463  // ZF (corresponds to C3) if x = y
7464}
7465
7466void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) {
7467  fcmp2int(dst, unordered_is_less, 1, true, true);
7468}
7469
7470void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) {
7471  fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right);
7472  Label L;
7473  if (unordered_is_less) {
7474    movl(dst, -1);
7475    jcc(Assembler::parity, L);
7476    jcc(Assembler::below , L);
7477    movl(dst, 0);
7478    jcc(Assembler::equal , L);
7479    increment(dst);
7480  } else { // unordered is greater
7481    movl(dst, 1);
7482    jcc(Assembler::parity, L);
7483    jcc(Assembler::above , L);
7484    movl(dst, 0);
7485    jcc(Assembler::equal , L);
7486    decrementl(dst);
7487  }
7488  bind(L);
7489}
7490
7491void MacroAssembler::fld_d(AddressLiteral src) {
7492  fld_d(as_Address(src));
7493}
7494
7495void MacroAssembler::fld_s(AddressLiteral src) {
7496  fld_s(as_Address(src));
7497}
7498
7499void MacroAssembler::fld_x(AddressLiteral src) {
7500  Assembler::fld_x(as_Address(src));
7501}
7502
7503void MacroAssembler::fldcw(AddressLiteral src) {
7504  Assembler::fldcw(as_Address(src));
7505}
7506
7507void MacroAssembler::pow_exp_core_encoding() {
7508  // kills rax, rcx, rdx
7509  subptr(rsp,sizeof(jdouble));
7510  // computes 2^X. Stack: X ...
7511  // f2xm1 computes 2^X-1 but only operates on -1<=X<=1. Get int(X) and
7512  // keep it on the thread's stack to compute 2^int(X) later
7513  // then compute 2^(X-int(X)) as (2^(X-int(X))-1)+1
7514  // final result is obtained with: 2^X = 2^int(X) * 2^(X-int(X))
7515  fld_s(0);                 // Stack: X X ...
7516  frndint();                // Stack: int(X) X ...
7517  fsuba(1);                 // Stack: int(X) X-int(X) ...
7518  fistp_s(Address(rsp,0));  // move int(X) as integer to thread's stack. Stack: X-int(X) ...
7519  f2xm1();                  // Stack: 2^(X-int(X))-1 ...
7520  fld1();                   // Stack: 1 2^(X-int(X))-1 ...
7521  faddp(1);                 // Stack: 2^(X-int(X))
7522  // computes 2^(int(X)): add exponent bias (1023) to int(X), then
7523  // shift int(X)+1023 to exponent position.
7524  // Exponent is limited to 11 bits if int(X)+1023 does not fit in 11
7525  // bits, set result to NaN. 0x000 and 0x7FF are reserved exponent
7526  // values so detect them and set result to NaN.
7527  movl(rax,Address(rsp,0));
7528  movl(rcx, -2048); // 11 bit mask and valid NaN binary encoding
7529  addl(rax, 1023);
7530  movl(rdx,rax);
7531  shll(rax,20);
7532  // Check that 0 < int(X)+1023 < 2047. Otherwise set rax to NaN.
7533  addl(rdx,1);
7534  // Check that 1 < int(X)+1023+1 < 2048
7535  // in 3 steps:
7536  // 1- (int(X)+1023+1)&-2048 == 0 => 0 <= int(X)+1023+1 < 2048
7537  // 2- (int(X)+1023+1)&-2048 != 0
7538  // 3- (int(X)+1023+1)&-2048 != 1
7539  // Do 2- first because addl just updated the flags.
7540  cmov32(Assembler::equal,rax,rcx);
7541  cmpl(rdx,1);
7542  cmov32(Assembler::equal,rax,rcx);
7543  testl(rdx,rcx);
7544  cmov32(Assembler::notEqual,rax,rcx);
7545  movl(Address(rsp,4),rax);
7546  movl(Address(rsp,0),0);
7547  fmul_d(Address(rsp,0));   // Stack: 2^X ...
7548  addptr(rsp,sizeof(jdouble));
7549}
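
// Worked example of the exponent-encoding trick above: for int(X) = 3 the
// code forms 3 + 1023 = 1026 and shifts it left by 20 into the high 32-bit
// word of a double, giving 0x40200000; with the low word zeroed (the movl
// to Address(rsp,0)), the stacked bits 0x4020000000000000 are exactly the
// IEEE-754 double 2^3 = 8.0, which fmul_d then folds into the
// 2^(X-int(X)) already on the FPU stack.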
7550
7551void MacroAssembler::increase_precision() {
7552  subptr(rsp, BytesPerWord);
7553  fnstcw(Address(rsp, 0));
7554  movl(rax, Address(rsp, 0));
7555  orl(rax, 0x300);
7556  push(rax);
7557  fldcw(Address(rsp, 0));
7558  pop(rax);
7559}
7560
7561void MacroAssembler::restore_precision() {
7562  fldcw(Address(rsp, 0));
7563  addptr(rsp, BytesPerWord);
7564}
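
// The 0x300 or-ed in above sets the x87 precision-control field (bits 8-9
// of the FPU control word) to 11b = 64-bit extended precision (00b is
// single, 10b double). The widened word is pushed and loaded via fldcw,
// while the original control word stays saved at [rsp] so that
// restore_precision() can simply fldcw it back and release the slot.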
7565
7566void MacroAssembler::fast_pow() {
7567  // computes X^Y = 2^(Y * log2(X))
7568  // if fast computation is not possible, result is NaN. Requires
7569  // fallback from user of this macro.
7570  // increase precision for intermediate steps of the computation
7571  increase_precision();
7572  fyl2x();                 // Stack: (Y*log2(X)) ...
7573  pow_exp_core_encoding(); // Stack: 2^(Y*log2(X)) = X^Y ...
7574  restore_precision();
7575}
7576
7577void MacroAssembler::fast_exp() {
7578  // computes exp(X) = 2^(X * log2(e))
7579  // if fast computation is not possible, result is NaN. Requires
7580  // fallback from user of this macro.
7581  // increase precision for intermediate steps of the computation
7582  increase_precision();
7583  fldl2e();                // Stack: log2(e) X ...
7584  fmulp(1);                // Stack: (X*log2(e)) ...
7585  pow_exp_core_encoding(); // Stack: exp(X) ...
7586  restore_precision();
7587}
7588
7589void MacroAssembler::pow_or_exp(bool is_exp, int num_fpu_regs_in_use) {
7590  // kills rax, rcx, rdx
7591  // pow and exp needs 2 extra registers on the fpu stack.
7592  Label slow_case, done;
7593  Register tmp = noreg;
7594  if (!VM_Version::supports_cmov()) {
7595    // fcmp needs a temporary so preserve rdx,
7596    tmp = rdx;
7597  }
7598  Register tmp2 = rax;
7599  Register tmp3 = rcx;
7600
7601  if (is_exp) {
7602    // Stack: X
7603    fld_s(0);                   // duplicate argument for runtime call. Stack: X X
7604    fast_exp();                 // Stack: exp(X) X
7605    fcmp(tmp, 0, false, false); // Stack: exp(X) X
7606    // exp(X) not equal to itself: exp(X) is NaN go to slow case.
7607    jcc(Assembler::parity, slow_case);
7608    // get rid of duplicate argument. Stack: exp(X)
7609    if (num_fpu_regs_in_use > 0) {
7610      fxch();
7611      fpop();
7612    } else {
7613      ffree(1);
7614    }
7615    jmp(done);
7616  } else {
7617    // Stack: X Y
7618    Label x_negative, y_odd;
7619
7620    fldz();                     // Stack: 0 X Y
7621    fcmp(tmp, 1, true, false);  // Stack: X Y
7622    jcc(Assembler::above, x_negative);
7623
7624    // X >= 0
7625
7626    fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
7627    fld_s(1);                   // Stack: X Y X Y
7628    fast_pow();                 // Stack: X^Y X Y
7629    fcmp(tmp, 0, false, false); // Stack: X^Y X Y
7630    // X^Y not equal to itself: X^Y is NaN go to slow case.
7631    jcc(Assembler::parity, slow_case);
7632    // get rid of duplicate arguments. Stack: X^Y
7633    if (num_fpu_regs_in_use > 0) {
7634      fxch(); fpop();
7635      fxch(); fpop();
7636    } else {
7637      ffree(2);
7638      ffree(1);
7639    }
7640    jmp(done);
7641
7642    // X <= 0
7643    bind(x_negative);
7644
7645    fld_s(1);                   // Stack: Y X Y
7646    frndint();                  // Stack: int(Y) X Y
7647    fcmp(tmp, 2, false, false); // Stack: int(Y) X Y
7648    jcc(Assembler::notEqual, slow_case);
7649
7650    subptr(rsp, 8);
7651
7652    // For X^Y, when X < 0, Y has to be an integer and the final
7653    // result depends on whether it's odd or even. We just checked
7654    // that int(Y) == Y.  We move int(Y) to gp registers as a 64 bit
7655    // integer to test its parity. If int(Y) is huge and doesn't fit
7656    // in the 64 bit integer range, the integer indefinite value will
7657    // end up in the gp registers. Huge numbers are all even, and the
7658    // integer indefinite value is even as well, so it's fine.
7659
7660#ifdef ASSERT
7661    // Let's check we don't end up with an integer indefinite number
7662    // when not expected. First test for huge numbers: check whether
7663    // int(Y)+1 == int(Y) which is true for very large numbers and
7664    // those are all even. A 64 bit integer is guaranteed to not
7665    // overflow for numbers where y+1 != y (when precision is set to
7666    // double precision).
7667    Label y_not_huge;
7668
7669    fld1();                     // Stack: 1 int(Y) X Y
7670    fadd(1);                    // Stack: 1+int(Y) int(Y) X Y
7671
7672#ifdef _LP64
7673    // trip to memory to force the precision down from double extended
7674    // precision
7675    fstp_d(Address(rsp, 0));
7676    fld_d(Address(rsp, 0));
7677#endif
7678
7679    fcmp(tmp, 1, true, false);  // Stack: int(Y) X Y
7680#endif
7681
7682    // move int(Y) as 64 bit integer to thread's stack
7683    fistp_d(Address(rsp,0));    // Stack: X Y
7684
7685#ifdef ASSERT
7686    jcc(Assembler::notEqual, y_not_huge);
7687
7688    // Y is huge so we know it's even. It may not fit in a 64 bit
7689    // integer and we don't want the debug code below to see the
7690    // integer indefinite value so overwrite int(Y) on the thread's
7691    // stack with 0.
7692    movl(Address(rsp, 0), 0);
7693    movl(Address(rsp, 4), 0);
7694
7695    bind(y_not_huge);
7696#endif
7697
7698    fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
7699    fld_s(1);                   // Stack: X Y X Y
7700    fabs();                     // Stack: abs(X) Y X Y
7701    fast_pow();                 // Stack: abs(X)^Y X Y
7702    fcmp(tmp, 0, false, false); // Stack: abs(X)^Y X Y
7703    // abs(X)^Y not equal to itself: abs(X)^Y is NaN go to slow case.
7704
7705    pop(tmp2);
7706    NOT_LP64(pop(tmp3));
7707    jcc(Assembler::parity, slow_case);
7708
7709#ifdef ASSERT
7710    // Check that int(Y) is not integer indefinite value (int
7711    // overflow). Shouldn't happen because for values that would
7712    // overflow, 1+int(Y)==Y which was tested earlier.
7713#ifndef _LP64
7714    {
7715      Label integer;
7716      testl(tmp2, tmp2);
7717      jcc(Assembler::notZero, integer);
7718      cmpl(tmp3, 0x80000000);
7719      jcc(Assembler::notZero, integer);
7720      STOP("integer indefinite value shouldn't be seen here");
7721      bind(integer);
7722    }
7723#else
7724    {
7725      Label integer;
7726      mov(tmp3, tmp2); // preserve tmp2 for parity check below
7727      shlq(tmp3, 1);
7728      jcc(Assembler::carryClear, integer);
7729      jcc(Assembler::notZero, integer);
7730      STOP("integer indefinite value shouldn't be seen here");
7731      bind(integer);
7732    }
7733#endif
7734#endif
7735
7736    // get rid of duplicate arguments. Stack: X^Y
7737    if (num_fpu_regs_in_use > 0) {
7738      fxch(); fpop();
7739      fxch(); fpop();
7740    } else {
7741      ffree(2);
7742      ffree(1);
7743    }
7744
7745    testl(tmp2, 1);
7746    jcc(Assembler::zero, done); // X <= 0, Y even: X^Y = abs(X)^Y
7747    // X <= 0, Y odd: X^Y = -abs(X)^Y
7748
7749    fchs();                     // Stack: -abs(X)^Y Y
7750    jmp(done);
7751  }
7752
7753  // slow case: runtime call
7754  bind(slow_case);
7755
7756  fpop();                       // pop incorrect result or int(Y)
7757
7758  fp_runtime_fallback(is_exp ? CAST_FROM_FN_PTR(address, SharedRuntime::dexp) : CAST_FROM_FN_PTR(address, SharedRuntime::dpow),
7759                      is_exp ? 1 : 2, num_fpu_regs_in_use);
7760
7761  // Come here with result in F-TOS
7762  bind(done);
7763}
7764
7765void MacroAssembler::fpop() {
7766  ffree();
7767  fincstp();
7768}
7769
7770void MacroAssembler::fremr(Register tmp) {
7771  save_rax(tmp);
7772  { Label L;
7773    bind(L);
7774    fprem();
7775    fwait(); fnstsw_ax();
7776#ifdef _LP64
7777    testl(rax, 0x400);
7778    jcc(Assembler::notEqual, L);
7779#else
7780    sahf();
7781    jcc(Assembler::parity, L);
7782#endif // _LP64
7783  }
7784  restore_rax(tmp);
7785  // Result is in ST0.
7786  // Note: fxch & fpop to get rid of ST1
7787  // (otherwise FPU stack could overflow eventually)
7788  fxch(1);
7789  fpop();
7790}
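
// fprem produces only a *partial* remainder when the operand exponents
// differ by more than 63, and signals this by setting C2 in the FPU status
// word; after fnstsw_ax that is bit 0x400 of rax (or PF via sahf on the
// 32-bit path), hence the loop above re-issues fprem until C2 is clear and
// the remainder is exact.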
7791
7792
7793void MacroAssembler::incrementl(AddressLiteral dst) {
7794  if (reachable(dst)) {
7795    incrementl(as_Address(dst));
7796  } else {
7797    lea(rscratch1, dst);
7798    incrementl(Address(rscratch1, 0));
7799  }
7800}
7801
7802void MacroAssembler::incrementl(ArrayAddress dst) {
7803  incrementl(as_Address(dst));
7804}
7805
7806void MacroAssembler::incrementl(Register reg, int value) {
7807  if (value == min_jint) {addl(reg, value) ; return; }
7808  if (value <  0) { decrementl(reg, -value); return; }
7809  if (value == 0) {                        ; return; }
7810  if (value == 1 && UseIncDec) { incl(reg) ; return; }
7811  /* else */      { addl(reg, value)       ; return; }
7812}
7813
7814void MacroAssembler::incrementl(Address dst, int value) {
7815  if (value == min_jint) {addl(dst, value) ; return; }
7816  if (value <  0) { decrementl(dst, -value); return; }
7817  if (value == 0) {                        ; return; }
7818  if (value == 1 && UseIncDec) { incl(dst) ; return; }
7819  /* else */      { addl(dst, value)       ; return; }
7820}
7821
7822void MacroAssembler::jump(AddressLiteral dst) {
7823  if (reachable(dst)) {
7824    jmp_literal(dst.target(), dst.rspec());
7825  } else {
7826    lea(rscratch1, dst);
7827    jmp(rscratch1);
7828  }
7829}
7830
7831void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst) {
7832  if (reachable(dst)) {
7833    InstructionMark im(this);
7834    relocate(dst.reloc());
7835    const int short_size = 2;
7836    const int long_size = 6;
7837    int offs = (intptr_t)dst.target() - ((intptr_t)_code_pos);
7838    if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
7839      // 0111 tttn #8-bit disp
7840      emit_byte(0x70 | cc);
7841      emit_byte((offs - short_size) & 0xFF);
7842    } else {
7843      // 0000 1111 1000 tttn #32-bit disp
7844      emit_byte(0x0F);
7845      emit_byte(0x80 | cc);
7846      emit_long(offs - long_size);
7847    }
7848  } else {
7849#ifdef ASSERT
7850    warning("reversing conditional branch");
7851#endif /* ASSERT */
7852    Label skip;
7853    jccb(reverse[cc], skip);
7854    lea(rscratch1, dst);
7855    Assembler::jmp(rscratch1);
7856    bind(skip);
7857  }
7858}
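
// Encoding sketch for the reachable path above, taking cc = equal (0x4):
//   short form: 0x70|0x4, disp8        -> 74 xx         (je rel8)
//   long  form: 0x0F, 0x80|0x4, disp32 -> 0F 84 xx ...  (je rel32)
// The offs - short_size / offs - long_size corrections account for x86
// branch displacements being relative to the end of the instruction.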
7859
7860void MacroAssembler::ldmxcsr(AddressLiteral src) {
7861  if (reachable(src)) {
7862    Assembler::ldmxcsr(as_Address(src));
7863  } else {
7864    lea(rscratch1, src);
7865    Assembler::ldmxcsr(Address(rscratch1, 0));
7866  }
7867}
7868
7869int MacroAssembler::load_signed_byte(Register dst, Address src) {
7870  int off;
7871  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
7872    off = offset();
7873    movsbl(dst, src); // movsxb
7874  } else {
7875    off = load_unsigned_byte(dst, src);
7876    shll(dst, 24);
7877    sarl(dst, 24);
7878  }
7879  return off;
7880}
7881
7882// Note: load_signed_short used to be called load_signed_word.
7883// Although the 'w' in x86 opcodes refers to the term "word" in the assembler
7884// manual, which means 16 bits, that usage is found nowhere in HotSpot code.
7885// The term "word" in HotSpot means a 32- or 64-bit machine word.
7886int MacroAssembler::load_signed_short(Register dst, Address src) {
7887  int off;
7888  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
7889    // This is dubious to me since it seems safe to do a signed 16 => 64 bit
7890    // version, but this is what 64bit has always done. This seems to imply
7891    // that users are only using 32 bits' worth.
7892    off = offset();
7893    movswl(dst, src); // movsxw
7894  } else {
7895    off = load_unsigned_short(dst, src);
7896    shll(dst, 16);
7897    sarl(dst, 16);
7898  }
7899  return off;
7900}
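
// The shll/sarl pair on the pre-P6 path is a manual sign extension; e.g.
// loading the 16-bit value 0x8000 gives dst = 0x00008000, shll(dst, 16)
// makes it 0x80000000, and the arithmetic sarl(dst, 16) drags the sign bit
// back down, leaving 0xFFFF8000 == -32768.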
7901
7902int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
7903  // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
7904  // and "3.9 Partial Register Penalties", p. 22.
7905  int off;
7906  if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) {
7907    off = offset();
7908    movzbl(dst, src); // movzxb
7909  } else {
7910    xorl(dst, dst);
7911    off = offset();
7912    movb(dst, src);
7913  }
7914  return off;
7915}
7916
7917// Note: load_unsigned_short used to be called load_unsigned_word.
7918int MacroAssembler::load_unsigned_short(Register dst, Address src) {
7919  // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
7920  // and "3.9 Partial Register Penalties", p. 22.
7921  int off;
7922  if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
7923    off = offset();
7924    movzwl(dst, src); // movzxw
7925  } else {
7926    xorl(dst, dst);
7927    off = offset();
7928    movw(dst, src);
7929  }
7930  return off;
7931}
7932
7933void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
7934  switch (size_in_bytes) {
7935#ifndef _LP64
7936  case  8:
7937    assert(dst2 != noreg, "second dest register required");
7938    movl(dst,  src);
7939    movl(dst2, src.plus_disp(BytesPerInt));
7940    break;
7941#else
7942  case  8:  movq(dst, src); break;
7943#endif
7944  case  4:  movl(dst, src); break;
7945  case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
7946  case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
7947  default:  ShouldNotReachHere();
7948  }
7949}
7950
7951void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
7952  switch (size_in_bytes) {
7953#ifndef _LP64
7954  case  8:
7955    assert(src2 != noreg, "second source register required");
7956    movl(dst,                        src);
7957    movl(dst.plus_disp(BytesPerInt), src2);
7958    break;
7959#else
7960  case  8:  movq(dst, src); break;
7961#endif
7962  case  4:  movl(dst, src); break;
7963  case  2:  movw(dst, src); break;
7964  case  1:  movb(dst, src); break;
7965  default:  ShouldNotReachHere();
7966  }
7967}
7968
7969void MacroAssembler::mov32(AddressLiteral dst, Register src) {
7970  if (reachable(dst)) {
7971    movl(as_Address(dst), src);
7972  } else {
7973    lea(rscratch1, dst);
7974    movl(Address(rscratch1, 0), src);
7975  }
7976}
7977
7978void MacroAssembler::mov32(Register dst, AddressLiteral src) {
7979  if (reachable(src)) {
7980    movl(dst, as_Address(src));
7981  } else {
7982    lea(rscratch1, src);
7983    movl(dst, Address(rscratch1, 0));
7984  }
7985}
7986
7987// C++ bool manipulation
7988
7989void MacroAssembler::movbool(Register dst, Address src) {
7990  if(sizeof(bool) == 1)
7991    movb(dst, src);
7992  else if(sizeof(bool) == 2)
7993    movw(dst, src);
7994  else if(sizeof(bool) == 4)
7995    movl(dst, src);
7996  else
7997    // unsupported
7998    ShouldNotReachHere();
7999}
8000
8001void MacroAssembler::movbool(Address dst, bool boolconst) {
8002  if(sizeof(bool) == 1)
8003    movb(dst, (int) boolconst);
8004  else if(sizeof(bool) == 2)
8005    movw(dst, (int) boolconst);
8006  else if(sizeof(bool) == 4)
8007    movl(dst, (int) boolconst);
8008  else
8009    // unsupported
8010    ShouldNotReachHere();
8011}
8012
8013void MacroAssembler::movbool(Address dst, Register src) {
8014  if(sizeof(bool) == 1)
8015    movb(dst, src);
8016  else if(sizeof(bool) == 2)
8017    movw(dst, src);
8018  else if(sizeof(bool) == 4)
8019    movl(dst, src);
8020  else
8021    // unsupported
8022    ShouldNotReachHere();
8023}
8024
8025void MacroAssembler::movbyte(ArrayAddress dst, int src) {
8026  movb(as_Address(dst), src);
8027}
8028
8029void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src) {
8030  if (reachable(src)) {
8031    movdl(dst, as_Address(src));
8032  } else {
8033    lea(rscratch1, src);
8034    movdl(dst, Address(rscratch1, 0));
8035  }
8036}
8037
8038void MacroAssembler::movq(XMMRegister dst, AddressLiteral src) {
8039  if (reachable(src)) {
8040    movq(dst, as_Address(src));
8041  } else {
8042    lea(rscratch1, src);
8043    movq(dst, Address(rscratch1, 0));
8044  }
8045}
8046
8047void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) {
8048  if (reachable(src)) {
8049    if (UseXmmLoadAndClearUpper) {
8050      movsd (dst, as_Address(src));
8051    } else {
8052      movlpd(dst, as_Address(src));
8053    }
8054  } else {
8055    lea(rscratch1, src);
8056    if (UseXmmLoadAndClearUpper) {
8057      movsd (dst, Address(rscratch1, 0));
8058    } else {
8059      movlpd(dst, Address(rscratch1, 0));
8060    }
8061  }
8062}
8063
8064void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src) {
8065  if (reachable(src)) {
8066    movss(dst, as_Address(src));
8067  } else {
8068    lea(rscratch1, src);
8069    movss(dst, Address(rscratch1, 0));
8070  }
8071}
8072
8073void MacroAssembler::movptr(Register dst, Register src) {
8074  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
8075}
8076
8077void MacroAssembler::movptr(Register dst, Address src) {
8078  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
8079}
8080
8081// src should NEVER be a real pointer. Use AddressLiteral for true pointers
8082void MacroAssembler::movptr(Register dst, intptr_t src) {
8083  LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
8084}
8085
8086void MacroAssembler::movptr(Address dst, Register src) {
8087  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
8088}
8089
8090void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
8091  if (reachable(src)) {
8092    Assembler::movsd(dst, as_Address(src));
8093  } else {
8094    lea(rscratch1, src);
8095    Assembler::movsd(dst, Address(rscratch1, 0));
8096  }
8097}
8098
8099void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
8100  if (reachable(src)) {
8101    Assembler::movss(dst, as_Address(src));
8102  } else {
8103    lea(rscratch1, src);
8104    Assembler::movss(dst, Address(rscratch1, 0));
8105  }
8106}
8107
8108void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) {
8109  if (reachable(src)) {
8110    Assembler::mulsd(dst, as_Address(src));
8111  } else {
8112    lea(rscratch1, src);
8113    Assembler::mulsd(dst, Address(rscratch1, 0));
8114  }
8115}
8116
8117void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src) {
8118  if (reachable(src)) {
8119    Assembler::mulss(dst, as_Address(src));
8120  } else {
8121    lea(rscratch1, src);
8122    Assembler::mulss(dst, Address(rscratch1, 0));
8123  }
8124}
8125
8126void MacroAssembler::null_check(Register reg, int offset) {
8127  if (needs_explicit_null_check(offset)) {
8128    // provoke OS NULL exception if reg = NULL by
8129    // accessing M[reg] w/o changing any (non-CC) registers
8130    // NOTE: cmpl is plenty here to provoke a segv
8131    cmpptr(rax, Address(reg, 0));
8132    // Note: should probably use testl(rax, Address(reg, 0));
8133    //       may be shorter code (however, this version of
8134    //       testl needs to be implemented first)
8135  } else {
8136    // nothing to do, (later) access of M[reg + offset]
8137    // will provoke OS NULL exception if reg = NULL
8138  }
8139}
8140
8141void MacroAssembler::os_breakpoint() {
8142  // instead of directly emitting a breakpoint, call os:breakpoint for better debugability
8143  // (e.g., MSVC can't call ps() otherwise)
8144  call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
8145}
8146
8147void MacroAssembler::pop_CPU_state() {
8148  pop_FPU_state();
8149  pop_IU_state();
8150}
8151
8152void MacroAssembler::pop_FPU_state() {
8153  NOT_LP64(frstor(Address(rsp, 0));)
8154  LP64_ONLY(fxrstor(Address(rsp, 0));)
8155  addptr(rsp, FPUStateSizeInWords * wordSize);
8156}
8157
8158void MacroAssembler::pop_IU_state() {
8159  popa();
8160  LP64_ONLY(addq(rsp, 8));
8161  popf();
8162}
8163
8164// Save Integer and Float state
8165// Warning: Stack must be 16 byte aligned (64bit)
8166void MacroAssembler::push_CPU_state() {
8167  push_IU_state();
8168  push_FPU_state();
8169}
8170
8171void MacroAssembler::push_FPU_state() {
8172  subptr(rsp, FPUStateSizeInWords * wordSize);
8173#ifndef _LP64
8174  fnsave(Address(rsp, 0));
8175  fwait();
8176#else
8177  fxsave(Address(rsp, 0));
8178#endif // LP64
8179}
8180
8181void MacroAssembler::push_IU_state() {
8182  // Push flags first because pusha kills them
8183  pushf();
8184  // Make sure rsp stays 16-byte aligned
8185  LP64_ONLY(subq(rsp, 8));
8186  pusha();
8187}
8188
8189void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp, bool clear_pc) {
8190  // determine java_thread register
8191  if (!java_thread->is_valid()) {
8192    java_thread = rdi;
8193    get_thread(java_thread);
8194  }
8195  // we must set sp to zero to clear frame
8196  movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
8197  if (clear_fp) {
8198    movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
8199  }
8200
8201  if (clear_pc)
8202    movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
8203
8204}
8205
8206void MacroAssembler::restore_rax(Register tmp) {
8207  if (tmp == noreg) pop(rax);
8208  else if (tmp != rax) mov(rax, tmp);
8209}
8210
8211void MacroAssembler::round_to(Register reg, int modulus) {
8212  addptr(reg, modulus - 1);
8213  andptr(reg, -modulus);
8214}
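
// Worked example: round_to(reg, 8) with reg = 13 computes 13 + 7 = 20 and
// then 20 & -8 = 16, the smallest multiple of 8 that is >= 13. The mask
// trick assumes modulus is a power of two.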
8215
8216void MacroAssembler::save_rax(Register tmp) {
8217  if (tmp == noreg) push(rax);
8218  else if (tmp != rax) mov(tmp, rax);
8219}
8220
8221// Write serialization page so VM thread can do a pseudo remote membar.
8222// We use the current thread pointer to calculate a thread specific
8223// offset to write to within the page. This minimizes bus traffic
8224// due to cache line collision.
8225void MacroAssembler::serialize_memory(Register thread, Register tmp) {
8226  movl(tmp, thread);
8227  shrl(tmp, os::get_serialize_page_shift_count());
8228  andl(tmp, (os::vm_page_size() - sizeof(int)));
8229
8230  Address index(noreg, tmp, Address::times_1);
8231  ExternalAddress page(os::get_memory_serialize_page());
8232
8233  // Size of store must match masking code above
8234  movl(as_Address(ArrayAddress(page, index)), tmp);
8235}
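
// A sketch of the offset computation above, with made-up numbers: for a
// thread pointer ending in ...0x5a40, a shift count of 4 and a 4K page,
// tmp = (0x5a40 >> 4) & (4096 - sizeof(int)) = 0x5a4 & 0xffc = 0x5a4,
// so distinct threads tend to hit distinct int-sized slots, and thus
// distinct cache lines, within the serialization page.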
8236
8237// Calls to C land
8238//
8239// When entering C land, the rbp and rsp of the last Java frame have to be recorded
8240// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
8241// has to be reset to 0. This is required to allow proper stack traversal.
8242void MacroAssembler::set_last_Java_frame(Register java_thread,
8243                                         Register last_java_sp,
8244                                         Register last_java_fp,
8245                                         address  last_java_pc) {
8246  // determine java_thread register
8247  if (!java_thread->is_valid()) {
8248    java_thread = rdi;
8249    get_thread(java_thread);
8250  }
8251  // determine last_java_sp register
8252  if (!last_java_sp->is_valid()) {
8253    last_java_sp = rsp;
8254  }
8255
8256  // last_java_fp is optional
8257
8258  if (last_java_fp->is_valid()) {
8259    movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
8260  }
8261
8262  // last_java_pc is optional
8263
8264  if (last_java_pc != NULL) {
8265    lea(Address(java_thread,
8266                 JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()),
8267        InternalAddress(last_java_pc));
8268
8269  }
8270  movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
8271}
8272
8273void MacroAssembler::shlptr(Register dst, int imm8) {
8274  LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8));
8275}
8276
8277void MacroAssembler::shrptr(Register dst, int imm8) {
8278  LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8));
8279}
8280
8281void MacroAssembler::sign_extend_byte(Register reg) {
8282  if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) {
8283    movsbl(reg, reg); // movsxb
8284  } else {
8285    shll(reg, 24);
8286    sarl(reg, 24);
8287  }
8288}
8289
8290void MacroAssembler::sign_extend_short(Register reg) {
8291  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
8292    movswl(reg, reg); // movsxw
8293  } else {
8294    shll(reg, 16);
8295    sarl(reg, 16);
8296  }
8297}
8298
8299void MacroAssembler::testl(Register dst, AddressLiteral src) {
8300  assert(reachable(src), "Address should be reachable");
8301  testl(dst, as_Address(src));
8302}
8303
8304void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
8305  if (reachable(src)) {
8306    Assembler::sqrtsd(dst, as_Address(src));
8307  } else {
8308    lea(rscratch1, src);
8309    Assembler::sqrtsd(dst, Address(rscratch1, 0));
8310  }
8311}
8312
8313void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
8314  if (reachable(src)) {
8315    Assembler::sqrtss(dst, as_Address(src));
8316  } else {
8317    lea(rscratch1, src);
8318    Assembler::sqrtss(dst, Address(rscratch1, 0));
8319  }
8320}
8321
8322void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src) {
8323  if (reachable(src)) {
8324    Assembler::subsd(dst, as_Address(src));
8325  } else {
8326    lea(rscratch1, src);
8327    Assembler::subsd(dst, Address(rscratch1, 0));
8328  }
8329}
8330
8331void MacroAssembler::subss(XMMRegister dst, AddressLiteral src) {
8332  if (reachable(src)) {
8333    Assembler::subss(dst, as_Address(src));
8334  } else {
8335    lea(rscratch1, src);
8336    Assembler::subss(dst, Address(rscratch1, 0));
8337  }
8338}
8339
8340void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
8341  if (reachable(src)) {
8342    Assembler::ucomisd(dst, as_Address(src));
8343  } else {
8344    lea(rscratch1, src);
8345    Assembler::ucomisd(dst, Address(rscratch1, 0));
8346  }
8347}
8348
8349void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
8350  if (reachable(src)) {
8351    Assembler::ucomiss(dst, as_Address(src));
8352  } else {
8353    lea(rscratch1, src);
8354    Assembler::ucomiss(dst, Address(rscratch1, 0));
8355  }
8356}
8357
8358void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src) {
8359  // Used in sign-bit flipping with aligned address.
8360  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
8361  if (reachable(src)) {
8362    Assembler::xorpd(dst, as_Address(src));
8363  } else {
8364    lea(rscratch1, src);
8365    Assembler::xorpd(dst, Address(rscratch1, 0));
8366  }
8367}
8368
8369void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src) {
8370  // Used in sign-bit flipping with aligned address.
8371  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
8372  if (reachable(src)) {
8373    Assembler::xorps(dst, as_Address(src));
8374  } else {
8375    lea(rscratch1, src);
8376    Assembler::xorps(dst, Address(rscratch1, 0));
8377  }
8378}
8379
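// All the AddressLiteral wrappers above share one idiom: use the literal
// directly when it is RIP-reachable, otherwise materialize the full address
// in rscratch1 first. A minimal sketch of the pattern for a hypothetical
// new wrapper (the instruction name is illustrative, not part of this file):
//
//   void MacroAssembler::minsd(XMMRegister dst, AddressLiteral src) {
//     if (reachable(src)) {
//       Assembler::minsd(dst, as_Address(src));        // direct operand
//     } else {
//       lea(rscratch1, src);                           // load the address
//       Assembler::minsd(dst, Address(rscratch1, 0));  // indirect operand
//     }
//   }
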
8380// AVX 3-operands instructions
8381
8382void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
8383  if (reachable(src)) {
8384    vaddsd(dst, nds, as_Address(src));
8385  } else {
8386    lea(rscratch1, src);
8387    vaddsd(dst, nds, Address(rscratch1, 0));
8388  }
8389}
8390
8391void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
8392  if (reachable(src)) {
8393    vaddss(dst, nds, as_Address(src));
8394  } else {
8395    lea(rscratch1, src);
8396    vaddss(dst, nds, Address(rscratch1, 0));
8397  }
8398}
8399
8400void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
8401  if (reachable(src)) {
8402    vandpd(dst, nds, as_Address(src), vector256);
8403  } else {
8404    lea(rscratch1, src);
8405    vandpd(dst, nds, Address(rscratch1, 0), vector256);
8406  }
8407}
8408
8409void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
8410  if (reachable(src)) {
8411    vandps(dst, nds, as_Address(src), vector256);
8412  } else {
8413    lea(rscratch1, src);
8414    vandps(dst, nds, Address(rscratch1, 0), vector256);
8415  }
8416}
8417
8418void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
8419  if (reachable(src)) {
8420    vdivsd(dst, nds, as_Address(src));
8421  } else {
8422    lea(rscratch1, src);
8423    vdivsd(dst, nds, Address(rscratch1, 0));
8424  }
8425}
8426
8427void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
8428  if (reachable(src)) {
8429    vdivss(dst, nds, as_Address(src));
8430  } else {
8431    lea(rscratch1, src);
8432    vdivss(dst, nds, Address(rscratch1, 0));
8433  }
8434}
8435
8436void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
8437  if (reachable(src)) {
8438    vmulsd(dst, nds, as_Address(src));
8439  } else {
8440    lea(rscratch1, src);
8441    vmulsd(dst, nds, Address(rscratch1, 0));
8442  }
8443}
8444
8445void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
8446  if (reachable(src)) {
8447    vmulss(dst, nds, as_Address(src));
8448  } else {
8449    lea(rscratch1, src);
8450    vmulss(dst, nds, Address(rscratch1, 0));
8451  }
8452}
8453
8454void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
8455  if (reachable(src)) {
8456    vsubsd(dst, nds, as_Address(src));
8457  } else {
8458    lea(rscratch1, src);
8459    vsubsd(dst, nds, Address(rscratch1, 0));
8460  }
8461}
8462
8463void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
8464  if (reachable(src)) {
8465    vsubss(dst, nds, as_Address(src));
8466  } else {
8467    lea(rscratch1, src);
8468    vsubss(dst, nds, Address(rscratch1, 0));
8469  }
8470}
8471
8472void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
8473  if (reachable(src)) {
8474    vxorpd(dst, nds, as_Address(src), vector256);
8475  } else {
8476    lea(rscratch1, src);
8477    vxorpd(dst, nds, Address(rscratch1, 0), vector256);
8478  }
8479}
8480
8481void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
8482  if (reachable(src)) {
8483    vxorps(dst, nds, as_Address(src), vector256);
8484  } else {
8485    lea(rscratch1, src);
8486    vxorps(dst, nds, Address(rscratch1, 0), vector256);
8487  }
8488}
8489
8490
8491//////////////////////////////////////////////////////////////////////////////////
8492#ifndef SERIALGC
8493
8494void MacroAssembler::g1_write_barrier_pre(Register obj,
8495                                          Register pre_val,
8496                                          Register thread,
8497                                          Register tmp,
8498                                          bool tosca_live,
8499                                          bool expand_call) {
8500
8501  // If expand_call is true then we expand the call_VM_leaf macro
8502  // directly, skipping the _last_sp check generated by
8503  // InterpreterMacroAssembler::call_VM_leaf_base.
8504
8505#ifdef _LP64
8506  assert(thread == r15_thread, "must be");
8507#endif // _LP64
8508
8509  Label done;
8510  Label runtime;
8511
8512  assert(pre_val != noreg, "check this code");
8513
8514  if (obj != noreg) {
8515    assert_different_registers(obj, pre_val, tmp);
8516    assert(pre_val != rax, "check this code");
8517  }
8518
8519  Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
8520                                       PtrQueue::byte_offset_of_active()));
8521  Address index(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
8522                                       PtrQueue::byte_offset_of_index()));
8523  Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
8524                                       PtrQueue::byte_offset_of_buf()));
8525
8526
8527  // Is marking active?
8528  if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
8529    cmpl(in_progress, 0);
8530  } else {
8531    assert(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
8532    cmpb(in_progress, 0);
8533  }
8534  jcc(Assembler::equal, done);
8535
8536  // Do we need to load the previous value?
8537  if (obj != noreg) {
8538    load_heap_oop(pre_val, Address(obj, 0));
8539  }
8540
8541  // Is the previous value null?
8542  cmpptr(pre_val, (int32_t) NULL_WORD);
8543  jcc(Assembler::equal, done);
8544
8545  // Can we store original value in the thread's buffer?
8546  // Is index == 0?
8547  // (The index field is typed as size_t.)
8548
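  // In C-like pseudocode, the enqueue fast path below is roughly (field names
  // follow PtrQueue; the rest is illustrative):
  //   if (index == 0) goto runtime;    // buffer full; index counts down
  //   index -= wordSize;
  //   *(buf + index) = pre_val;        // record the previous value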
8549  movptr(tmp, index);                   // tmp := *index_adr
8550  cmpptr(tmp, 0);                       // tmp == 0?
8551  jcc(Assembler::equal, runtime);       // If yes, goto runtime
8552
8553  subptr(tmp, wordSize);                // tmp := tmp - wordSize
8554  movptr(index, tmp);                   // *index_adr := tmp
8555  addptr(tmp, buffer);                  // tmp := tmp + *buffer_adr
8556
8557  // Record the previous value
8558  movptr(Address(tmp, 0), pre_val);
8559  jmp(done);
8560
8561  bind(runtime);
8562  // save the live input values
8563  if(tosca_live) push(rax);
8564
8565  if (obj != noreg && obj != rax)
8566    push(obj);
8567
8568  if (pre_val != rax)
8569    push(pre_val);
8570
8571  // Calling the runtime using the regular call_VM_leaf mechanism generates
8572  // code (generated by InterpreterMacroAssembler::call_VM_leaf_base)
8573  // that checks that *(ebp+frame::interpreter_frame_last_sp) == NULL.
8574  //
8575  // If we are generating the pre-barrier without a frame (e.g. in the
8576  // intrinsified Reference.get() routine) then ebp might be pointing to
8577  // the caller frame and so this check will most likely fail at runtime.
8578  //
8579  // Expanding the call directly bypasses the generation of the check.
8580  // So when we do not have a full interpreter frame on the stack,
8581  // expand_call should be passed true.
8582
8583  NOT_LP64( push(thread); )
8584
8585  if (expand_call) {
8586    LP64_ONLY( assert(pre_val != c_rarg1, "smashed arg"); )
8587    pass_arg1(this, thread);
8588    pass_arg0(this, pre_val);
8589    MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), 2);
8590  } else {
8591    call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), pre_val, thread);
8592  }
8593
8594  NOT_LP64( pop(thread); )
8595
8596  // restore the live input values
8597  if (pre_val != rax)
8598    pop(pre_val);
8599
8600  if (obj != noreg && obj != rax)
8601    pop(obj);
8602
8603  if(tosca_live) pop(rax);
8604
8605  bind(done);
8606}
8607
8608void MacroAssembler::g1_write_barrier_post(Register store_addr,
8609                                           Register new_val,
8610                                           Register thread,
8611                                           Register tmp,
8612                                           Register tmp2) {
8613#ifdef _LP64
8614  assert(thread == r15_thread, "must be");
8615#endif // _LP64
8616
8617  Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
8618                                       PtrQueue::byte_offset_of_index()));
8619  Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
8620                                       PtrQueue::byte_offset_of_buf()));
8621
8622  BarrierSet* bs = Universe::heap()->barrier_set();
8623  CardTableModRefBS* ct = (CardTableModRefBS*)bs;
8624  Label done;
8625  Label runtime;
8626
8627  // Does store cross heap regions?
8628
8629  movptr(tmp, store_addr);
8630  xorptr(tmp, new_val);
8631  shrptr(tmp, HeapRegion::LogOfHRGrainBytes);
8632  jcc(Assembler::equal, done);
8633
8634  // crosses regions, storing NULL?
8635
8636  cmpptr(new_val, (int32_t) NULL_WORD);
8637  jcc(Assembler::equal, done);
8638
8639  // Storing a region-crossing, non-NULL oop; is the card already dirty?
8640
8641  ExternalAddress cardtable((address) ct->byte_map_base);
8642  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
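  // The card for an address is computed as in CardTableModRefBS:
  //   jbyte* card_addr = ct->byte_map_base + (store_addr >> card_shift);
  // (byte_map_base already folds in the heap's low bound, see store_check_part_2.)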
8643#ifdef _LP64
8644  const Register card_addr = tmp;
8645
8646  movq(card_addr, store_addr);
8647  shrq(card_addr, CardTableModRefBS::card_shift);
8648
8649  lea(tmp2, cardtable);
8650
8651  // get the address of the card
8652  addq(card_addr, tmp2);
8653#else
8654  const Register card_index = tmp;
8655
8656  movl(card_index, store_addr);
8657  shrl(card_index, CardTableModRefBS::card_shift);
8658
8659  Address index(noreg, card_index, Address::times_1);
8660  const Register card_addr = tmp;
8661  lea(card_addr, as_Address(ArrayAddress(cardtable, index)));
8662#endif
8663  cmpb(Address(card_addr, 0), 0);
8664  jcc(Assembler::equal, done);
8665
8666  // Storing a region-crossing, non-NULL oop and the card is clean:
8667  // dirty the card and log it.
8668
8669  movb(Address(card_addr, 0), 0);
8670
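  // Enqueue the card address, using the same count-down index scheme as the
  // SATB queue in g1_write_barrier_pre above.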
8671  cmpl(queue_index, 0);
8672  jcc(Assembler::equal, runtime);
8673  subl(queue_index, wordSize);
8674  movptr(tmp2, buffer);
8675#ifdef _LP64
8676  movslq(rscratch1, queue_index);
8677  addq(tmp2, rscratch1);
8678  movq(Address(tmp2, 0), card_addr);
8679#else
8680  addl(tmp2, queue_index);
8681  movl(Address(tmp2, 0), card_index);
8682#endif
8683  jmp(done);
8684
8685  bind(runtime);
8686  // save the live input values
8687  push(store_addr);
8688  push(new_val);
8689#ifdef _LP64
8690  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, r15_thread);
8691#else
8692  push(thread);
8693  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread);
8694  pop(thread);
8695#endif
8696  pop(new_val);
8697  pop(store_addr);
8698
8699  bind(done);
8700}
8701
8702#endif // SERIALGC
8703//////////////////////////////////////////////////////////////////////////////////
8704
8705
8706void MacroAssembler::store_check(Register obj) {
8707  // Does a store check for the oop in register obj. The content of
8708  // register obj is destroyed afterwards.
8709  store_check_part_1(obj);
8710  store_check_part_2(obj);
8711}
8712
8713void MacroAssembler::store_check(Register obj, Address dst) {
8714  store_check(obj);
8715}
8716
8717
8718// Split the store check operation so that other instructions can be scheduled in between.
8719void MacroAssembler::store_check_part_1(Register obj) {
8720  BarrierSet* bs = Universe::heap()->barrier_set();
8721  assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
8722  shrptr(obj, CardTableModRefBS::card_shift);
8723}
8724
8725void MacroAssembler::store_check_part_2(Register obj) {
8726  BarrierSet* bs = Universe::heap()->barrier_set();
8727  assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
8728  CardTableModRefBS* ct = (CardTableModRefBS*)bs;
8729  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
8730
8731  // The calculation for byte_map_base is as follows:
8732  // byte_map_base = _byte_map - (uintptr_t(low_bound) >> card_shift);
8733  // So this essentially converts an address to a displacement and
8734  // it will never need to be relocated. On 64-bit, however, the value may be
8735  // too large for a 32-bit displacement.
8736
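  // When disp fits in 32 bits, the card mark below is a single instruction,
  // roughly:  movb byte ptr [disp + obj], 0   (obj holds addr >> card_shift)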
8737  intptr_t disp = (intptr_t) ct->byte_map_base;
8738  if (is_simm32(disp)) {
8739    Address cardtable(noreg, obj, Address::times_1, disp);
8740    movb(cardtable, 0);
8741  } else {
8742    // By doing it as an ExternalAddress, disp could be converted to a rip-relative
8743    // displacement and done in a single instruction given favorable mapping and
8744    // a smarter version of as_Address. Worst case it is two instructions, which
8745    // is no worse than loading disp into a register and using a simple
8746    // Address() as above.
8747    // We can't use ExternalAddress as the only style since if disp == 0 we'll
8748    // assert, since NULL isn't acceptable in a relocInfo (see 6644928). Either
8749    // way, in some cases we'll get a single-instruction version.
8750
8751    ExternalAddress cardtable((address)disp);
8752    Address index(noreg, obj, Address::times_1);
8753    movb(as_Address(ArrayAddress(cardtable, index)), 0);
8754  }
8755}
8756
8757void MacroAssembler::subptr(Register dst, int32_t imm32) {
8758  LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
8759}
8760
8761// Force generation of a 4 byte immediate value even if it fits into 8bit
8762void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
8763  LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32));
8764}
8765
8766void MacroAssembler::subptr(Register dst, Register src) {
8767  LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
8768}
8769
8770// C++ bool manipulation
8771void MacroAssembler::testbool(Register dst) {
8772  if (sizeof(bool) == 1)
8773    testb(dst, 0xff);
8774  else if (sizeof(bool) == 2) {
8775    // testw implementation needed for two-byte bools
8776    ShouldNotReachHere();
8777  } else if (sizeof(bool) == 4)
8778    testl(dst, dst);
8779  else
8780    // unsupported
8781    ShouldNotReachHere();
8782}
8783
8784void MacroAssembler::testptr(Register dst, Register src) {
8785  LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
8786}
8787
8788// Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
8789void MacroAssembler::tlab_allocate(Register obj,
8790                                   Register var_size_in_bytes,
8791                                   int con_size_in_bytes,
8792                                   Register t1,
8793                                   Register t2,
8794                                   Label& slow_case) {
8795  assert_different_registers(obj, t1, t2);
8796  assert_different_registers(obj, var_size_in_bytes, t1);
8797  Register end = t2;
8798  Register thread = NOT_LP64(t1) LP64_ONLY(r15_thread);
8799
8800  verify_tlab();
8801
8802  NOT_LP64(get_thread(thread));
8803
8804  movptr(obj, Address(thread, JavaThread::tlab_top_offset()));
8805  if (var_size_in_bytes == noreg) {
8806    lea(end, Address(obj, con_size_in_bytes));
8807  } else {
8808    lea(end, Address(obj, var_size_in_bytes, Address::times_1));
8809  }
8810  cmpptr(end, Address(thread, JavaThread::tlab_end_offset()));
8811  jcc(Assembler::above, slow_case);
8812
8813  // update the tlab top pointer
8814  movptr(Address(thread, JavaThread::tlab_top_offset()), end);
8815
8816  // recover var_size_in_bytes if necessary
8817  if (var_size_in_bytes == end) {
8818    subptr(var_size_in_bytes, obj);
8819  }
8820  verify_tlab();
8821}
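
// A typical call site (illustrative only; the register choices and the
// instance_size_in_bytes constant are hypothetical) allocates a fixed-size
// object and branches to a shared slow path on overflow:
//
//   Label slow_case, done;
//   tlab_allocate(rax, noreg, instance_size_in_bytes, rcx, rdx, slow_case);
//   // ... initialize the new object in rax, then jmp(done) ...
//   bind(slow_case);   // fall back to eden_allocate or the runtime
//   bind(done);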
8822
8823// Preserves rbx and rdx.
8824Register MacroAssembler::tlab_refill(Label& retry,
8825                                     Label& try_eden,
8826                                     Label& slow_case) {
8827  Register top = rax;
8828  Register t1  = rcx;
8829  Register t2  = rsi;
8830  Register thread_reg = NOT_LP64(rdi) LP64_ONLY(r15_thread);
8831  assert_different_registers(top, thread_reg, t1, t2, /* preserve: */ rbx, rdx);
8832  Label do_refill, discard_tlab;
8833
8834  if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) {
8835    // No allocation in the shared eden.
8836    jmp(slow_case);
8837  }
8838
8839  NOT_LP64(get_thread(thread_reg));
8840
8841  movptr(top, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
8842  movptr(t1,  Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
8843
8844  // calculate amount of free space
8845  subptr(t1, top);
8846  shrptr(t1, LogHeapWordSize);
8847
8848  // Retain tlab and allocate object in shared space if
8849  // the amount free in the tlab is too large to discard.
8850  cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
8851  jcc(Assembler::lessEqual, discard_tlab);
8852
8853  // Retain
8854  // %%% yuck as movptr...
8855  movptr(t2, (int32_t) ThreadLocalAllocBuffer::refill_waste_limit_increment());
8856  addptr(Address(thread_reg, in_bytes(JavaThread::tlab_refill_waste_limit_offset())), t2);
8857  if (TLABStats) {
8858    // increment number of slow_allocations
8859    addl(Address(thread_reg, in_bytes(JavaThread::tlab_slow_allocations_offset())), 1);
8860  }
8861  jmp(try_eden);
8862
8863  bind(discard_tlab);
8864  if (TLABStats) {
8865    // increment number of refills
8866    addl(Address(thread_reg, in_bytes(JavaThread::tlab_number_of_refills_offset())), 1);
8867    // accumulate wastage -- t1 is amount free in tlab
8868    addl(Address(thread_reg, in_bytes(JavaThread::tlab_fast_refill_waste_offset())), t1);
8869  }
8870
8871  // if tlab is currently allocated (top or end != null) then
8872  // fill [top, end + alignment_reserve) with array object
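  // Formatting the leftover space as a dummy int[] (mark word, length scaled
  // to jints, then klass) keeps the heap parsable for concurrent GC walkers.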
8873  testptr(top, top);
8874  jcc(Assembler::zero, do_refill);
8875
8876  // set up the mark word
8877  movptr(Address(top, oopDesc::mark_offset_in_bytes()), (intptr_t)markOopDesc::prototype()->copy_set_hash(0x2));
8878  // set the length to the remaining space
8879  subptr(t1, typeArrayOopDesc::header_size(T_INT));
8880  addptr(t1, (int32_t)ThreadLocalAllocBuffer::alignment_reserve());
8881  shlptr(t1, log2_intptr(HeapWordSize/sizeof(jint)));
8882  movl(Address(top, arrayOopDesc::length_offset_in_bytes()), t1);
8883  // set klass to intArrayKlass
8884    // dubious reloc: why not an oop reloc?
8885    movptr(t1, ExternalAddress((address)Universe::intArrayKlassObj_addr()));
8886    // Store klass last: concurrent GCs assume the length is valid if
8887    // the klass field is not null.
8888  store_klass(top, t1);
8889
8890  movptr(t1, top);
8891  subptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
8892  incr_allocated_bytes(thread_reg, t1, 0);
8893
8894  // refill the tlab with an eden allocation
8895  bind(do_refill);
8896  movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_size_offset())));
8897  shlptr(t1, LogHeapWordSize);
8898  // allocate new tlab, address returned in top
8899  eden_allocate(top, t1, 0, t2, slow_case);
8900
8901  // Check that t1 was preserved in eden_allocate.
8902#ifdef ASSERT
8903  if (UseTLAB) {
8904    Label ok;
8905    Register tsize = rsi;
8906    assert_different_registers(tsize, thread_reg, t1);
8907    push(tsize);
8908    movptr(tsize, Address(thread_reg, in_bytes(JavaThread::tlab_size_offset())));
8909    shlptr(tsize, LogHeapWordSize);
8910    cmpptr(t1, tsize);
8911    jcc(Assembler::equal, ok);
8912    STOP("assert(t1 != tlab size)");
8913    should_not_reach_here();
8914
8915    bind(ok);
8916    pop(tsize);
8917  }
8918#endif
8919  movptr(Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())), top);
8920  movptr(Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())), top);
8921  addptr(top, t1);
8922  subptr(top, (int32_t)ThreadLocalAllocBuffer::alignment_reserve_in_bytes());
8923  movptr(Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())), top);
8924  verify_tlab();
8925  jmp(retry);
8926
8927  return thread_reg; // for use by caller
8928}
8929
8930void MacroAssembler::incr_allocated_bytes(Register thread,
8931                                          Register var_size_in_bytes,
8932                                          int con_size_in_bytes,
8933                                          Register t1) {
8934  if (!thread->is_valid()) {
8935#ifdef _LP64
8936    thread = r15_thread;
8937#else
8938    assert(t1->is_valid(), "need temp reg");
8939    thread = t1;
8940    get_thread(thread);
8941#endif
8942  }
8943
8944#ifdef _LP64
8945  if (var_size_in_bytes->is_valid()) {
8946    addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
8947  } else {
8948    addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
8949  }
8950#else
8951  if (var_size_in_bytes->is_valid()) {
8952    addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
8953  } else {
8954    addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
8955  }
8956  adcl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())+4), 0);
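  // allocated_bytes is a jlong even on 32-bit VMs: the addl above updates the
  // low word and the adcl propagates the carry into the high word.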
8957#endif
8958}
8959
8960void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int num_fpu_regs_in_use) {
8961  pusha();
8962
8963  // if we are coming from c1, xmm registers may be live
8964  int off = 0;
8965  if (UseSSE == 1)  {
8966    subptr(rsp, sizeof(jdouble)*8);
8967    movflt(Address(rsp,off++*sizeof(jdouble)),xmm0);
8968    movflt(Address(rsp,off++*sizeof(jdouble)),xmm1);
8969    movflt(Address(rsp,off++*sizeof(jdouble)),xmm2);
8970    movflt(Address(rsp,off++*sizeof(jdouble)),xmm3);
8971    movflt(Address(rsp,off++*sizeof(jdouble)),xmm4);
8972    movflt(Address(rsp,off++*sizeof(jdouble)),xmm5);
8973    movflt(Address(rsp,off++*sizeof(jdouble)),xmm6);
8974    movflt(Address(rsp,off++*sizeof(jdouble)),xmm7);
8975  } else if (UseSSE >= 2)  {
8976#ifdef COMPILER2
8977    if (MaxVectorSize > 16) {
8978      assert(UseAVX > 0, "256bit vectors are supported only with AVX");
8979      // Save upper half of the YMM registers
8980      subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
8981      vextractf128h(Address(rsp,  0),xmm0);
8982      vextractf128h(Address(rsp, 16),xmm1);
8983      vextractf128h(Address(rsp, 32),xmm2);
8984      vextractf128h(Address(rsp, 48),xmm3);
8985      vextractf128h(Address(rsp, 64),xmm4);
8986      vextractf128h(Address(rsp, 80),xmm5);
8987      vextractf128h(Address(rsp, 96),xmm6);
8988      vextractf128h(Address(rsp,112),xmm7);
8989#ifdef _LP64
8990      vextractf128h(Address(rsp,128),xmm8);
8991      vextractf128h(Address(rsp,144),xmm9);
8992      vextractf128h(Address(rsp,160),xmm10);
8993      vextractf128h(Address(rsp,176),xmm11);
8994      vextractf128h(Address(rsp,192),xmm12);
8995      vextractf128h(Address(rsp,208),xmm13);
8996      vextractf128h(Address(rsp,224),xmm14);
8997      vextractf128h(Address(rsp,240),xmm15);
8998#endif
8999    }
9000#endif
9001    // Save whole 128-bit (16-byte) XMM registers
9002    subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
9003    movdqu(Address(rsp,off++*16),xmm0);
9004    movdqu(Address(rsp,off++*16),xmm1);
9005    movdqu(Address(rsp,off++*16),xmm2);
9006    movdqu(Address(rsp,off++*16),xmm3);
9007    movdqu(Address(rsp,off++*16),xmm4);
9008    movdqu(Address(rsp,off++*16),xmm5);
9009    movdqu(Address(rsp,off++*16),xmm6);
9010    movdqu(Address(rsp,off++*16),xmm7);
9011#ifdef _LP64
9012    movdqu(Address(rsp,off++*16),xmm8);
9013    movdqu(Address(rsp,off++*16),xmm9);
9014    movdqu(Address(rsp,off++*16),xmm10);
9015    movdqu(Address(rsp,off++*16),xmm11);
9016    movdqu(Address(rsp,off++*16),xmm12);
9017    movdqu(Address(rsp,off++*16),xmm13);
9018    movdqu(Address(rsp,off++*16),xmm14);
9019    movdqu(Address(rsp,off++*16),xmm15);
9020#endif
9021  }
9022
9023  // Preserve registers across runtime call
9024  int incoming_argument_and_return_value_offset = -1;
9025  if (num_fpu_regs_in_use > 1) {
9026    // Must preserve all other FPU regs (could alternatively convert
9027    // SharedRuntime::dsin, dcos etc. into assembly routines known not to trash
9028    // FPU state, but we cannot trust the C compiler)
9029    NEEDS_CLEANUP;
9030    // NOTE that in this case we also push the incoming argument(s) to
9031    // the stack and restore them later; we also use this stack slot to
9032    // hold the return value from dsin, dcos etc.
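    // After the save loop below, the stack holds num_fpu_regs_in_use jdoubles;
    // the original ST0 (the incoming argument) ends up in the highest slot,
    // at offset sizeof(jdouble)*(num_fpu_regs_in_use-1), which is exactly
    // incoming_argument_and_return_value_offset.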
9033    for (int i = 0; i < num_fpu_regs_in_use; i++) {
9034      subptr(rsp, sizeof(jdouble));
9035      fstp_d(Address(rsp, 0));
9036    }
9037    incoming_argument_and_return_value_offset = sizeof(jdouble)*(num_fpu_regs_in_use-1);
9038    for (int i = nb_args-1; i >= 0; i--) {
9039      fld_d(Address(rsp, incoming_argument_and_return_value_offset-i*sizeof(jdouble)));
9040    }
9041  }
9042
9043  subptr(rsp, nb_args*sizeof(jdouble));
9044  for (int i = 0; i < nb_args; i++) {
9045    fstp_d(Address(rsp, i*sizeof(jdouble)));
9046  }
9047
9048#ifdef _LP64
9049  if (nb_args > 0) {
9050    movdbl(xmm0, Address(rsp, 0));
9051  }
9052  if (nb_args > 1) {
9053    movdbl(xmm1, Address(rsp, sizeof(jdouble)));
9054  }
9055  assert(nb_args <= 2, "unsupported number of args");
9056#endif // _LP64
9057
9058  // NOTE: we must not use call_VM_leaf here because that requires a
9059  // complete interpreter frame in debug mode -- same bug as 4387334.
9060  // MacroAssembler::call_VM_leaf_base is perfectly safe and will
9061  // use the proper 64-bit ABI.
9062
9063  NEEDS_CLEANUP;
9064  // Need to add stack banging before this runtime call if it needs to
9065  // be taken; however, there is no generic stack banging routine at
9066  // the MacroAssembler level
9067
9068  MacroAssembler::call_VM_leaf_base(runtime_entry, 0);
9069
9070#ifdef _LP64
9071  movsd(Address(rsp, 0), xmm0);
9072  fld_d(Address(rsp, 0));
9073#endif // _LP64
9074  addptr(rsp, sizeof(jdouble) * nb_args);
9075  if (num_fpu_regs_in_use > 1) {
9076    // Must save return value to stack and then restore entire FPU
9077    // stack except incoming arguments
9078    fstp_d(Address(rsp, incoming_argument_and_return_value_offset));
9079    for (int i = 0; i < num_fpu_regs_in_use - nb_args; i++) {
9080      fld_d(Address(rsp, 0));
9081      addptr(rsp, sizeof(jdouble));
9082    }
9083    fld_d(Address(rsp, (nb_args-1)*sizeof(jdouble)));
9084    addptr(rsp, sizeof(jdouble) * nb_args);
9085  }
9086
9087  off = 0;
9088  if (UseSSE == 1)  {
9089    movflt(xmm0, Address(rsp,off++*sizeof(jdouble)));
9090    movflt(xmm1, Address(rsp,off++*sizeof(jdouble)));
9091    movflt(xmm2, Address(rsp,off++*sizeof(jdouble)));
9092    movflt(xmm3, Address(rsp,off++*sizeof(jdouble)));
9093    movflt(xmm4, Address(rsp,off++*sizeof(jdouble)));
9094    movflt(xmm5, Address(rsp,off++*sizeof(jdouble)));
9095    movflt(xmm6, Address(rsp,off++*sizeof(jdouble)));
9096    movflt(xmm7, Address(rsp,off++*sizeof(jdouble)));
9097    addptr(rsp, sizeof(jdouble)*8);
9098  } else if (UseSSE >= 2)  {
9099    // Restore whole 128-bit (16-byte) XMM registers
9100    movdqu(xmm0, Address(rsp,off++*16));
9101    movdqu(xmm1, Address(rsp,off++*16));
9102    movdqu(xmm2, Address(rsp,off++*16));
9103    movdqu(xmm3, Address(rsp,off++*16));
9104    movdqu(xmm4, Address(rsp,off++*16));
9105    movdqu(xmm5, Address(rsp,off++*16));
9106    movdqu(xmm6, Address(rsp,off++*16));
9107    movdqu(xmm7, Address(rsp,off++*16));
9108#ifdef _LP64
9109    movdqu(xmm8, Address(rsp,off++*16));
9110    movdqu(xmm9, Address(rsp,off++*16));
9111    movdqu(xmm10, Address(rsp,off++*16));
9112    movdqu(xmm11, Address(rsp,off++*16));
9113    movdqu(xmm12, Address(rsp,off++*16));
9114    movdqu(xmm13, Address(rsp,off++*16));
9115    movdqu(xmm14, Address(rsp,off++*16));
9116    movdqu(xmm15, Address(rsp,off++*16));
9117#endif
9118    addptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
9119#ifdef COMPILER2
9120    if (MaxVectorSize > 16) {
9121      // Restore upper half of the YMM registers.
9122      vinsertf128h(xmm0, Address(rsp,  0));
9123      vinsertf128h(xmm1, Address(rsp, 16));
9124      vinsertf128h(xmm2, Address(rsp, 32));
9125      vinsertf128h(xmm3, Address(rsp, 48));
9126      vinsertf128h(xmm4, Address(rsp, 64));
9127      vinsertf128h(xmm5, Address(rsp, 80));
9128      vinsertf128h(xmm6, Address(rsp, 96));
9129      vinsertf128h(xmm7, Address(rsp,112));
9130#ifdef _LP64
9131      vinsertf128h(xmm8, Address(rsp,128));
9132      vinsertf128h(xmm9, Address(rsp,144));
9133      vinsertf128h(xmm10, Address(rsp,160));
9134      vinsertf128h(xmm11, Address(rsp,176));
9135      vinsertf128h(xmm12, Address(rsp,192));
9136      vinsertf128h(xmm13, Address(rsp,208));
9137      vinsertf128h(xmm14, Address(rsp,224));
9138      vinsertf128h(xmm15, Address(rsp,240));
9139#endif
9140      addptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
9141    }
9142#endif
9143  }
9144  popa();
9145}
9146
9147static const double     pi_4 =  0.7853981633974483;
9148
9149void MacroAssembler::trigfunc(char trig, int num_fpu_regs_in_use) {
9150  // A hand-coded argument reduction for values in fabs(pi/4, pi/2)
9151  // was attempted in this code; unfortunately it appears that the
9152  // switch to 80-bit precision and back causes this to be
9153  // unprofitable compared with simply performing a runtime call if
9154  // the argument is out of the (-pi/4, pi/4) range.
9155
9156  Register tmp = noreg;
9157  if (!VM_Version::supports_cmov()) {
9158    // fcmp needs a temporary, so preserve rbx
9159    tmp = rbx;
9160    push(tmp);
9161  }
9162
9163  Label slow_case, done;
9164
9165  ExternalAddress pi4_adr = (address)&pi_4;
9166  if (reachable(pi4_adr)) {
9167    // x ?<= pi/4
9168    fld_d(pi4_adr);
9169    fld_s(1);                // Stack:  X  PI/4  X
9170    fabs();                  // Stack: |X| PI/4  X
9171    fcmp(tmp);
9172    jcc(Assembler::above, slow_case);
9173
9174    // fastest case: -pi/4 <= x <= pi/4
9175    switch(trig) {
9176    case 's':
9177      fsin();
9178      break;
9179    case 'c':
9180      fcos();
9181      break;
9182    case 't':
9183      ftan();
9184      break;
9185    default:
9186      assert(false, "bad intrinsic");
9187      break;
9188    }
9189    jmp(done);
9190  }
9191
9192  // slow case: runtime call
9193  bind(slow_case);
9194
9195  switch(trig) {
9196  case 's':
9197    {
9198      fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dsin), 1, num_fpu_regs_in_use);
9199    }
9200    break;
9201  case 'c':
9202    {
9203      fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dcos), 1, num_fpu_regs_in_use);
9204    }
9205    break;
9206  case 't':
9207    {
9208      fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dtan), 1, num_fpu_regs_in_use);
9209    }
9210    break;
9211  default:
9212    assert(false, "bad intrinsic");
9213    break;
9214  }
9215
9216  // Come here with result in F-TOS
9217  bind(done);
9218
9219  if (tmp != noreg) {
9220    pop(tmp);
9221  }
9222}
9223
9224
9225// Look up the method for a megamorphic invokeinterface call.
9226// The target method is determined by <intf_klass, itable_index>.
9227// The receiver klass is in recv_klass.
9228// On success, the result will be in method_result, and execution falls through.
9229// On failure, execution transfers to the given label.
9230void MacroAssembler::lookup_interface_method(Register recv_klass,
9231                                             Register intf_klass,
9232                                             RegisterOrConstant itable_index,
9233                                             Register method_result,
9234                                             Register scan_temp,
9235                                             Label& L_no_such_interface) {
9236  assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
9237  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
9238         "caller must use same register for non-constant itable index as for method");
9239
9240  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
9241  int vtable_base = InstanceKlass::vtable_start_offset() * wordSize;
9242  int itentry_off = itableMethodEntry::method_offset_in_bytes();
9243  int scan_step   = itableOffsetEntry::size() * wordSize;
9244  int vte_size    = vtableEntry::size() * wordSize;
9245  Address::ScaleFactor times_vte_scale = Address::times_ptr;
9246  assert(vte_size == wordSize, "else adjust times_vte_scale");
9247
9248  movl(scan_temp, Address(recv_klass, InstanceKlass::vtable_length_offset() * wordSize));
9249
9250  // %%% Could store the aligned, prescaled offset in the klassoop.
9251  lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
9252  if (HeapWordsPerLong > 1) {
9253    // Round up to align_object_offset boundary
9254    // see code for InstanceKlass::start_of_itable!
9255    round_to(scan_temp, BytesPerLong);
9256  }
9257
9258  // Adjust recv_klass by scaled itable_index, so we can free itable_index.
9259  assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
9260  lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
9261
9262  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
9263  //   if (scan->interface() == intf) {
9264  //     result = (klass + scan->offset() + itable_index);
9265  //   }
9266  // }
9267  Label search, found_method;
9268
9269  for (int peel = 1; peel >= 0; peel--) {
9270    movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
9271    cmpptr(intf_klass, method_result);
9272
9273    if (peel) {
9274      jccb(Assembler::equal, found_method);
9275    } else {
9276      jccb(Assembler::notEqual, search);
9277      // (invert the test to fall through to found_method...)
9278    }
9279
9280    if (!peel)  break;
9281
9282    bind(search);
9283
9284    // Check that the previous entry is non-null.  A null entry means that
9285    // the receiver class doesn't implement the interface, and wasn't the
9286    // same as when the caller was compiled.
9287    testptr(method_result, method_result);
9288    jcc(Assembler::zero, L_no_such_interface);
9289    addptr(scan_temp, scan_step);
9290  }
9291
9292  bind(found_method);
9293
9294  // Got a hit.
9295  movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
9296  movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
9297}
9298
9299
9300// virtual method calling
9301void MacroAssembler::lookup_virtual_method(Register recv_klass,
9302                                           RegisterOrConstant vtable_index,
9303                                           Register method_result) {
9304  const int base = InstanceKlass::vtable_start_offset() * wordSize;
9305  assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
9306  Address vtable_entry_addr(recv_klass,
9307                            vtable_index, Address::times_ptr,
9308                            base + vtableEntry::method_offset_in_bytes());
9309  movptr(method_result, vtable_entry_addr);
9310}
9311
9312
9313void MacroAssembler::check_klass_subtype(Register sub_klass,
9314                           Register super_klass,
9315                           Register temp_reg,
9316                           Label& L_success) {
9317  Label L_failure;
9318  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
9319  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
9320  bind(L_failure);
9321}
9322
9323
9324void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
9325                                                   Register super_klass,
9326                                                   Register temp_reg,
9327                                                   Label* L_success,
9328                                                   Label* L_failure,
9329                                                   Label* L_slow_path,
9330                                        RegisterOrConstant super_check_offset) {
9331  assert_different_registers(sub_klass, super_klass, temp_reg);
9332  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
9333  if (super_check_offset.is_register()) {
9334    assert_different_registers(sub_klass, super_klass,
9335                               super_check_offset.as_register());
9336  } else if (must_load_sco) {
9337    assert(temp_reg != noreg, "supply either a temp or a register offset");
9338  }
9339
9340  Label L_fallthrough;
9341  int label_nulls = 0;
9342  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
9343  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
9344  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
9345  assert(label_nulls <= 1, "at most one NULL in the batch");
9346
9347  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
9348  int sco_offset = in_bytes(Klass::super_check_offset_offset());
9349  Address super_check_offset_addr(super_klass, sco_offset);
9350
9351  // Hacked jcc, which "knows" that L_fallthrough, at least, is in
9352  // range of a jccb.  If this routine grows larger, reconsider at
9353  // least some of these.
9354#define local_jcc(assembler_cond, label)                                \
9355  if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);         \
9356  else                             jcc( assembler_cond, label) /*omit semi*/
9357
9358  // Hacked jmp, which may only be used just before L_fallthrough.
9359#define final_jmp(label)                                                \
9360  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
9361  else                            jmp(label)                /*omit semi*/
9362
9363  // If the pointers are equal, we are done (e.g., String[] elements).
9364  // This self-check enables sharing of secondary supertype arrays among
9365  // non-primary types such as array-of-interface.  Otherwise, each such
9366  // type would need its own customized SSA.
9367  // We move this check to the front of the fast path because many
9368  // type checks are in fact trivially successful in this manner,
9369  // so we get a nicely predicted branch right at the start of the check.
9370  cmpptr(sub_klass, super_klass);
9371  local_jcc(Assembler::equal, *L_success);
9372
9373  // Check the supertype display:
9374  if (must_load_sco) {
9375    // Positive movl does right thing on LP64.
9376    movl(temp_reg, super_check_offset_addr);
9377    super_check_offset = RegisterOrConstant(temp_reg);
9378  }
9379  Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
9380  cmpptr(super_klass, super_check_addr); // load displayed supertype
9381
9382  // This check has worked decisively for primary supers.
9383  // Secondary supers are sought in the super_cache ('super_cache_addr').
9384  // (Secondary supers are interfaces and very deeply nested subtypes.)
9385  // This works in the same check above because of a tricky aliasing
9386  // between the super_cache and the primary super display elements.
9387  // (The 'super_check_addr' can address either, as the case requires.)
9388  // Note that the cache is updated below if it does not help us find
9389  // what we need immediately.
9390  // So if it was a primary super, we can just fail immediately.
9391  // Otherwise, it's the slow path for us (no success at this point).
9392
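  // Summary of the dispatch below:
  //   *(sub_klass + offset) == super_klass      -> L_success
  //   mismatch and offset == sc_offset          -> L_slow_path (scan secondaries)
  //   mismatch and offset != sc_offset          -> L_failure (primary miss is final)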
9393  if (super_check_offset.is_register()) {
9394    local_jcc(Assembler::equal, *L_success);
9395    cmpl(super_check_offset.as_register(), sc_offset);
9396    if (L_failure == &L_fallthrough) {
9397      local_jcc(Assembler::equal, *L_slow_path);
9398    } else {
9399      local_jcc(Assembler::notEqual, *L_failure);
9400      final_jmp(*L_slow_path);
9401    }
9402  } else if (super_check_offset.as_constant() == sc_offset) {
9403    // Need a slow path; fast failure is impossible.
9404    if (L_slow_path == &L_fallthrough) {
9405      local_jcc(Assembler::equal, *L_success);
9406    } else {
9407      local_jcc(Assembler::notEqual, *L_slow_path);
9408      final_jmp(*L_success);
9409    }
9410  } else {
9411    // No slow path; it's a fast decision.
9412    if (L_failure == &L_fallthrough) {
9413      local_jcc(Assembler::equal, *L_success);
9414    } else {
9415      local_jcc(Assembler::notEqual, *L_failure);
9416      final_jmp(*L_success);
9417    }
9418  }
9419
9420  bind(L_fallthrough);
9421
9422#undef local_jcc
9423#undef final_jmp
9424}
9425
9426
9427void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
9428                                                   Register super_klass,
9429                                                   Register temp_reg,
9430                                                   Register temp2_reg,
9431                                                   Label* L_success,
9432                                                   Label* L_failure,
9433                                                   bool set_cond_codes) {
9434  assert_different_registers(sub_klass, super_klass, temp_reg);
9435  if (temp2_reg != noreg)
9436    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
9437#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
9438
9439  Label L_fallthrough;
9440  int label_nulls = 0;
9441  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
9442  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
9443  assert(label_nulls <= 1, "at most one NULL in the batch");
9444
9445  // a couple of useful fields in sub_klass:
9446  int ss_offset = in_bytes(Klass::secondary_supers_offset());
9447  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
9448  Address secondary_supers_addr(sub_klass, ss_offset);
9449  Address super_cache_addr(     sub_klass, sc_offset);
9450
9451  // Do a linear scan of the secondary super-klass chain.
9452  // This code is rarely used, so simplicity is a virtue here.
9453  // The repne_scan instruction uses fixed registers, which we must spill.
9454  // Don't worry too much about pre-existing connections with the input regs.
9455
9456  assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
9457  assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)
9458
9459  // Get super_klass value into rax (even if it was in rdi or rcx).
9460  bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
9461  if (super_klass != rax || UseCompressedOops) {
9462    if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
9463    mov(rax, super_klass);
9464  }
9465  if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
9466  if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }
9467
9468#ifndef PRODUCT
9469  int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
9470  ExternalAddress pst_counter_addr((address) pst_counter);
9471  NOT_LP64(  incrementl(pst_counter_addr) );
9472  LP64_ONLY( lea(rcx, pst_counter_addr) );
9473  LP64_ONLY( incrementl(Address(rcx, 0)) );
9474#endif //PRODUCT
9475
9476  // We will consult the secondary-super array.
9477  movptr(rdi, secondary_supers_addr);
9478  // Load the array length.  (Positive movl does right thing on LP64.)
9479  movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
9480  // Skip to start of data.
9481  addptr(rdi, Array<Klass*>::base_offset_in_bytes());
9482
9483  // Scan RCX words at [RDI] for an occurrence of RAX.
9484  // Set NZ/Z based on last compare.
9485  // The Z flag will not be set by 'repne' if RCX == 0, since 'repne' itself
9486  // does not change flags (only the repeated scas instruction sets them).
9487  // Set Z = 0 (not equal) before 'repne' to indicate that class was not found.
9488
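  // The repne_scan below behaves like this C loop (repne scas semantics;
  // a sketch, not the generated encoding):
  //   while (rcx != 0) {
  //     rcx--;
  //     ZF = (*(intptr_t*)rdi == (intptr_t)rax);
  //     rdi += wordSize;
  //     if (ZF) break;    // found super_klass in the secondary supers array
  //   }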
9489  testptr(rax, rax); // Set Z = 0
9490  repne_scan();
9491
9492  // Unspill the temp. registers:
9493  if (pushed_rdi)  pop(rdi);
9494  if (pushed_rcx)  pop(rcx);
9495  if (pushed_rax)  pop(rax);
9496
9497  if (set_cond_codes) {
9498    // Special hack for the AD files:  rdi is guaranteed non-zero.
9499    assert(!pushed_rdi, "rdi must be left non-NULL");
9500    // Also, the condition codes are properly set Z/NZ on succeed/failure.
9501  }
9502
9503  if (L_failure == &L_fallthrough)
9504        jccb(Assembler::notEqual, *L_failure);
9505  else  jcc(Assembler::notEqual, *L_failure);
9506
9507  // Success.  Cache the super we found and proceed in triumph.
9508  movptr(super_cache_addr, super_klass);
9509
9510  if (L_success != &L_fallthrough) {
9511    jmp(*L_success);
9512  }
9513
9514#undef IS_A_TEMP
9515
9516  bind(L_fallthrough);
9517}
9518
9519
9520void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
9521  if (VM_Version::supports_cmov()) {
9522    cmovl(cc, dst, src);
9523  } else {
9524    Label L;
9525    jccb(negate_condition(cc), L);
9526    movl(dst, src);
9527    bind(L);
9528  }
9529}
9530
9531void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
9532  if (VM_Version::supports_cmov()) {
9533    cmovl(cc, dst, src);
9534  } else {
9535    Label L;
9536    jccb(negate_condition(cc), L);
9537    movl(dst, src);
9538    bind(L);
9539  }
9540}
9541
9542void MacroAssembler::verify_oop(Register reg, const char* s) {
9543  if (!VerifyOops) return;
9544
9545  // Pass register number to verify_oop_subroutine
9546  char* b = new char[strlen(s) + 50];
9547  sprintf(b, "verify_oop: %s: %s", reg->name(), s);
9548  BLOCK_COMMENT("verify_oop {");
9549#ifdef _LP64
9550  push(rscratch1);                    // save r10, trashed by movptr()
9551#endif
9552  push(rax);                          // save rax,
9553  push(reg);                          // pass register argument
9554  ExternalAddress buffer((address) b);
9555  // avoid using pushptr, as it modifies scratch registers
9556  // and our contract is not to modify anything
9557  movptr(rax, buffer.addr());
9558  push(rax);
9559  // call indirectly to solve generation ordering problem
9560  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
9561  call(rax);
9562  // Caller pops the arguments (oop, message) and restores rax, r10
9563  BLOCK_COMMENT("} verify_oop");
9564}
9565
9566
9567RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
9568                                                      Register tmp,
9569                                                      int offset) {
9570  intptr_t value = *delayed_value_addr;
9571  if (value != 0)
9572    return RegisterOrConstant(value + offset);
9573
9574  // load indirectly to solve generation ordering problem
9575  movptr(tmp, ExternalAddress((address) delayed_value_addr));
9576
9577#ifdef ASSERT
9578  { Label L;
9579    testptr(tmp, tmp);
9580    if (WizardMode) {
9581      jcc(Assembler::notZero, L);
9582      char* buf = new char[40];
9583      sprintf(buf, "DelayedValue="INTPTR_FORMAT, delayed_value_addr[1]);
9584      STOP(buf);
9585    } else {
9586      jccb(Assembler::notZero, L);
9587      hlt();
9588    }
9589    bind(L);
9590  }
9591#endif
9592
9593  if (offset != 0)
9594    addptr(tmp, offset);
9595
9596  return RegisterOrConstant(tmp);
9597}
9598
9599
9600Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
9601                                         int extra_slot_offset) {
9602  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
9603  int stackElementSize = Interpreter::stackElementSize;
9604  int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
9605#ifdef ASSERT
9606  int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
9607  assert(offset1 - offset == stackElementSize, "correct arithmetic");
9608#endif
9609  Register             scale_reg    = noreg;
9610  Address::ScaleFactor scale_factor = Address::no_scale;
9611  if (arg_slot.is_constant()) {
9612    offset += arg_slot.as_constant() * stackElementSize;
9613  } else {
9614    scale_reg    = arg_slot.as_register();
9615    scale_factor = Address::times(stackElementSize);
9616  }
9617  offset += wordSize;           // return PC is on stack
9618  return Address(rsp, scale_reg, scale_factor, offset);
9619}
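
// Illustrative example (values assumed, not taken from this file): with a
// constant arg_slot of 2, an 8-byte stack element and no extra slots, the
// result is roughly Address(rsp, 2*8 + 8); the extra word skips the return
// PC sitting on top of the stack.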
9620
9621
9622void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
9623  if (!VerifyOops) return;
9624
9625  // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord);
9626  // Pass register number to verify_oop_subroutine
9627  char* b = new char[strlen(s) + 50];
9628  sprintf(b, "verify_oop_addr: %s", s);
9629
9630#ifdef _LP64
9631  push(rscratch1);                    // save r10, trashed by movptr()
9632#endif
9633  push(rax);                          // save rax,
9634  // addr may contain rsp so we will have to adjust it based on the push
9635  // we just did (and on 64 bit we do two pushes)
9636  // NOTE: 64bit seemed to have had a bug in that it did movq(addr, rax); which
9637  // stores rax into addr which is backwards of what was intended.
9638  if (addr.uses(rsp)) {
9639    lea(rax, addr);
9640    pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
9641  } else {
9642    pushptr(addr);
9643  }
9644
9645  ExternalAddress buffer((address) b);
9646  // pass msg argument
9647  // avoid using pushptr, as it modifies scratch registers
9648  // and our contract is not to modify anything
9649  movptr(rax, buffer.addr());
9650  push(rax);
9651
9652  // call indirectly to solve generation ordering problem
9653  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
9654  call(rax);
9655  // Caller pops the arguments (addr, message) and restores rax, r10.
9656}
9657
9658void MacroAssembler::verify_tlab() {
9659#ifdef ASSERT
9660  if (UseTLAB && VerifyOops) {
9661    Label next, ok;
9662    Register t1 = rsi;
9663    Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);
9664
9665    push(t1);
9666    NOT_LP64(push(thread_reg));
9667    NOT_LP64(get_thread(thread_reg));
9668
9669    movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
9670    cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
9671    jcc(Assembler::aboveEqual, next);
9672    STOP("assert(top >= start)");
9673    should_not_reach_here();
9674
9675    bind(next);
9676    movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
9677    cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
9678    jcc(Assembler::aboveEqual, ok);
9679    STOP("assert(top <= end)");
9680    should_not_reach_here();
9681
9682    bind(ok);
9683    NOT_LP64(pop(thread_reg));
9684    pop(t1);
9685  }
9686#endif
9687}
9688
9689class ControlWord {
9690 public:
9691  int32_t _value;
9692
9693  int  rounding_control() const        { return  (_value >> 10) & 3      ; }
9694  int  precision_control() const       { return  (_value >>  8) & 3      ; }
9695  bool precision() const               { return ((_value >>  5) & 1) != 0; }
9696  bool underflow() const               { return ((_value >>  4) & 1) != 0; }
9697  bool overflow() const                { return ((_value >>  3) & 1) != 0; }
9698  bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
9699  bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
9700  bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
9701
9702  void print() const {
9703    // rounding control
9704    const char* rc;
9705    switch (rounding_control()) {
9706      case 0: rc = "round near"; break;
9707      case 1: rc = "round down"; break;
9708      case 2: rc = "round up  "; break;
9709      case 3: rc = "chop      "; break;
9710    };
9711    // precision control
9712    const char* pc;
9713    switch (precision_control()) {
9714      case 0: pc = "24 bits "; break;
9715      case 1: pc = "reserved"; break;
9716      case 2: pc = "53 bits "; break;
9717      case 3: pc = "64 bits "; break;
9718    };
9719    // flags
9720    char f[9];
9721    f[0] = ' ';
9722    f[1] = ' ';
9723    f[2] = (precision   ()) ? 'P' : 'p';
9724    f[3] = (underflow   ()) ? 'U' : 'u';
9725    f[4] = (overflow    ()) ? 'O' : 'o';
9726    f[5] = (zero_divide ()) ? 'Z' : 'z';
9727    f[6] = (denormalized()) ? 'D' : 'd';
9728    f[7] = (invalid     ()) ? 'I' : 'i';
9729    f[8] = '\x0';
9730    // output
9731    printf("%04x  masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
9732  }
9733
9734};
9735
9736class StatusWord {
9737 public:
9738  int32_t _value;
9739
9740  bool busy() const                    { return ((_value >> 15) & 1) != 0; }
9741  bool C3() const                      { return ((_value >> 14) & 1) != 0; }
9742  bool C2() const                      { return ((_value >> 10) & 1) != 0; }
9743  bool C1() const                      { return ((_value >>  9) & 1) != 0; }
9744  bool C0() const                      { return ((_value >>  8) & 1) != 0; }
9745  int  top() const                     { return  (_value >> 11) & 7      ; }
9746  bool error_status() const            { return ((_value >>  7) & 1) != 0; }
9747  bool stack_fault() const             { return ((_value >>  6) & 1) != 0; }
9748  bool precision() const               { return ((_value >>  5) & 1) != 0; }
9749  bool underflow() const               { return ((_value >>  4) & 1) != 0; }
9750  bool overflow() const                { return ((_value >>  3) & 1) != 0; }
9751  bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
9752  bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
9753  bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
9754
9755  void print() const {
9756    // condition codes
9757    char c[5];
9758    c[0] = (C3()) ? '3' : '-';
9759    c[1] = (C2()) ? '2' : '-';
9760    c[2] = (C1()) ? '1' : '-';
9761    c[3] = (C0()) ? '0' : '-';
9762    c[4] = '\x0';
9763    // flags
9764    char f[9];
9765    f[0] = (error_status()) ? 'E' : '-';
9766    f[1] = (stack_fault ()) ? 'S' : '-';
9767    f[2] = (precision   ()) ? 'P' : '-';
9768    f[3] = (underflow   ()) ? 'U' : '-';
9769    f[4] = (overflow    ()) ? 'O' : '-';
9770    f[5] = (zero_divide ()) ? 'Z' : '-';
9771    f[6] = (denormalized()) ? 'D' : '-';
9772    f[7] = (invalid     ()) ? 'I' : '-';
9773    f[8] = '\x0';
9774    // output
9775    printf("%04x  flags = %s, cc =  %s, top = %d", _value & 0xFFFF, f, c, top());
9776  }
9777
9778};
9779
9780class TagWord {
9781 public:
9782  int32_t _value;
9783
9784  int tag_at(int i) const              { return (_value >> (i*2)) & 3; }
9785
9786  void print() const {
9787    printf("%04x", _value & 0xFFFF);
9788  }
9789
9790};
9791
9792class FPU_Register {
9793 public:
9794  int32_t _m0;
9795  int32_t _m1;
9796  int16_t _ex;
9797
9798  bool is_indefinite() const           {
9799    return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0;
9800  }
9801
9802  void print() const {
9803    char  sign = (_ex < 0) ? '-' : '+';
9804    const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : "   ";
9805    printf("%c%04hx.%08x%08x  %s", sign, _ex, _m1, _m0, kind);
9806  };
9807
9808};
9809
9810class FPU_State {
9811 public:
9812  enum {
9813    register_size       = 10,
9814    number_of_registers =  8,
9815    register_mask       =  7
9816  };
9817
9818  ControlWord  _control_word;
9819  StatusWord   _status_word;
9820  TagWord      _tag_word;
9821  int32_t      _error_offset;
9822  int32_t      _error_selector;
9823  int32_t      _data_offset;
9824  int32_t      _data_selector;
9825  int8_t       _register[register_size * number_of_registers];
9826
9827  int tag_for_st(int i) const          { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
9828  FPU_Register* st(int i) const        { return (FPU_Register*)&_register[register_size * i]; }
9829
9830  const char* tag_as_string(int tag) const {
9831    switch (tag) {
9832      case 0: return "valid";
9833      case 1: return "zero";
9834      case 2: return "special";
9835      case 3: return "empty";
9836    }
9837    ShouldNotReachHere();
9838    return NULL;
9839  }
9840
9841  void print() const {
9842    // print computation registers
9843    { int t = _status_word.top();
9844      for (int i = 0; i < number_of_registers; i++) {
9845        int j = (i - t) & register_mask;
9846        printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
9847        st(j)->print();
9848        printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
9849      }
9850    }
9851    printf("\n");
9852    // print control registers
9853    printf("ctrl = "); _control_word.print(); printf("\n");
9854    printf("stat = "); _status_word .print(); printf("\n");
9855    printf("tags = "); _tag_word    .print(); printf("\n");
9856  }
9857
9858};
9859
9860class Flag_Register {
9861 public:
9862  int32_t _value;
9863
9864  bool overflow() const                { return ((_value >> 11) & 1) != 0; }
9865  bool direction() const               { return ((_value >> 10) & 1) != 0; }
9866  bool sign() const                    { return ((_value >>  7) & 1) != 0; }
9867  bool zero() const                    { return ((_value >>  6) & 1) != 0; }
9868  bool auxiliary_carry() const         { return ((_value >>  4) & 1) != 0; }
9869  bool parity() const                  { return ((_value >>  2) & 1) != 0; }
9870  bool carry() const                   { return ((_value >>  0) & 1) != 0; }
9871
9872  void print() const {
9873    // flags
9874    char f[8];
9875    f[0] = (overflow       ()) ? 'O' : '-';
9876    f[1] = (direction      ()) ? 'D' : '-';
9877    f[2] = (sign           ()) ? 'S' : '-';
9878    f[3] = (zero           ()) ? 'Z' : '-';
9879    f[4] = (auxiliary_carry()) ? 'A' : '-';
9880    f[5] = (parity         ()) ? 'P' : '-';
9881    f[6] = (carry          ()) ? 'C' : '-';
9882    f[7] = '\x0';
9883    // output
9884    printf("%08x  flags = %s", _value, f);
9885  }
9886
9887};
9888
9889class IU_Register {
9890 public:
9891  int32_t _value;
9892
9893  void print() const {
9894    printf("%08x  %11d", _value, _value);
9895  }
9896
9897};
9898
9899class IU_State {
9900 public:
9901  Flag_Register _eflags;
9902  IU_Register   _rdi;
9903  IU_Register   _rsi;
9904  IU_Register   _rbp;
9905  IU_Register   _rsp;
9906  IU_Register   _rbx;
9907  IU_Register   _rdx;
9908  IU_Register   _rcx;
9909  IU_Register   _rax;
9910
9911  void print() const {
9912    // computation registers
9913    printf("rax,  = "); _rax.print(); printf("\n");
9914    printf("rbx,  = "); _rbx.print(); printf("\n");
9915    printf("rcx  = "); _rcx.print(); printf("\n");
9916    printf("rdx  = "); _rdx.print(); printf("\n");
9917    printf("rdi  = "); _rdi.print(); printf("\n");
9918    printf("rsi  = "); _rsi.print(); printf("\n");
9919    printf("rbp,  = "); _rbp.print(); printf("\n");
9920    printf("rsp  = "); _rsp.print(); printf("\n");
9921    printf("\n");
9922    // control registers
9923    printf("flgs = "); _eflags.print(); printf("\n");
9924  }
9925};
9926
9927
9928class CPU_State {
9929 public:
9930  FPU_State _fpu_state;
9931  IU_State  _iu_state;
9932
9933  void print() const {
9934    printf("--------------------------------------------------\n");
9935    _iu_state .print();
9936    printf("\n");
9937    _fpu_state.print();
9938    printf("--------------------------------------------------\n");
9939  }
9940
9941};
9942
9943
9944static void _print_CPU_state(CPU_State* state) {
9945  state->print();
9946};
9947
9948
9949void MacroAssembler::print_CPU_state() {
9950  push_CPU_state();
9951  push(rsp);                // pass CPU state
9952  call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
9953  addptr(rsp, wordSize);       // discard argument
9954  pop_CPU_state();
9955}
9956
9957
9958static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
9959  static int counter = 0;
9960  FPU_State* fs = &state->_fpu_state;
9961  counter++;
9962  // For leaf calls, only verify that the top few elements remain empty.
9963  // We only need 1 empty at the top for C2 code.
9964  if( stack_depth < 0 ) {
9965    if( fs->tag_for_st(7) != 3 ) {
9966      printf("FPR7 not empty\n");
9967      state->print();
9968      assert(false, "error");
9969      return false;
9970    }
9971    return true;                // All other stack states do not matter
9972  }
9973
9974  assert((fs->_control_word._value & 0xffff) == StubRoutines::_fpu_cntrl_wrd_std,
9975         "bad FPU control word");
9976
9977  // compute stack depth
9978  int i = 0;
9979  while (i < FPU_State::number_of_registers && fs->tag_for_st(i)  < 3) i++;
9980  int d = i;
9981  while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
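  // Illustrative example (editorial sketch): with tags for ST0..ST7 of
  // valid, valid, empty, empty, empty, empty, empty, empty, the first loop
  // stops at i == 2 (so d == 2) and the second advances i to 8, i.e. the
  // stack is contiguous. A valid tag after an empty one leaves i < 8 and
  // triggers the "stack not contiguous" report below.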
9982  // verify findings
9983  if (i != FPU_State::number_of_registers) {
9984    // stack not contiguous
9985    printf("%s: stack not contiguous at ST%d\n", s, i);
9986    state->print();
9987    assert(false, "error");
9988    return false;
9989  }
9990  // check if computed stack depth corresponds to expected stack depth
9991  if (stack_depth < 0) {
9992    // expected stack depth is -stack_depth or less (unreachable here: negative depths already returned above)
9993    if (d > -stack_depth) {
9994      // too many elements on the stack
9995      printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
9996      state->print();
9997      assert(false, "error");
9998      return false;
9999    }
10000  } else {
10001    // expected stack depth is stack_depth
10002    if (d != stack_depth) {
10003      // wrong stack depth
10004      printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
10005      state->print();
10006      assert(false, "error");
10007      return false;
10008    }
10009  }
10010  // everything is cool
10011  return true;
10012}
10013
10014
10015void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
10016  if (!VerifyFPU) return;
10017  push_CPU_state();
10018  push(rsp);                // pass CPU state
10019  ExternalAddress msg((address) s);
10020  // pass message string s
10021  pushptr(msg.addr());
10022  push(stack_depth);        // pass stack depth
10023  call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
10024  addptr(rsp, 3 * wordSize);   // discard arguments
10025  // check for error
10026  { Label L;
10027    testl(rax, rax);
10028    jcc(Assembler::notZero, L);
10029    int3();                  // break if error condition
10030    bind(L);
10031  }
10032  pop_CPU_state();
10033}
10034
10035void MacroAssembler::load_klass(Register dst, Register src) {
10036#ifdef _LP64
10037  if (UseCompressedKlassPointers) {
10038    movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
10039    decode_klass_not_null(dst);
10040  } else
10041#endif
10042    movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
10043}
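// Illustrative decode performed by decode_klass_not_null() below (an
// editorial sketch, assuming a non-NULL narrow_klass_base kept in
// r12_heapbase and narrow_klass_shift == LogKlassAlignmentInBytes):
//   klass = r12_heapbase + ((uintptr_t)narrow_klass << narrow_klass_shift)
// With a zero shift the assembler asserts the base is NULL and the narrow
// value is used as the full pointer.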
10044
10045void MacroAssembler::load_prototype_header(Register dst, Register src) {
10046#ifdef _LP64
10047  if (UseCompressedKlassPointers) {
10048    assert (Universe::heap() != NULL, "java heap should be initialized");
10049    movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
10050    if (Universe::narrow_klass_shift() != 0) {
10051      assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
10052      assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?");
10053      movq(dst, Address(r12_heapbase, dst, Address::times_8, Klass::prototype_header_offset()));
10054    } else {
10055      movq(dst, Address(dst, Klass::prototype_header_offset()));
10056    }
10057  } else
10058#endif
10059  {
10060    movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
10061    movptr(dst, Address(dst, Klass::prototype_header_offset()));
10062  }
10063}
10064
10065void MacroAssembler::store_klass(Register dst, Register src) {
10066#ifdef _LP64
10067  if (UseCompressedKlassPointers) {
10068    encode_klass_not_null(src);
10069    movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
10070  } else
10071#endif
10072    movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
10073}
10074
10075void MacroAssembler::load_heap_oop(Register dst, Address src) {
10076#ifdef _LP64
10077  // FIXME: Must change all places where we try to load the klass.
10078  if (UseCompressedOops) {
10079    movl(dst, src);
10080    decode_heap_oop(dst);
10081  } else
10082#endif
10083    movptr(dst, src);
10084}
10085
10086// Does not do verification; generates fixed-size code
10087void MacroAssembler::load_heap_oop_not_null(Register dst, Address src) {
10088#ifdef _LP64
10089  if (UseCompressedOops) {
10090    movl(dst, src);
10091    decode_heap_oop_not_null(dst);
10092  } else
10093#endif
10094    movptr(dst, src);
10095}
10096
10097void MacroAssembler::store_heap_oop(Address dst, Register src) {
10098#ifdef _LP64
10099  if (UseCompressedOops) {
10100    assert(!dst.uses(src), "not enough registers");
10101    encode_heap_oop(src);
10102    movl(dst, src);
10103  } else
10104#endif
10105    movptr(dst, src);
10106}
10107
10108void MacroAssembler::cmp_heap_oop(Register src1, Address src2, Register tmp) {
10109  assert_different_registers(src1, tmp);
10110#ifdef _LP64
10111  if (UseCompressedOops) {
10112    bool did_push = false;
10113    if (tmp == noreg) {
10114      tmp = rax;
10115      push(tmp);
10116      did_push = true;
10117      assert(!src2.uses(rsp), "can't push");
10118    }
10119    load_heap_oop(tmp, src2);
10120    cmpptr(src1, tmp);
10121    if (did_push)  pop(tmp);
10122  } else
10123#endif
10124    cmpptr(src1, src2);
10125}
10126
10127// Used for storing NULLs.
10128void MacroAssembler::store_heap_oop_null(Address dst) {
10129#ifdef _LP64
10130  if (UseCompressedOops) {
10131    movl(dst, (int32_t)NULL_WORD);
10132  } else {
10133    movslq(dst, (int32_t)NULL_WORD);
10134  }
10135#else
10136  movl(dst, (int32_t)NULL_WORD);
10137#endif
10138}
10139
10140#ifdef _LP64
10141void MacroAssembler::store_klass_gap(Register dst, Register src) {
10142  if (UseCompressedKlassPointers) {
10143    // Store to klass gap in destination
10144    movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
10145  }
10146}
10147
10148#ifdef ASSERT
10149void MacroAssembler::verify_heapbase(const char* msg) {
10150  assert (UseCompressedOops || UseCompressedKlassPointers, "should be compressed");
10151  assert (Universe::heap() != NULL, "java heap should be initialized");
10152  if (CheckCompressedOops) {
10153    Label ok;
10154    push(rscratch1); // cmpptr trashes rscratch1
10155    cmpptr(r12_heapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
10156    jcc(Assembler::equal, ok);
10157    STOP(msg);
10158    bind(ok);
10159    pop(rscratch1);
10160  }
10161}
10162#endif
10163
10164// Algorithm must match oop.inline.hpp encode_heap_oop.
10165void MacroAssembler::encode_heap_oop(Register r) {
10166#ifdef ASSERT
10167  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
10168#endif
10169  verify_oop(r, "broken oop in encode_heap_oop");
10170  if (Universe::narrow_oop_base() == NULL) {
10171    if (Universe::narrow_oop_shift() != 0) {
10172      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
10173      shrq(r, LogMinObjAlignmentInBytes);
10174    }
10175    return;
10176  }
10177  testq(r, r);
10178  cmovq(Assembler::equal, r, r12_heapbase);
10179  subq(r, r12_heapbase);
10180  shrq(r, LogMinObjAlignmentInBytes);
10181}
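// Illustrative math for the sequence above (editorial sketch): with heap
// base B in r12_heapbase and s == LogMinObjAlignmentInBytes,
//   encode(p) = (p == NULL) ? 0 : (p - B) >> s
// The testq/cmovq pair substitutes B for a NULL oop so that NULL encodes
// to (B - B) >> s == 0 without taking a branch.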
10182
10183void MacroAssembler::encode_heap_oop_not_null(Register r) {
10184#ifdef ASSERT
10185  verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
10186  if (CheckCompressedOops) {
10187    Label ok;
10188    testq(r, r);
10189    jcc(Assembler::notEqual, ok);
10190    STOP("null oop passed to encode_heap_oop_not_null");
10191    bind(ok);
10192  }
10193#endif
10194  verify_oop(r, "broken oop in encode_heap_oop_not_null");
10195  if (Universe::narrow_oop_base() != NULL) {
10196    subq(r, r12_heapbase);
10197  }
10198  if (Universe::narrow_oop_shift() != 0) {
10199    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
10200    shrq(r, LogMinObjAlignmentInBytes);
10201  }
10202}
10203
10204void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
10205#ifdef ASSERT
10206  verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
10207  if (CheckCompressedOops) {
10208    Label ok;
10209    testq(src, src);
10210    jcc(Assembler::notEqual, ok);
10211    STOP("null oop passed to encode_heap_oop_not_null2");
10212    bind(ok);
10213  }
10214#endif
10215  verify_oop(src, "broken oop in encode_heap_oop_not_null2");
10216  if (dst != src) {
10217    movq(dst, src);
10218  }
10219  if (Universe::narrow_oop_base() != NULL) {
10220    subq(dst, r12_heapbase);
10221  }
10222  if (Universe::narrow_oop_shift() != 0) {
10223    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
10224    shrq(dst, LogMinObjAlignmentInBytes);
10225  }
10226}
10227
10228void  MacroAssembler::decode_heap_oop(Register r) {
10229#ifdef ASSERT
10230  verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
10231#endif
10232  if (Universe::narrow_oop_base() == NULL) {
10233    if (Universe::narrow_oop_shift() != 0) {
10234      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
10235      shlq(r, LogMinObjAlignmentInBytes);
10236    }
10237  } else {
10238    Label done;
10239    shlq(r, LogMinObjAlignmentInBytes);
10240    jccb(Assembler::equal, done);
10241    addq(r, r12_heapbase);
10242    bind(done);
10243  }
10244  verify_oop(r, "broken oop in decode_heap_oop");
10245}
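// Illustrative inverse of encode_heap_oop (editorial sketch):
//   decode(n) = (n == 0) ? NULL : B + ((uintptr_t)n << s)
// The shlq above sets ZF when n == 0, so the jccb(equal, done) skips the
// base addition and NULL round-trips to NULL.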
10246
10247void  MacroAssembler::decode_heap_oop_not_null(Register r) {
10248  // Note: it will change flags
10249  assert (UseCompressedOops, "should only be used for compressed headers");
10250  assert (Universe::heap() != NULL, "java heap should be initialized");
10251  // Cannot assert, unverified entry point counts instructions (see .ad file)
10252  // vtableStubs also counts instructions in pd_code_size_limit.
10253  // Also do not verify_oop as this is called by verify_oop.
10254  if (Universe::narrow_oop_shift() != 0) {
10255    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
10256    shlq(r, LogMinObjAlignmentInBytes);
10257    if (Universe::narrow_oop_base() != NULL) {
10258      addq(r, r12_heapbase);
10259    }
10260  } else {
10261    assert (Universe::narrow_oop_base() == NULL, "sanity");
10262  }
10263}
10264
10265void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
10266  // Note: it will change flags
10267  assert (UseCompressedOops, "should only be used for compressed headers");
10268  assert (Universe::heap() != NULL, "java heap should be initialized");
10269  // Cannot assert, unverified entry point counts instructions (see .ad file)
10270  // vtableStubs also counts instructions in pd_code_size_limit.
10271  // Also do not verify_oop as this is called by verify_oop.
10272  if (Universe::narrow_oop_shift() != 0) {
10273    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
10274    if (LogMinObjAlignmentInBytes == Address::times_8) {
10275      leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
10276    } else {
10277      if (dst != src) {
10278        movq(dst, src);
10279      }
10280      shlq(dst, LogMinObjAlignmentInBytes);
10281      if (Universe::narrow_oop_base() != NULL) {
10282        addq(dst, r12_heapbase);
10283      }
10284    }
10285  } else {
10286    assert (Universe::narrow_oop_base() == NULL, "sanity");
10287    if (dst != src) {
10288      movq(dst, src);
10289    }
10290  }
10291}
10292
10293void MacroAssembler::encode_klass_not_null(Register r) {
10294  assert(Metaspace::is_initialized(), "metaspace should be initialized");
10295#ifdef ASSERT
10296  verify_heapbase("MacroAssembler::encode_klass_not_null: heap base corrupted?");
10297#endif
10298  if (Universe::narrow_klass_base() != NULL) {
10299    subq(r, r12_heapbase);
10300  }
10301  if (Universe::narrow_klass_shift() != 0) {
10302    assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
10303    shrq(r, LogKlassAlignmentInBytes);
10304  }
10305}
10306
10307void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
10308  assert(Metaspace::is_initialized(), "metaspace should be initialized");
10309#ifdef ASSERT
10310  verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
10311#endif
10312  if (dst != src) {
10313    movq(dst, src);
10314  }
10315  if (Universe::narrow_klass_base() != NULL) {
10316    subq(dst, r12_heapbase);
10317  }
10318  if (Universe::narrow_klass_shift() != 0) {
10319    assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
10320    shrq(dst, LogKlassAlignmentInBytes);
10321  }
10322}
10323
10324void  MacroAssembler::decode_klass_not_null(Register r) {
10325  assert(Metaspace::is_initialized(), "metaspace should be initialized");
10326  // Note: it will change flags
10327  assert (UseCompressedKlassPointers, "should only be used for compressed headers");
10328  // Cannot assert, unverified entry point counts instructions (see .ad file)
10329  // vtableStubs also counts instructions in pd_code_size_limit.
10330  // Also do not verify_oop as this is called by verify_oop.
10331  if (Universe::narrow_klass_shift() != 0) {
10332    assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
10333    shlq(r, LogKlassAlignmentInBytes);
10334    if (Universe::narrow_klass_base() != NULL) {
10335      addq(r, r12_heapbase);
10336    }
10337  } else {
10338    assert (Universe::narrow_klass_base() == NULL, "sanity");
10339  }
10340}
10341
10342void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
10343  assert(Metaspace::is_initialized(), "metaspace should be initialized");
10344  // Note: it will change flags
10345  assert (UseCompressedKlassPointers, "should only be used for compressed headers");
10346  // Cannot assert, unverified entry point counts instructions (see .ad file)
10347  // vtableStubs also counts instructions in pd_code_size_limit.
10348  // Also do not verify_oop as this is called by verify_oop.
10349  if (Universe::narrow_klass_shift() != 0) {
10350    assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
10351    assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?");
10352    leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
10353  } else {
10354    assert (Universe::narrow_klass_base() == NULL, "sanity");
10355    if (dst != src) {
10356      movq(dst, src);
10357    }
10358  }
10359}
10360
10361void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
10362  assert (UseCompressedOops, "should only be used for compressed headers");
10363  assert (Universe::heap() != NULL, "java heap should be initialized");
10364  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
10365  int oop_index = oop_recorder()->find_index(obj);
10366  RelocationHolder rspec = oop_Relocation::spec(oop_index);
10367  mov_narrow_oop(dst, oop_index, rspec);
10368}
10369
10370void  MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
10371  assert (UseCompressedOops, "should only be used for compressed headers");
10372  assert (Universe::heap() != NULL, "java heap should be initialized");
10373  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
10374  int oop_index = oop_recorder()->find_index(obj);
10375  RelocationHolder rspec = oop_Relocation::spec(oop_index);
10376  mov_narrow_oop(dst, oop_index, rspec);
10377}
10378
10379void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
10380  assert (UseCompressedKlassPointers, "should only be used for compressed headers");
10381  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
10382  int klass_index = oop_recorder()->find_index(k);
10383  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
10384  mov_narrow_oop(dst, oopDesc::encode_klass(k), rspec);
10385}
10386
10387void  MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
10388  assert (UseCompressedKlassPointers, "should only be used for compressed headers");
10389  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
10390  int klass_index = oop_recorder()->find_index(k);
10391  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
10392  mov_narrow_oop(dst, oopDesc::encode_klass(k), rspec);
10393}
10394
10395void  MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
10396  assert (UseCompressedOops, "should only be used for compressed headers");
10397  assert (Universe::heap() != NULL, "java heap should be initialized");
10398  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
10399  int oop_index = oop_recorder()->find_index(obj);
10400  RelocationHolder rspec = oop_Relocation::spec(oop_index);
10401  Assembler::cmp_narrow_oop(dst, oop_index, rspec);
10402}
10403
10404void  MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
10405  assert (UseCompressedOops, "should only be used for compressed headers");
10406  assert (Universe::heap() != NULL, "java heap should be initialized");
10407  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
10408  int oop_index = oop_recorder()->find_index(obj);
10409  RelocationHolder rspec = oop_Relocation::spec(oop_index);
10410  Assembler::cmp_narrow_oop(dst, oop_index, rspec);
10411}
10412
10413void  MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
10414  assert (UseCompressedKlassPointers, "should only be used for compressed headers");
10415  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
10416  int klass_index = oop_recorder()->find_index(k);
10417  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
10418  Assembler::cmp_narrow_oop(dst, oopDesc::encode_klass(k), rspec);
10419}
10420
10421void  MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
10422  assert (UseCompressedKlassPointers, "should only be used for compressed headers");
10423  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
10424  int klass_index = oop_recorder()->find_index(k);
10425  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
10426  Assembler::cmp_narrow_oop(dst, oopDesc::encode_klass(k), rspec);
10427}
10428
10429void MacroAssembler::reinit_heapbase() {
10430  if (UseCompressedOops || UseCompressedKlassPointers) {
10431    movptr(r12_heapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
10432  }
10433}
10434#endif // _LP64
10435
10436
10437// C2 compiled method's prolog code.
10438void MacroAssembler::verified_entry(int framesize, bool stack_bang, bool fp_mode_24b) {
10439
10440  // WARNING: Initial instruction MUST be 5 bytes or longer so that
10441  // NativeJump::patch_verified_entry will be able to patch out the entry
10442  // code safely. The push to verify stack depth is ok at 5 bytes;
10443  // the frame allocation can be either 3 or 6 bytes. So if we don't do a
10444  // stack bang then we must use the 6 byte frame allocation even if
10445  // we have no frame. :-(
10446
10447  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
10448  // Remove word for return addr
10449  framesize -= wordSize;
10450
10451  // Calls to C2R adapters often do not accept exceptional returns.
10452  // We require that their callers must bang for them.  But be careful, because
10453  // some VM calls (such as call site linkage) can use several kilobytes of
10454  // stack.  But the stack safety zone should account for that.
10455  // See bugs 4446381, 4468289, 4497237.
10456  if (stack_bang) {
10457    generate_stack_overflow_check(framesize);
10458
10459    // We always push rbp so that on return to the interpreter rbp will be
10460    // restored correctly and we can correct the stack.
10461    push(rbp);
10462    // Remove word for ebp
10463    framesize -= wordSize;
10464
10465    // Create frame
10466    if (framesize) {
10467      subptr(rsp, framesize);
10468    }
10469  } else {
10470    // Create frame (force generation of a 4 byte immediate value)
10471    subptr_imm32(rsp, framesize);
10472
10473    // Save RBP register now.
10474    framesize -= wordSize;
10475    movptr(Address(rsp, framesize), rbp);
10476  }
10477
10478  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
10479    framesize -= wordSize;
10480    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
10481  }
10482
10483#ifndef _LP64
10484  // If method sets FPU control word do it now
10485  if (fp_mode_24b) {
10486    fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
10487  }
10488  if (UseSSE >= 2 && VerifyFPU) {
10489    verify_FPU(0, "FPU stack must be clean on entry");
10490  }
10491#endif
10492
10493#ifdef ASSERT
10494  if (VerifyStackAtCalls) {
10495    Label L;
10496    push(rax);
10497    mov(rax, rsp);
10498    andptr(rax, StackAlignmentInBytes-1);
10499    cmpptr(rax, StackAlignmentInBytes-wordSize);
10500    pop(rax);
10501    jcc(Assembler::equal, L);
10502    STOP("Stack is not properly aligned!");
10503    bind(L);
10504  }
10505#endif
10506
10507}
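// Illustrative layout after the prolog above (editorial sketch for the
// non-stack-bang LP64 path with a hypothetical incoming framesize of 32):
// after removing the return-address word, rsp drops by 24 and
//   [rsp + 16] holds the saved rbp
//   [rsp +  8] holds the 0xbadb100d cookie (only if VerifyStackAtCalls)
//   [rsp +  0] remains as spill space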
10508
10509
10510// IndexOf for constant substrings with size >= 8 chars
10511// which don't need to be loaded through stack.
10512void MacroAssembler::string_indexofC8(Register str1, Register str2,
10513                                      Register cnt1, Register cnt2,
10514                                      int int_cnt2,  Register result,
10515                                      XMMRegister vec, Register tmp) {
10516  ShortBranchVerifier sbv(this);
10517  assert(UseSSE42Intrinsics, "SSE4.2 is required");
10518
10519  // This method uses the pcmpestri instruction with bound registers
10520  //   inputs:
10521  //     xmm - substring
10522  //     rax - substring length (elements count)
10523  //     mem - scanned string
10524  //     rdx - string length (elements count)
10525  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
10526  //   outputs:
10527  //     rcx - matched index in string
10528  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
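  // Illustrative decode of the 0x0d immediate used below (editorial sketch
  // of the SSE4.2 encoding): bits 1:0 = 01 select unsigned 16-bit elements
  // (Java chars), bits 3:2 = 11 select "equal ordered" aggregation
  // (substring search), bits 5:4 = 00 keep positive polarity, and
  // bit 6 = 0 returns the least significant match index in rcx.
  // CF reports any candidate match; OF reports a match at element 0.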
10529
10530  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
10531        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
10532        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
10533
10534  // Note, inline_string_indexOf() generates checks:
10535  // if (substr.count > string.count) return -1;
10536  // if (substr.count == 0) return 0;
10537  assert(int_cnt2 >= 8, "this code is used only for cnt2 >= 8 chars");
10538
10539  // Load substring.
10540  movdqu(vec, Address(str2, 0));
10541  movl(cnt2, int_cnt2);
10542  movptr(result, str1); // string addr
10543
10544  if (int_cnt2 > 8) {
10545    jmpb(SCAN_TO_SUBSTR);
10546
10547    // Reload substr for rescan, this code
10548    // is executed only for large substrings (> 8 chars)
10549    bind(RELOAD_SUBSTR);
10550    movdqu(vec, Address(str2, 0));
10551    negptr(cnt2); // Jumped here with negative cnt2, convert to positive
10552
10553    bind(RELOAD_STR);
10554    // We came here after the beginning of the substring was
10555    // matched but the rest of it was not, so we need to search
10556    // again. Start from the next element after the previous match.
10557
10558    // cnt2 is the number of remaining substring elements and
10559    // cnt1 is the number of remaining string elements when the cmp failed.
10560    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
10561    subl(cnt1, cnt2);
10562    addl(cnt1, int_cnt2);
10563    movl(cnt2, int_cnt2); // Now restore cnt2
10564
10565    decrementl(cnt1);     // Shift to next element
10566    cmpl(cnt1, cnt2);
10567    jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer chars left than substring
10568
10569    addptr(result, 2);
10570
10571  } // (int_cnt2 > 8)
10572
10573  // Scan string for start of substr in 16-byte vectors
10574  bind(SCAN_TO_SUBSTR);
10575  pcmpestri(vec, Address(result, 0), 0x0d);
10576  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
10577  subl(cnt1, 8);
10578  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
10579  cmpl(cnt1, cnt2);
10580  jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer chars left than substring
10581  addptr(result, 16);
10582  jmpb(SCAN_TO_SUBSTR);
10583
10584  // Found a potential substr
10585  bind(FOUND_CANDIDATE);
10586  // Matched whole vector if first element matched (tmp(rcx) == 0).
10587  if (int_cnt2 == 8) {
10588    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
10589  } else { // int_cnt2 > 8
10590    jccb(Assembler::overflow, FOUND_SUBSTR);
10591  }
10592  // After pcmpestri tmp(rcx) contains matched element index
10593  // Compute start addr of substr
10594  lea(result, Address(result, tmp, Address::times_2));
10595
10596  // Make sure string is still long enough
10597  subl(cnt1, tmp);
10598  cmpl(cnt1, cnt2);
10599  if (int_cnt2 == 8) {
10600    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
10601  } else { // int_cnt2 > 8
10602    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
10603  }
10604  // Fewer chars left than substring.
10605
10606  bind(RET_NOT_FOUND);
10607  movl(result, -1);
10608  jmpb(EXIT);
10609
10610  if (int_cnt2 > 8) {
10611    // This code is optimized for the case when whole substring
10612    // is matched if its head is matched.
10613    bind(MATCH_SUBSTR_HEAD);
10614    pcmpestri(vec, Address(result, 0), 0x0d);
10615    // Reload only the string if it does not match
10616    jccb(Assembler::noOverflow, RELOAD_STR); // OF == 0
10617
10618    Label CONT_SCAN_SUBSTR;
10619    // Compare the rest of substring (> 8 chars).
10620    bind(FOUND_SUBSTR);
10621    // First 8 chars are already matched.
10622    negptr(cnt2);
10623    addptr(cnt2, 8);
10624
10625    bind(SCAN_SUBSTR);
10626    subl(cnt1, 8);
10627    cmpl(cnt2, -8); // Do not read beyond substring
10628    jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
10629    // Back-up strings to avoid reading beyond substring:
10630    // cnt1 = cnt1 - cnt2 + 8
10631    addl(cnt1, cnt2); // cnt2 is negative
10632    addl(cnt1, 8);
10633    movl(cnt2, 8); negptr(cnt2);
10634    bind(CONT_SCAN_SUBSTR);
10635    if (int_cnt2 < (int)G) {
10636      movdqu(vec, Address(str2, cnt2, Address::times_2, int_cnt2*2));
10637      pcmpestri(vec, Address(result, cnt2, Address::times_2, int_cnt2*2), 0x0d);
10638    } else {
10639      // calculate index in register to avoid integer overflow (int_cnt2*2)
10640      movl(tmp, int_cnt2);
10641      addptr(tmp, cnt2);
10642      movdqu(vec, Address(str2, tmp, Address::times_2, 0));
10643      pcmpestri(vec, Address(result, tmp, Address::times_2, 0), 0x0d);
10644    }
10645    // Need to reload string pointers if we did not match the whole vector
10646    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
10647    addptr(cnt2, 8);
10648    jcc(Assembler::negative, SCAN_SUBSTR);
10649    // Fall through if found full substring
10650
10651  } // (int_cnt2 > 8)
10652
10653  bind(RET_FOUND);
10654  // Found result if we matched full small substring.
10655  // Compute substr offset
10656  subptr(result, str1);
10657  shrl(result, 1); // index
10658  bind(EXIT);
10659
10660} // string_indexofC8
10661
10662// Small strings are loaded through the stack if they cross a page boundary.
10663void MacroAssembler::string_indexof(Register str1, Register str2,
10664                                    Register cnt1, Register cnt2,
10665                                    int int_cnt2,  Register result,
10666                                    XMMRegister vec, Register tmp) {
10667  ShortBranchVerifier sbv(this);
10668  assert(UseSSE42Intrinsics, "SSE4.2 is required");
10669  //
10670  // int_cnt2 is length of small (< 8 chars) constant substring
10671  // or (-1) for non constant substring in which case its length
10672  // is in cnt2 register.
10673  //
10674  // Note, inline_string_indexOf() generates checks:
10675  // if (substr.count > string.count) return -1;
10676  // if (substr.count == 0) return 0;
10677  //
10678  assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < 8), "should be != 0");
10679
10680  // This method uses the pcmpestri instruction with bound registers
10681  //   inputs:
10682  //     xmm - substring
10683  //     rax - substring length (elements count)
10684  //     mem - scanned string
10685  //     rdx - string length (elements count)
10686  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
10687  //   outputs:
10688  //     rcx - matched index in string
10689  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
10690
10691  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
10692        RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
10693        FOUND_CANDIDATE;
10694
10695  { //========================================================
10696    // We don't know where these strings are located
10697    // and we can't read beyond them. Load them through the stack.
10698    Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
10699
10700    movptr(tmp, rsp); // save old SP
10701
10702    if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
10703      if (int_cnt2 == 1) {  // One char
10704        load_unsigned_short(result, Address(str2, 0));
10705        movdl(vec, result); // move 32 bits
10706      } else if (int_cnt2 == 2) { // Two chars
10707        movdl(vec, Address(str2, 0)); // move 32 bits
10708      } else if (int_cnt2 == 4) { // Four chars
10709        movq(vec, Address(str2, 0));  // move 64 bits
10710      } else { // cnt2 = { 3, 5, 6, 7 }
10711        // Array header size is 12 bytes in 32-bit VM
10712        // + 6 bytes for 3 chars == 18 bytes,
10713        // enough space to load vec and shift.
10714        assert(HeapWordSize*typeArrayKlass::header_size() >= 12,"sanity");
10715        movdqu(vec, Address(str2, (int_cnt2*2)-16));
10716        psrldq(vec, 16-(int_cnt2*2));
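        // Illustrative example (editorial sketch): for int_cnt2 == 3 the
        // movdqu above loads the 16 bytes ending with the three chars
        // (offset 6 - 16 = -10), which stays inside the array only because
        // the 12-byte header precedes the data; psrldq(vec, 10) then
        // shifts those 6 data bytes down to the low end of vec.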
10717      }
10718    } else { // not constant substring
10719      cmpl(cnt2, 8);
10720      jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
10721
10722      // We can read beyond the string if str+16 does not cross a page boundary
10723      // since heaps are aligned and mapped by pages.
10724      assert(os::vm_page_size() < (int)G, "default page should be small");
10725      movl(result, str2); // We need only low 32 bits
10726      andl(result, (os::vm_page_size()-1));
10727      cmpl(result, (os::vm_page_size()-16));
10728      jccb(Assembler::belowEqual, CHECK_STR);
10729
10730      // Move small strings to stack to allow load 16 bytes into vec.
10731      subptr(rsp, 16);
10732      int stk_offset = wordSize-2;
10733      push(cnt2);
10734
10735      bind(COPY_SUBSTR);
10736      load_unsigned_short(result, Address(str2, cnt2, Address::times_2, -2));
10737      movw(Address(rsp, cnt2, Address::times_2, stk_offset), result);
10738      decrement(cnt2);
10739      jccb(Assembler::notZero, COPY_SUBSTR);
10740
10741      pop(cnt2);
10742      movptr(str2, rsp);  // New substring address
10743    } // non constant
10744
10745    bind(CHECK_STR);
10746    cmpl(cnt1, 8);
10747    jccb(Assembler::aboveEqual, BIG_STRINGS);
10748
10749    // Check cross page boundary.
10750    movl(result, str1); // We need only low 32 bits
10751    andl(result, (os::vm_page_size()-1));
10752    cmpl(result, (os::vm_page_size()-16));
10753    jccb(Assembler::belowEqual, BIG_STRINGS);
10754
10755    subptr(rsp, 16);
10756    int stk_offset = -2;
10757    if (int_cnt2 < 0) { // not constant
10758      push(cnt2);
10759      stk_offset += wordSize;
10760    }
10761    movl(cnt2, cnt1);
10762
10763    bind(COPY_STR);
10764    load_unsigned_short(result, Address(str1, cnt2, Address::times_2, -2));
10765    movw(Address(rsp, cnt2, Address::times_2, stk_offset), result);
10766    decrement(cnt2);
10767    jccb(Assembler::notZero, COPY_STR);
10768
10769    if (int_cnt2 < 0) { // not constant
10770      pop(cnt2);
10771    }
10772    movptr(str1, rsp);  // New string address
10773
10774    bind(BIG_STRINGS);
10775    // Load substring.
10776    if (int_cnt2 < 0) { // -1
10777      movdqu(vec, Address(str2, 0));
10778      push(cnt2);       // substr count
10779      push(str2);       // substr addr
10780      push(str1);       // string addr
10781    } else {
10782      // Small (< 8 chars) constant substrings are loaded already.
10783      movl(cnt2, int_cnt2);
10784    }
10785    push(tmp);  // original SP
10786
10787  } // Finished loading
10788
10789  //========================================================
10790  // Start search
10791  //
10792
10793  movptr(result, str1); // string addr
10794
10795  if (int_cnt2  < 0) {  // Only for non constant substring
10796    jmpb(SCAN_TO_SUBSTR);
10797
10798    // SP saved at sp+0
10799    // String saved at sp+1*wordSize
10800    // Substr saved at sp+2*wordSize
10801    // Substr count saved at sp+3*wordSize
10802
10803    // Reload substr for rescan, this code
10804    // is executed only for large substrings (> 8 chars)
10805    bind(RELOAD_SUBSTR);
10806    movptr(str2, Address(rsp, 2*wordSize));
10807    movl(cnt2, Address(rsp, 3*wordSize));
10808    movdqu(vec, Address(str2, 0));
10809    // We came here after the beginning of the substring was
10810    // matched but the rest of it was not, so we need to search
10811    // again. Start from the next element after the previous match.
10812    subptr(str1, result); // Restore counter
10813    shrl(str1, 1);
10814    addl(cnt1, str1);
10815    decrementl(cnt1);   // Shift to next element
10816    cmpl(cnt1, cnt2);
10817    jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer chars left than substring
10818
10819    addptr(result, 2);
10820  } // non constant
10821
10822  // Scan string for start of substr in 16-byte vectors
10823  bind(SCAN_TO_SUBSTR);
10824  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
10825  pcmpestri(vec, Address(result, 0), 0x0d);
10826  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
10827  subl(cnt1, 8);
10828  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
10829  cmpl(cnt1, cnt2);
10830  jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer chars left than substring
10831  addptr(result, 16);
10832
10833  bind(ADJUST_STR);
10834  cmpl(cnt1, 8); // Do not read beyond string
10835  jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
10836  // Back-up string to avoid reading beyond string.
10837  lea(result, Address(result, cnt1, Address::times_2, -16));
10838  movl(cnt1, 8);
10839  jmpb(SCAN_TO_SUBSTR);
10840
10841  // Found a potential substr
10842  bind(FOUND_CANDIDATE);
10843  // After pcmpestri tmp(rcx) contains matched element index
10844
10845  // Make sure string is still long enough
10846  subl(cnt1, tmp);
10847  cmpl(cnt1, cnt2);
10848  jccb(Assembler::greaterEqual, FOUND_SUBSTR);
10849  // Fewer chars left than substring.
10850
10851  bind(RET_NOT_FOUND);
10852  movl(result, -1);
10853  jmpb(CLEANUP);
10854
10855  bind(FOUND_SUBSTR);
10856  // Compute start addr of substr
10857  lea(result, Address(result, tmp, Address::times_2));
10858
10859  if (int_cnt2 > 0) { // Constant substring
10860    // Repeat search for small substring (< 8 chars)
10861    // from new point without reloading substring.
10862    // Have to check that we don't read beyond string.
10863    cmpl(tmp, 8-int_cnt2);
10864    jccb(Assembler::greater, ADJUST_STR);
10865    // Fall through if matched whole substring.
10866  } else { // non constant
10867    assert(int_cnt2 == -1, "should be != 0");
10868
10869    addl(tmp, cnt2);
10870    // Found result if we matched whole substring.
10871    cmpl(tmp, 8);
10872    jccb(Assembler::lessEqual, RET_FOUND);
10873
10874    // Repeat search for small substring (<= 8 chars)
10875    // from new point 'str1' without reloading substring.
10876    cmpl(cnt2, 8);
10877    // Have to check that we don't read beyond string.
10878    jccb(Assembler::lessEqual, ADJUST_STR);
10879
10880    Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
10881    // Compare the rest of substring (> 8 chars).
10882    movptr(str1, result);
10883
10884    cmpl(tmp, cnt2);
10885    // First 8 chars are already matched.
10886    jccb(Assembler::equal, CHECK_NEXT);
10887
10888    bind(SCAN_SUBSTR);
10889    pcmpestri(vec, Address(str1, 0), 0x0d);
10890    // Need to reload string pointers if we did not match the whole vector
10891    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
10892
10893    bind(CHECK_NEXT);
10894    subl(cnt2, 8);
10895    jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
10896    addptr(str1, 16);
10897    addptr(str2, 16);
10898    subl(cnt1, 8);
10899    cmpl(cnt2, 8); // Do not read beyond substring
10900    jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
10901    // Back-up strings to avoid reading beyond substring.
10902    lea(str2, Address(str2, cnt2, Address::times_2, -16));
10903    lea(str1, Address(str1, cnt2, Address::times_2, -16));
10904    subl(cnt1, cnt2);
10905    movl(cnt2, 8);
10906    addl(cnt1, 8);
10907    bind(CONT_SCAN_SUBSTR);
10908    movdqu(vec, Address(str2, 0));
10909    jmpb(SCAN_SUBSTR);
10910
10911    bind(RET_FOUND_LONG);
10912    movptr(str1, Address(rsp, wordSize));
10913  } // non constant
10914
10915  bind(RET_FOUND);
10916  // Compute substr offset
10917  subptr(result, str1);
10918  shrl(result, 1); // index
10919
10920  bind(CLEANUP);
10921  pop(rsp); // restore SP
10922
10923} // string_indexof
10924
10925// Compare strings.
10926void MacroAssembler::string_compare(Register str1, Register str2,
10927                                    Register cnt1, Register cnt2, Register result,
10928                                    XMMRegister vec1) {
10929  ShortBranchVerifier sbv(this);
10930  Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
10931
10932  // Compute the minimum of the string lengths and the
10933  // difference of the string lengths (stack).
10934  // Do the conditional move stuff
10935  movl(result, cnt1);
10936  subl(cnt1, cnt2);
10937  push(cnt1);
10938  cmov32(Assembler::lessEqual, cnt2, result);
10939
10940  // Is the minimum length zero?
10941  testl(cnt2, cnt2);
10942  jcc(Assembler::zero, LENGTH_DIFF_LABEL);
10943
10944  // Load first characters
10945  load_unsigned_short(result, Address(str1, 0));
10946  load_unsigned_short(cnt1, Address(str2, 0));
10947
10948  // Compare first characters
10949  subl(result, cnt1);
10950  jcc(Assembler::notZero,  POP_LABEL);
10951  decrementl(cnt2);
10952  jcc(Assembler::zero, LENGTH_DIFF_LABEL);
10953
10954  {
10955    // Check after comparing first character to see if strings are equivalent
10956    Label LSkip2;
10957    // Check if the strings start at same location
10958    cmpptr(str1, str2);
10959    jccb(Assembler::notEqual, LSkip2);
10960
10961    // Check if the length difference is zero (from stack)
10962    cmpl(Address(rsp, 0), 0x0);
10963    jcc(Assembler::equal,  LENGTH_DIFF_LABEL);
10964
10965    // Strings might not be equivalent
10966    bind(LSkip2);
10967  }
10968
10969  Address::ScaleFactor scale = Address::times_2;
10970  int stride = 8;
10971
10972  // Advance to next element
10973  addptr(str1, 16/stride);
10974  addptr(str2, 16/stride);
10975
10976  if (UseSSE42Intrinsics) {
10977    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
10978    int pcmpmask = 0x19;
10979    // Setup to compare 16-byte vectors
10980    movl(result, cnt2);
10981    andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
10982    jccb(Assembler::zero, COMPARE_TAIL);
10983
10984    lea(str1, Address(str1, result, scale));
10985    lea(str2, Address(str2, result, scale));
10986    negptr(result);
10987
10988    // pcmpestri
10989    //   inputs:
10990    //     vec1- substring
10991    //     rax - negative string length (elements count)
10992    //     mem - scanned string
10993    //     rdx - string length (elements count)
10994    //     pcmpmask - cmp mode: 11000 (string compare with negated result)
10995    //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
10996    //   outputs:
10997    //     rcx - first mismatched element index
10998    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
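    // Illustrative decode of pcmpmask == 0x19 (editorial sketch): bits
    // 1:0 = 01 select unsigned 16-bit elements, bits 3:2 = 10 select
    // "equal each" (element-wise compare), and bits 5:4 = 01 negate the
    // result so a set bit means mismatch. CF == 1 below therefore signals
    // a mismatched element, with its index delivered in rcx.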
10999
11000    bind(COMPARE_WIDE_VECTORS);
11001    movdqu(vec1, Address(str1, result, scale));
11002    pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
11003    // After pcmpestri cnt1(rcx) contains mismatched element index
11004
11005    jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
11006    addptr(result, stride);
11007    subptr(cnt2, stride);
11008    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
11009
11010    // compare wide vectors tail
11011    testl(result, result);
11012    jccb(Assembler::zero, LENGTH_DIFF_LABEL);
11013
11014    movl(cnt2, stride);
11015    movl(result, stride);
11016    negptr(result);
11017    movdqu(vec1, Address(str1, result, scale));
11018    pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
11019    jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
11020
11021    // Mismatched characters in the vectors
11022    bind(VECTOR_NOT_EQUAL);
11023    addptr(result, cnt1);
11024    movptr(cnt2, result);
11025    load_unsigned_short(result, Address(str1, cnt2, scale));
11026    load_unsigned_short(cnt1, Address(str2, cnt2, scale));
11027    subl(result, cnt1);
11028    jmpb(POP_LABEL);
11029
11030    bind(COMPARE_TAIL); // limit is zero
11031    movl(cnt2, result);
11032    // Fallthru to tail compare
11033  }
11034
11035  // Shift str2 and str1 to the end of the arrays, negate min
11036  lea(str1, Address(str1, cnt2, scale, 0));
11037  lea(str2, Address(str2, cnt2, scale, 0));
11038  negptr(cnt2);
11039
11040  // Compare the rest of the elements
11041  bind(WHILE_HEAD_LABEL);
11042  load_unsigned_short(result, Address(str1, cnt2, scale, 0));
11043  load_unsigned_short(cnt1, Address(str2, cnt2, scale, 0));
11044  subl(result, cnt1);
11045  jccb(Assembler::notZero, POP_LABEL);
11046  increment(cnt2);
11047  jccb(Assembler::notZero, WHILE_HEAD_LABEL);
11048
11049  // Strings are equal up to min length.  Return the length difference.
11050  bind(LENGTH_DIFF_LABEL);
11051  pop(result);
11052  jmpb(DONE_LABEL);
11053
11054  // Discard the stored length difference
11055  bind(POP_LABEL);
11056  pop(cnt1);
11057
11058  // That's it
11059  bind(DONE_LABEL);
11060}
11061
11062// Compare char[] arrays (aligned to 4 bytes) or substrings.
11063void MacroAssembler::char_arrays_equals(bool is_array_equ, Register ary1, Register ary2,
11064                                        Register limit, Register result, Register chr,
11065                                        XMMRegister vec1, XMMRegister vec2) {
11066  ShortBranchVerifier sbv(this);
11067  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR;
11068
11069  int length_offset  = arrayOopDesc::length_offset_in_bytes();
11070  int base_offset    = arrayOopDesc::base_offset_in_bytes(T_CHAR);
11071
11072  // Check the input args
11073  cmpptr(ary1, ary2);
11074  jcc(Assembler::equal, TRUE_LABEL);
11075
11076  if (is_array_equ) {
11077    // Need additional checks for arrays_equals.
11078    testptr(ary1, ary1);
11079    jcc(Assembler::zero, FALSE_LABEL);
11080    testptr(ary2, ary2);
11081    jcc(Assembler::zero, FALSE_LABEL);
11082
11083    // Check the lengths
11084    movl(limit, Address(ary1, length_offset));
11085    cmpl(limit, Address(ary2, length_offset));
11086    jcc(Assembler::notEqual, FALSE_LABEL);
11087  }
11088
11089  // count == 0
11090  testl(limit, limit);
11091  jcc(Assembler::zero, TRUE_LABEL);
11092
11093  if (is_array_equ) {
11094    // Load array address
11095    lea(ary1, Address(ary1, base_offset));
11096    lea(ary2, Address(ary2, base_offset));
11097  }
11098
11099  shll(limit, 1);      // char count * 2 == byte count, still != 0
11100  movl(result, limit); // copy
11101
11102  if (UseSSE42Intrinsics) {
11103    // With SSE4.2, use double quad vector compare
11104    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
11105
11106    // Compare 16-byte vectors
11107    andl(result, 0x0000000e);  //   tail count (in bytes)
11108    andl(limit, 0xfffffff0);   // vector count (in bytes)
11109    jccb(Assembler::zero, COMPARE_TAIL);
11110
11111    lea(ary1, Address(ary1, limit, Address::times_1));
11112    lea(ary2, Address(ary2, limit, Address::times_1));
11113    negptr(limit);
11114
11115    bind(COMPARE_WIDE_VECTORS);
11116    movdqu(vec1, Address(ary1, limit, Address::times_1));
11117    movdqu(vec2, Address(ary2, limit, Address::times_1));
11118    pxor(vec1, vec2);
11119
11120    ptest(vec1, vec1);
11121    jccb(Assembler::notZero, FALSE_LABEL);
11122    addptr(limit, 16);
11123    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
11124
11125    testl(result, result);
11126    jccb(Assembler::zero, TRUE_LABEL);
11127
11128    movdqu(vec1, Address(ary1, result, Address::times_1, -16));
11129    movdqu(vec2, Address(ary2, result, Address::times_1, -16));
11130    pxor(vec1, vec2);
11131
11132    ptest(vec1, vec1);
11133    jccb(Assembler::notZero, FALSE_LABEL);
11134    jmpb(TRUE_LABEL);
11135
11136    bind(COMPARE_TAIL); // limit is zero
11137    movl(limit, result);
11138    // Fallthru to tail compare
11139  }
11140
11141  // Compare 4-byte vectors
11142  andl(limit, 0xfffffffc); // vector count (in bytes)
11143  jccb(Assembler::zero, COMPARE_CHAR);
11144
11145  lea(ary1, Address(ary1, limit, Address::times_1));
11146  lea(ary2, Address(ary2, limit, Address::times_1));
11147  negptr(limit);
11148
11149  bind(COMPARE_VECTORS);
11150  movl(chr, Address(ary1, limit, Address::times_1));
11151  cmpl(chr, Address(ary2, limit, Address::times_1));
11152  jccb(Assembler::notEqual, FALSE_LABEL);
11153  addptr(limit, 4);
11154  jcc(Assembler::notZero, COMPARE_VECTORS);
11155
11156  // Compare trailing char (final 2 bytes), if any
11157  bind(COMPARE_CHAR);
11158  testl(result, 0x2);   // tail char
11159  jccb(Assembler::zero, TRUE_LABEL);
11160  load_unsigned_short(chr, Address(ary1, 0));
11161  load_unsigned_short(limit, Address(ary2, 0));
11162  cmpl(chr, limit);
11163  jccb(Assembler::notEqual, FALSE_LABEL);
11164
11165  bind(TRUE_LABEL);
11166  movl(result, 1);   // return true
11167  jmpb(DONE);
11168
11169  bind(FALSE_LABEL);
11170  xorl(result, result); // return false
11171
11172  // That's it
11173  bind(DONE);
11174}
11175
11176void MacroAssembler::generate_fill(BasicType t, bool aligned,
11177                                   Register to, Register value, Register count,
11178                                   Register rtmp, XMMRegister xtmp) {
11179  ShortBranchVerifier sbv(this);
11180  assert_different_registers(to, value, count, rtmp);
11181  Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
11182  Label L_fill_2_bytes, L_fill_4_bytes;
11183
11184  int shift = -1;
11185  switch (t) {
11186    case T_BYTE:
11187      shift = 2;
11188      break;
11189    case T_SHORT:
11190      shift = 1;
11191      break;
11192    case T_INT:
11193      shift = 0;
11194      break;
11195    default: ShouldNotReachHere();
11196  }
11197
11198  if (t == T_BYTE) {
11199    andl(value, 0xff);
11200    movl(rtmp, value);
11201    shll(rtmp, 8);
11202    orl(value, rtmp);
11203  }
11204  if (t == T_SHORT) {
11205    andl(value, 0xffff);
11206  }
11207  if (t == T_BYTE || t == T_SHORT) {
11208    movl(rtmp, value);
11209    shll(rtmp, 16);
11210    orl(value, rtmp);
11211  }
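  // Illustrative replication (editorial sketch): for T_BYTE with
  // value == 0xAB the two steps above yield 0xABAB and then 0xABABABAB;
  // for T_SHORT, 0x1234 becomes 0x12341234. The resulting 32-bit pattern
  // is stored directly or splatted into xtmp via pshufd below.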
11212
11213  cmpl(count, 2<<shift); // Short arrays (< 8 bytes) are filled by element
11214  jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
11215  if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
11216    // align source address at 4 bytes address boundary
11217    if (t == T_BYTE) {
11218      // One byte misalignment happens only for byte arrays
11219      testptr(to, 1);
11220      jccb(Assembler::zero, L_skip_align1);
11221      movb(Address(to, 0), value);
11222      increment(to);
11223      decrement(count);
11224      BIND(L_skip_align1);
11225    }
11226    // Two bytes misalignment happens only for byte and short (char) arrays
11227    testptr(to, 2);
11228    jccb(Assembler::zero, L_skip_align2);
11229    movw(Address(to, 0), value);
11230    addptr(to, 2);
11231    subl(count, 1<<(shift-1));
11232    BIND(L_skip_align2);
11233  }
11234  if (UseSSE < 2) {
11235    Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
11236    // Fill 32-byte chunks
11237    subl(count, 8 << shift);
11238    jcc(Assembler::less, L_check_fill_8_bytes);
11239    align(16);
11240
11241    BIND(L_fill_32_bytes_loop);
11242
11243    for (int i = 0; i < 32; i += 4) {
11244      movl(Address(to, i), value);
11245    }
11246
11247    addptr(to, 32);
11248    subl(count, 8 << shift);
11249    jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
11250    BIND(L_check_fill_8_bytes);
11251    addl(count, 8 << shift);
11252    jccb(Assembler::zero, L_exit);
11253    jmpb(L_fill_8_bytes);
11254
11255    //
11256    // length is too short, just fill qwords
11257    //
11258    BIND(L_fill_8_bytes_loop);
11259    movl(Address(to, 0), value);
11260    movl(Address(to, 4), value);
11261    addptr(to, 8);
11262    BIND(L_fill_8_bytes);
11263    subl(count, 1 << (shift + 1));
11264    jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
11265    // fall through to fill 4 bytes
11266  } else {
11267    Label L_fill_32_bytes;
11268    if (!UseUnalignedLoadStores) {
11269      // align to 8 bytes, we know we are 4 byte aligned to start
11270      testptr(to, 4);
11271      jccb(Assembler::zero, L_fill_32_bytes);
11272      movl(Address(to, 0), value);
11273      addptr(to, 4);
11274      subl(count, 1<<shift);
11275    }
11276    BIND(L_fill_32_bytes);
11277    {
11278      assert( UseSSE >= 2, "supported cpu only" );
11279      Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
11280      // Fill 32-byte chunks
11281      movdl(xtmp, value);
11282      pshufd(xtmp, xtmp, 0);
11283
11284      subl(count, 8 << shift);
11285      jcc(Assembler::less, L_check_fill_8_bytes);
11286      align(16);
11287
11288      BIND(L_fill_32_bytes_loop);
11289
11290      if (UseUnalignedLoadStores) {
11291        movdqu(Address(to, 0), xtmp);
11292        movdqu(Address(to, 16), xtmp);
11293      } else {
11294        movq(Address(to, 0), xtmp);
11295        movq(Address(to, 8), xtmp);
11296        movq(Address(to, 16), xtmp);
11297        movq(Address(to, 24), xtmp);
11298      }
11299
11300      addptr(to, 32);
11301      subl(count, 8 << shift);
11302      jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
11303      BIND(L_check_fill_8_bytes);
11304      addl(count, 8 << shift);
11305      jccb(Assembler::zero, L_exit);
11306      jmpb(L_fill_8_bytes);
11307
11308      //
11309      // length is too short, just fill qwords
11310      //
11311      BIND(L_fill_8_bytes_loop);
11312      movq(Address(to, 0), xtmp);
11313      addptr(to, 8);
11314      BIND(L_fill_8_bytes);
11315      subl(count, 1 << (shift + 1));
11316      jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
11317    }
11318  }
11319  // fill trailing 4 bytes
11320  BIND(L_fill_4_bytes);
11321  testl(count, 1<<shift);
11322  jccb(Assembler::zero, L_fill_2_bytes);
11323  movl(Address(to, 0), value);
11324  if (t == T_BYTE || t == T_SHORT) {
11325    addptr(to, 4);
11326    BIND(L_fill_2_bytes);
11327    // fill trailing 2 bytes
11328    testl(count, 1<<(shift-1));
11329    jccb(Assembler::zero, L_fill_byte);
11330    movw(Address(to, 0), value);
11331    if (t == T_BYTE) {
11332      addptr(to, 2);
11333      BIND(L_fill_byte);
11334      // fill trailing byte
11335      testl(count, 1);
11336      jccb(Assembler::zero, L_exit);
11337      movb(Address(to, 0), value);
11338    } else {
11339      BIND(L_fill_byte);
11340    }
11341  } else {
11342    BIND(L_fill_2_bytes);
11343  }
11344  BIND(L_exit);
11345}
11346#undef BIND
11347#undef BLOCK_COMMENT
11348
11349
11350Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
11351  switch (cond) {
11352    // Note some conditions are synonyms for others
11353    case Assembler::zero:         return Assembler::notZero;
11354    case Assembler::notZero:      return Assembler::zero;
11355    case Assembler::less:         return Assembler::greaterEqual;
11356    case Assembler::lessEqual:    return Assembler::greater;
11357    case Assembler::greater:      return Assembler::lessEqual;
11358    case Assembler::greaterEqual: return Assembler::less;
11359    case Assembler::below:        return Assembler::aboveEqual;
11360    case Assembler::belowEqual:   return Assembler::above;
11361    case Assembler::above:        return Assembler::belowEqual;
11362    case Assembler::aboveEqual:   return Assembler::below;
11363    case Assembler::overflow:     return Assembler::noOverflow;
11364    case Assembler::noOverflow:   return Assembler::overflow;
11365    case Assembler::negative:     return Assembler::positive;
11366    case Assembler::positive:     return Assembler::negative;
11367    case Assembler::parity:       return Assembler::noParity;
11368    case Assembler::noParity:     return Assembler::parity;
11369  }
11370  ShouldNotReachHere(); return Assembler::overflow;
11371}
11372
11373SkipIfEqual::SkipIfEqual(
11374    MacroAssembler* masm, const bool* flag_addr, bool value) {
11375  _masm = masm;
11376  _masm->cmp8(ExternalAddress((address)flag_addr), value);
11377  _masm->jcc(Assembler::equal, _label);
11378}
11379
11380SkipIfEqual::~SkipIfEqual() {
11381  _masm->bind(_label);
11382}
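// Usage sketch for SkipIfEqual (editorial illustration; the flag name is
// hypothetical): emit a guarded slow path that is skipped entirely when
// the bool flag equals the given value:
//   {
//     SkipIfEqual skip(masm, &SomeBoolFlag, false); // skip body if flag == false
//     ... emit probe or logging code ...
//   } // the destructor binds the label here, so execution rejoins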
11383