Lines Matching defs:to

17  * 2 along with this work; if not, write to the Free Software Foundation,
60 // Call stubs are used to call Java from C
100 // Stack on entry to call_stub:
115 // Save LR/CR to caller's C_FRAME.
121 // Save non-volatile GPRs to ENTRY_FRAME (not yet pushed, but it's safe).
181 // any arguments to copy?
190 // let r_argumentcopy_addr point to last outgoing Java argument
193 // let r_argument_addr point to last incoming Java argument
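
The two pointer comments above come from the call stub's argument-copy loop. As a loose C++ illustration only (the function and type names here are hypothetical, not HotSpot's), the stub walks the caller-supplied Java arguments and copies them into the freshly pushed ENTRY_FRAME before branching on to the frame manager:

#include <cstdint>

typedef intptr_t slot_t;   // one Java argument slot (hypothetical name)

// Sketch: copy the incoming Java arguments into the outgoing argument area of
// the new frame, walking from the last argument down to the first, much like
// the r_argument_addr / r_argumentcopy_addr pointers above.
static void copy_java_arguments(const slot_t* incoming_args, int argument_count,
                                slot_t* outgoing_args) {
  if (argument_count <= 0) return;   // "any arguments to copy?"
  for (int i = argument_count - 1; i >= 0; i--) {
    outgoing_args[i] = incoming_args[i];
  }
}
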
227 // Register state on entry to frame manager / native entry:
233 // Tos must point to last argument - element_size.
248 // Set R15_prev_state to 0 to simplify checks in the callee.
250 // Stack on entry to frame manager / native entry:
265 // Pass initial_caller_sp to framemanager.
279 // Now pop frame, process result, and return to caller.
302 // to frame manager / native entry.
347 __ blr(); // return to caller
352 __ blr(); // return to caller
357 __ blr(); // return to caller
362 __ blr(); // return to caller
367 __ blr(); // return to caller
401 // complete return to VM
418 // LR: The pc the runtime library callee wants to return to.
497 // Jump to exception handler.
510 // needs all registers to be preserved between the fault point and
520 // it needs to be properly traversed and ignored during GC, so we
550 // whose address will be moved to R11_scratch1.
613 // to - register containing starting address
620 void gen_write_ref_array_pre_barrier(Register from, Register to, Register count, bool dest_uninitialized, Register Rtmp1,
647 __ std(to, frame_size - (++slot_nr) * wordSize, R1_SP);
652 __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), to, count);
656 __ ld(to, frame_size - (++slot_nr) * wordSize, R1_SP);
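
The std/call_VM_leaf/ld fragments above spill the live registers, call the collector's pre-write barrier for the destination range, and reload the registers afterwards. A conceptual C++ sketch, with a hypothetical write_ref_array_pre() standing in for BarrierSet::static_write_ref_array_pre (this is not HotSpot's API):

#include <cstddef>
#include <vector>

typedef void* oopref;                       // hypothetical: one reference slot

// Hypothetical SATB-style pre-barrier: record the values about to be overwritten.
static std::vector<oopref> satb_queue;
static void write_ref_array_pre(oopref* dst, size_t count) {
  for (size_t i = 0; i < count; i++)
    if (dst[i] != nullptr) satb_queue.push_back(dst[i]);
}

static void pre_barrier_for_arraycopy(oopref* dst, size_t count, bool dest_uninitialized) {
  if (dest_uninitialized) return;           // a fresh destination holds no old values
  // The generated stub saves and restores from/to/count around this call
  // because a C call clobbers the volatile registers.
  write_ref_array_pre(dst, count);
}
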
718 // Use two shifts to clear out those low-order two bits! (Cannot be optimized into one.)
743 // Support for void zero_words_aligned8(HeapWord* to, size_t count)
746 // to:
757 Register base_ptr_reg = R3_ARG1; // tohw (needs to be 8-byte aligned)
768 int min_dcbz = 2; // Needs to be positive; apply dcbz only if at least min_dcbz cache lines are covered.
770 // Clear up to 128-byte boundary if long enough, dword_cnt=(16-(base>>3))%16.
771 __ dcbtst(base_ptr_reg); // Indicate write access to first cache line ...
772 __ andi(tmp2_reg, cnt_dwords_reg, 1); // to check if number of dwords is even.
779 __ cmpdi(CCR0, cnt_dwords_reg, (min_dcbz+1)*cl_dwords-1); // Big enough to ensure >=min_dcbz cache lines are included?
783 __ rldicl_(tmp1_reg, tmp1_reg, 64-3, 64-cl_dwordaddr_bits); // Extract number of dwords to 128-byte boundary=(16-(base>>3))%16.
786 __ mtctr(tmp1_reg); // Set ctr to hit 128-byte boundary (0<ctr<cnt).
798 __ andi(tmp2_reg, cnt_dwords_reg, 1); // to check if the remaining dword count is even
809 //__ dcbtst(base_ptr_reg); // Indicate write access to last cache line.
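
The dcbz-based clearing above first stores dwords up to the next 128-byte cache-line boundary, then zeroes whole lines, then the tail. A plain C++ restatement of that boundary arithmetic (a sketch only; memset stands in for dcbz and the paired std loops):

#include <cstdint>
#include <cstring>

// Assumes an 8-byte-aligned base and 128-byte cache lines (16 dwords), as in
// the code above.
static void zero_dwords(uint64_t* base, size_t cnt_dwords) {
  const size_t cl_dwords = 16;
  // Dwords up to the next 128-byte boundary: (16 - (base >> 3)) % 16,
  // the same expression as in the comments above.
  size_t head = (cl_dwords - (((uintptr_t)base >> 3) & (cl_dwords - 1))) & (cl_dwords - 1);
  if (head > cnt_dwords) head = cnt_dwords;
  memset(base, 0, head * 8);
  base += head; cnt_dwords -= head;

  size_t lines = cnt_dwords / cl_dwords;    // whole cache lines: the dcbz candidates
  memset(base, 0, lines * cl_dwords * 8);
  base += lines * cl_dwords; cnt_dwords -= lines * cl_dwords;

  memset(base, 0, cnt_dwords * 8);          // trailing dwords
}
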
840 // Return address of code to be called from code generated by
860 // to read from the safepoint polling page.
878 // "to" address is assumed to be heapword aligned.
881 // to: R3_ARG1
889 const Register to = R3_ARG1; // destination array address
928 __ andi_(temp, to, 1);
930 __ stb(value, 0, to);
931 __ addi(to, to, 1);
936 __ andi_(temp, to, 2);
938 __ sth(value, 0, to);
939 __ addi(to, to, 2);
945 // Align to 8 bytes; we know we are 4-byte aligned to start.
946 __ andi_(temp, to, 7);
948 __ stw(value, 0, to);
949 __ addi(to, to, 4);
967 __ std(value, 0, to);
968 __ std(value, 8, to);
970 __ std(value, 16, to);
971 __ std(value, 24, to);
973 __ addi(to, to, 32);
987 __ std(value, 0, to);
989 __ addi(to, to, 8);
997 __ stw(value, 0, to);
999 __ addi(to, to, 4);
1004 __ sth(value, 0, to);
1006 __ addi(to, to, 2);
1011 __ stb(value, 0, to);
1027 __ stb(value, 0, to);
1028 __ addi(to, to, 1);
1032 __ stb(value, 0, to);
1033 __ stb(value, 1, to); // second byte of the 2-byte tail
1034 __ addi(to, to, 2);
1038 __ stb(value, 0, to);
1039 __ stb(value, 1, to);
1040 __ stb(value, 2, to);
1041 __ stb(value, 3, to);
1050 __ sth(value, 0, to);
1051 __ addi(to, to, 2);
1055 __ sth(value, 0, to);
1056 __ sth(value, 2, to);
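
The fill fragments above follow a standard pattern: align the destination, store the replicated value 32 bytes at a time, then finish element-wise. A byte-fill sketch in plain C++ (names are illustrative, not the stub's):

#include <cstdint>
#include <cstring>

static void fill_bytes(uint8_t* to, uint8_t value, size_t count) {
  uint64_t v = value;
  v |= v << 8; v |= v << 16; v |= v << 32;       // spread the byte across a dword
  while (((uintptr_t)to & 7) && count) {         // align 'to' to 8 bytes
    *to++ = value; count--;
  }
  while (count >= 32) {                          // main loop: 4 x 8-byte stores
    memcpy(to, &v, 8);      memcpy(to + 8, &v, 8);
    memcpy(to + 16, &v, 8); memcpy(to + 24, &v, 8);
    to += 32; count -= 32;
  }
  while (count >= 8) { memcpy(to, &v, 8); to += 8; count -= 8; }
  while (count--) *to++ = value;                 // trailing bytes
}
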
1073 // R4_ARG2 - to
1088 // Branch to forward copy routine otherwise (within range of 32kB).
1091 // need to copy backwards
1095 // (xxx=byte,short,int,long,oop) is to copy as many elements as possible with
1096 // single instructions, but to avoid alignment interrupts (see subsequent
1097 // comment). Furthermore, we try to minimize misaligned access, even
1100 // In Big-Endian mode, the PowerPC architecture requires implementations to
1106 // so every effort should be made to avoid misaligned memory values.
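
The "branch to forward copy routine" / "need to copy backwards" comments above decide the copy direction for overlapping (conjoint) arrays. A plain C++ sketch of that test, not the stub itself:

#include <cstdint>
#include <cstddef>

static void conjoint_bytes(const uint8_t* from, uint8_t* to, size_t count) {
  // If 'to' is below 'from', or at least 'count' bytes above it, a forward
  // copy cannot overwrite source bytes that are still needed.
  if ((uintptr_t)(to - from) >= count) {
    for (size_t i = 0; i < count; i++) to[i] = from[i];           // forward
  } else {
    for (size_t i = count; i > 0; i--) to[i - 1] = from[i - 1];   // backward
  }
}
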
1110 // "from" and "to" addresses are assumed to be heapword aligned.
1114 // to: R4_ARG2
1142 // Copy elements if necessary to align to 4 bytes.
1143 __ neg(tmp1, R3_ARG1); // Compute distance to alignment boundary.
1162 __ bne(CCR0, l_7); // not same alignment -> to or from is aligned -> copy 8
1164 // copy a 2-element word if necessary to align to 8 bytes
1203 } else { // Processor supports VSX, so use it to mass copy.
1208 // If supported set DSCR pre-fetch to deepest.
1216 // Backbranch target aligned to 32-byte. Not 16-byte align as
1222 // Use loop with VSX load/store instructions to
1225 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
1227 __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
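
The stxvd2x pair above is the body of the VSX main loop: two 16-byte transfers per iteration, i.e. 32 bytes per backbranch. A portable sketch of that shape (memcpy stands in for lxvd2x/stxvd2x):

#include <cstdint>
#include <cstring>

static void copy_32byte_chunks(const uint8_t* src, uint8_t* dst, size_t chunk_count) {
  for (size_t i = 0; i < chunk_count; i++) {
    unsigned char v1[16], v2[16];
    memcpy(v1, src, 16);            // lxvd2x  tmp_vsr1, src
    memcpy(v2, src + 16, 16);       // lxvd2x  tmp_vsr2, src + 16
    memcpy(dst, v1, 16);            // stxvd2x tmp_vsr1, dst
    memcpy(dst + 16, v2, 16);       // stxvd2x tmp_vsr2, dst + 16
    src += 32; dst += 32;
  }
}
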
1285 // "from" and "to" addresses are assumed to be heapword aligned.
1289 // to: R4_ARG2
1307 // that we don't have to optimize it.
1325 // "from" and "to" addresses are assumed to be heapword aligned.
1329 // to: R4_ARG2
1350 // 1. continue with step 6. if the alignment of from and to mod 4
1352 // 2. align from and to to 4 bytes by copying 1 element if necessary
1353 // 3. at l_2 from and to are 4 byte aligned; continue with
1354 // 5. if they cannot be aligned to 8 bytes because they have
1356 // 4. at this point we know that both from and to have the same
1357 //    alignment mod 8; now copy one element if necessary to get
1358 // 8 byte alignment of from and to.
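
A plain C++ sketch of the alignment plan this numbered comment describes (illustrative only; the stub's loop sizes and labels differ): skip the alignment work when from and to disagree mod 4, otherwise copy single elements up to 4-byte alignment, take the 8-byte step only when both sides agree mod 8, then do the bulk copy with wide accesses.

#include <cstdint>
#include <cstring>

static void disjoint_shorts(const uint16_t* from, uint16_t* to, size_t count) {
  if ((((uintptr_t)from ^ (uintptr_t)to) & 3) == 0) {   // step 1: same alignment mod 4?
    if (((uintptr_t)to & 3) && count) {                 // step 2: reach 4-byte alignment
      *to++ = *from++; count--;
    }
    if ((((uintptr_t)from ^ (uintptr_t)to) & 7) == 0 && // same alignment mod 8
        ((uintptr_t)to & 7) && count >= 2) {            // step 4: reach 8-byte alignment
      to[0] = from[0]; to[1] = from[1];
      to += 2; from += 2; count -= 2;
    }
    while (count >= 4) {                                // bulk: 8 bytes (4 shorts) at a time
      memcpy(to, from, 8);
      to += 4; from += 4; count -= 4;
    }
  }
  while (count--) *to++ = *from++;                      // remaining elements
}
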
1370 // According to the manuals (PowerISA_V2.06_PUBLIC, Book II,
1405 // At this point it is guaranteed that both from and to have the same alignment mod 4.
1407 // Copy 1 element if necessary to align to 4 bytes.
1418 // At this point the positions of both from and to are at least 4-byte aligned.
1421 // Align to 8 bytes, but only if both from and to have the same alignment mod 8.
1424 __ bne(CCR0, l_7); // not same alignment mod 8 -> copy 4, either from or to will be unaligned
1426 // Copy a 2-element word if necessary to align to 8 bytes.
1470 } else { // Processor supports VSX, so use it to mass copy.
1475 // If supported set DSCR pre-fetch to deepest.
1482 // Backbranch target aligned to 32-byte. It's not aligned 16-byte
1488 // Use loop with VSX load/store instructions to
1491 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst.
1493 __ stxvd2x(tmp_vsr2, R4_ARG2, tmp1); // Store to dst + 16.
1551 // "from" and "to" addresses are assumed to be heapword aligned.
1555 // to: R4_ARG2
1590 // is true, the "from" and "to" addresses are assumed to be heapword aligned.
1594 // to: R4_ARG2
1617 // Not the same alignment, but ld and std just need to be 4 byte aligned.
1618 __ bne(CCR0, l_4); // to OR from is 8 byte aligned -> copy 2 at a time
1620 // copy 1 element to align to and from on an 8 byte boundary
1660 } else { // Processor supports VSX, so use it to mass copy.
1665 // If supported set DSCR pre-fetch to deepest.
1673 // Backbranch target aligned to 32-byte. Not 16-byte align as
1679 // Use loop with VSX load/store instructions to
1682 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
1684 __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
1719 // "from" and "to" addresses are assumed to be heapword aligned.
1723 // to: R4_ARG2
1737 // 32-bit). If "aligned" is true, the "from" and "to" addresses
1738 // are assumed to be heapword aligned.
1742 // to: R4_ARG2
1747 // that we don't have to optimize it.
1772 // Not the same alignment, but ld and std just need to be 4 byte aligned.
1773 __ bne(CCR0, l_7); // to OR from is 8 byte aligned -> copy 2 at a time
1775 // copy 1 element to align to and from on an 8 byte boundary
1810 } else { // Processor supports VSX, so use it to mass copy.
1814 // If supported set DSCR pre-fetch to deepest.
1822 // Backbranch target aligned to 32-byte. Not 16-byte align as
1828 // Use loop with VSX load/store instructions to
1834 __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
1835 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
1862 // "from" and "to" addresses are assumed to be heapword aligned.
1866 // to: R4_ARG2
1888 // 64-bit). If "aligned" is true, the "from" and "to" addresses
1889 // are assumed to be heapword aligned.
1893 // to: R4_ARG2
1932 } else { // Processor supports VSX, so use it to mass copy.
1937 // If supported set DSCR pre-fetch to deepest.
1945 // Backbranch target aligned to 32-byte. Not 16-byte align as
1951 // Use loop with VSX load/store instructions to
1954 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
1956 __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
1990 // "from" and "to" addresses are assumed to be heapword aligned.
1994 // to: R4_ARG2
2009 // 64-bit). If "aligned" is true, the "from" and "to" addresses
2010 // are assumed to be heapword aligned.
2014 // to: R4_ARG2
2060 } else { // Processor supports VSX, so use it to mass copy.
2064 // If supported set DSCR pre-fetch to deepest.
2072 // Backbranch target aligned to 32-byte. Not 16-byte align as
2078 // Use loop with VSX load/store instructions to
2084 __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
2085 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
2112 // "from" and "to" addresses are assumed to be heapword aligned.
2116 // to: R4_ARG2
2137 // "from" and "to" addresses are assumed to be heapword aligned.
2141 // to: R4_ARG2
2175 // "from" and "to" addresses are assumed to be heapword aligned.
2179 // to: R4_ARG2
2234 // to: R4
2272 // Branch to forward copy routine otherwise.
2289 // Empty array: Nothing to do.
2306 __ addi(R8_offset, R8_offset, heapOopSize); // Step to next offset.
2317 // Branch to this on success:
2321 // It was a real error; we must depend on the caller to finish the job.
2324 // and report their number to the caller.
2326 __ nand(R3_RET, R5_count, R5_count); // report (-1^K) to caller
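
The nand above encodes the checkcast-arraycopy return convention: nand(count, count) is ~count, and -1 ^ K equals ~K, so a partial copy returns the bitwise complement of the number of elements already copied (0 signals that everything was copied), and the caller recovers that count with another complement. In plain C++:

static int checkcast_partial_result(int copied_before_failure) {
  return ~copied_before_failure;     // identical to -1 ^ copied_before_failure
}

static int elements_already_copied(int stub_result) {
  return ~stub_result;               // e.g. elements_already_copied(~3) == 3
}
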
2347 // to: R4
2351 // to a long, int, short, or byte copy loop.
2448 // R3 == -1 - need to call System.arraycopy
2483 // Assembler stubs will be used for this call to arraycopy
2521 // Load 32-bit signed value. Use br() instruction with it to check icc.
2536 // At this point, it is known to be a typeArray (array_tag 0x3).
2565 // Next registers should be set before the jump to corresponding stub.
2567 const Register to = R4_ARG2; // destination array address
2570 // 'from', 'to', 'count' registers should be set in this order
2573 BLOCK_COMMENT("scale indexes to element size");
2577 __ add(to, dst_pos, dst); // dst_addr
2617 __ add(to, dst_pos, dst); // dst_addr
2629 // It is safe to examine both src.length and dst.length.
2639 __ add(to, dst_pos, dst); // dst_addr
2643 assert_different_registers(from, to, count, sco_temp,
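
Scaling indexes to the element size, as in the fragments above, is just a shift plus an add: the element's byte address is the array's data base plus the index shifted left by log2 of the element size. A sketch (data_base here stands for the array address already advanced past the object header):

#include <cstdint>

// log2_elem_size is 0, 1, 2 or 3 for byte, short, int and long elements.
static void* element_address(char* data_base, uintptr_t index, unsigned log2_elem_size) {
  uintptr_t byte_offset = index << log2_elem_size;  // the shift behind "scale indexes"
  return data_base + byte_offset;                   // the add(to, dst_pos, dst) step
}
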
2683 Register to = R4_ARG2; // destination array address
2716 // load unaligned from[0-15] to vsRet
2726 // to load keys
2734 // load the 1st round key to vKey1
2744 // load the 2nd round key to vKey1
2749 // load the 3rd round key to vKey2
2754 // load the 4th round key to vKey3
2759 // load the 5th round key to vKey4
2770 // load the 6th round key to vKey1
2775 // load the 7th round key to vKey2
2780 // load the 8th round key to vKey3
2785 // load the 9th round key to vKey4
2796 // load the 10th round key to vKey1
2801 // load the 11th round key to vKey2
2814 // load the 12th round key to vKey1
2819 // load the 13th round key to vKey2
2832 // load the 14th round key to vKey1
2837 // load the 15th round key to vKey2
2848 __ neg (temp, to);
2854 __ lvx (vTmp1, to);
2857 __ lvx (vTmp4, fifteen, to);
2858 __ stvx (vTmp1, to);
2860 __ stvx (vRet, fifteen, to);
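
The neg/lvx/stvx sequence above handles a result destination that is not 16-byte aligned: the aligned quadwords covering it are loaded, the result bytes are spliced in, and the quadwords are written back. A byte-level C++ sketch of that idea (memcpy stands in for the vector loads, permutes and stores):

#include <cstdint>
#include <cstring>

static void store16_via_aligned_blocks(uint8_t* to, const uint8_t result[16]) {
  size_t off = (uintptr_t)to & 15;
  if (off == 0) {                     // already aligned: a single plain store
    memcpy(to, result, 16);
    return;
  }
  uint8_t* lo = to - off;             // first aligned quadword covering 'to'
  uint8_t buf[32];
  memcpy(buf, lo, 32);                // the two covering quadwords (lvx at to and to+15)
  memcpy(buf + off, result, 16);      // splice the 16 result bytes in
  memcpy(lo, buf, 32);                // write both quadwords back (two stvx)
}
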
2882 Register to = R4_ARG2; // destination array address
2916 // load unaligned from[0-15] to vsRet
2926 // to load keys
2940 // load the 15th round key to vKey11
2947 // load the 14th round key to vKey10
2952 // load the 13th round key to vKey10
2957 // load the 12th round key to vKey10
2962 // load the 11th round key to vKey10
2978 // load the 13th round key to vKey11
2985 // load the 12th round key to vKey10
2990 // load the 11th round key to vKey10
3004 // load the 11th round key to vKey11
3016 // load the 10th round key to vKey10
3021 // load the 9th round key to vKey10
3026 // load the 8th round key to vKey10
3031 // load the 7th round key to vKey10
3036 // load the 6th round key to vKey10
3048 // load the 5th round key to vKey10
3053 // load the 4th round key to vKey10
3058 // load the 3rd round key to vKey10
3063 // load the 2nd round key to vKey10
3068 // load the 1st round key to vKey10
3080 __ neg (temp, to);
3086 __ lvx (vTmp1, to);
3089 __ lvx (vTmp4, fifteen, to);
3090 __ stvx (vTmp1, to);
3092 __ stvx (vRet, fifteen, to);
3245 // C2 does not respect int to long conversion for stub calls.
3275 __ blr(); // Return to caller.
3284 // arguments to kernel_crc32:
3287 const Register dataLen = R5_ARG3; // #bytes to process
3333 // arguments to kernel_crc32:
3336 const Register dataLen = R5_ARG3; // #bytes to process
3397 // arguments to kernel_crc32:
3400 const Register dataLen = R5_ARG3; // #bytes to process
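
The three argument lists above all describe the same kernel_crc32 shape: the current crc, the data pointer, the byte count, and (in the real stubs) a lookup table not shown in the matched lines. A minimal byte-wise reference loop in C++ (a sketch only; the real stubs use much faster multi-table and vector kernels, and whether the initial/final bit inversion happens inside the kernel depends on the caller):

#include <cstdint>
#include <cstddef>

static uint32_t crc32_bytewise(uint32_t crc, const uint8_t* data, size_t len,
                               const uint32_t table[256]) {
  crc = ~crc;                                           // pre-invert (caller-dependent)
  for (size_t i = 0; i < len; i++) {
    crc = (crc >> 8) ^ table[(crc ^ data[i]) & 0xff];   // one table lookup per byte
  }
  return ~crc;                                          // post-invert (caller-dependent)
}
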
3444 // benefit seems to be smaller than the disadvantage of having a
3476 // These entry points require SharedInfo::stack0 to be set up in