Lines Matching defs:to

17  * 2 along with this work; if not, write to the Free Software Foundation,
35 // HeapWord* memory ranges are always assumed to be doubleword-aligned,
42 // Furthermore, data access can be forced to be sequential (MVCL and MVCLE)
44 // we use padding byte 0xb0 to prevent the D-cache from being polluted.
54 // *to = *from is transformed into an MVC instruction even at -O1.
56 // to guarantee atomic data accesses.
60 // 1) copy a short piece of memory to page-align the address(es)
63 // where len is an integral multiple of the page size. In that case, up to 4 cache lines are
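The three-step strategy reads, in plain C++, roughly as follows. This is a minimal sketch, not the file's implementation: the helper name and the 4 KiB page size are assumptions, and the real bulk step uses MVCLE rather than memcpy.

#include <cstddef>
#include <cstdint>
#include <cstring>

// Hypothetical illustration of the strategy above: align the destination,
// bulk-copy all contained full pages, then copy the tail.
static void three_phase_copy(const char* from, char* to, size_t count) {
  const size_t page = 4096;                         // assumed page size
  // 1) copy a short piece to page-align the destination address
  size_t head = (page - ((uintptr_t)to & (page - 1))) & (page - 1);
  if (head > count) head = count;
  memcpy(to, from, head);
  to += head; from += head; count -= head;
  // 2) bulk-copy, len being an integral multiple of the page size
  size_t bulk = count & ~(page - 1);
  memcpy(to, from, bulk);                           // stands in for MVCLE
  to += bulk; from += bulk; count -= bulk;
  // 3) copy the remaining piece
  memcpy(to, from, count);
}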
76 static void copy_conjoint_jshorts_atomic(jshort* from, jshort* to, size_t count) {
77 if (from > to) {
80 *to++ = *from++;
84 to += count - 1;
87 *to-- = *from--;
92 static void copy_conjoint_jints_atomic(jint* from, jint* to, size_t count) {
93 if (from > to) {
96 *to++ = *from++;
100 to += count - 1;
103 *to-- = *from--;
108 static bool has_destructive_overlap(char* from, char* to, size_t byte_count) {
109 return (from < to) && ((to-from) < (ptrdiff_t)byte_count);
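Together with has_destructive_overlap(), the loop fragments above follow the classic direction-switching pattern. A hedged reconstruction for the jshort case (the jint case is identical up to the element type; the typedef is a stand-in for HotSpot's own):

#include <cstddef>
#include <cstdint>
typedef int16_t jshort;   // stand-in for HotSpot's jshort

static void copy_conjoint_jshorts_sketch(jshort* from, jshort* to, size_t count) {
  if (from > to) {
    while (count-- > 0) {
      *to++ = *from++;     // source ahead of destination: forward copy is safe
    }
  } else {
    from += count - 1;
    to   += count - 1;
    while (count-- > 0) {
      *to-- = *from--;     // destination ahead of source: copy backwards
    }
  }
}

Each element is moved by a single aligned halfword load and store, which is what makes the routine atomic at the jshort level.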
116 // and target alignment. Refer to mail communication with Tim Slegel/IBM.
123 "LG %[toaddr],%[to] \n\t" /* address of to area */ \
126 : [to] "+Q" (_to) /* outputs */ \
138 "LG %[toaddr],%[to] \n\t" /* address of to area */ \
141 : [to] "+Q" (_to) /* outputs */ \
153 "LG %[toaddr],%[to] \n\t" /* address of to area */ \
156 : [to] "+Q" (_to) /* outputs */ \
168 "LG %[toaddr],%[to] \n\t" /* address of to area */ \
171 : [to] "+Q" (_to) /* outputs */ \
190 "LG %[toaddr],%[to] \n\t" /* address of to area */ \
193 : [to] "+Q" (_to) /* outputs */ \
204 "LG %[toaddr],%[to] \n\t" /* address of to area */ \
207 : [to] "+Q" (_to) /* outputs */ \
218 "LG %[toaddr],%[to] \n\t" /* address of to area */ \
221 : [to] "+Q" (_to) /* outputs */ \
233 "LG %[addr],%[to] \n\t" /* address of to area */ \
235 : [to] "+Q" (_to) /* outputs */ \
254 "LG %[toaddr],%[to] \n\t" /* address of to area */ \
257 : [to] "+Q" (_to) /* outputs */ \
268 "LG %[toaddr],%[to] \n\t" /* address of to area */ \
271 : [to] "+Q" (_to) /* outputs */ \
282 "LG %[toaddr],%[to] \n\t" /* address of to area */ \
285 : [to] "+Q" (_to) /* outputs */ \
297 "LG %[addr],%[to] \n\t" /* address of to area */ \
299 : [to] "+Q" (_to) /* outputs */ \
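Each of the macros excerpted above loads the operand addresses and lengths into register pairs and then issues MVCLE. Ignoring performance, MVCLE behaves like the following plain-C++ stand-in: a strictly forward, sequential copy that pads any remainder of the destination with the pad byte (0xb0 here, which per the comment near the top also keeps the D-cache from being polluted). Names are illustrative only:

#include <cstddef>
#include <cstring>

static void mvcle_copy_sketch(char* to, size_t to_len,
                              const char* from, size_t from_len) {
  size_t n = (from_len < to_len) ? from_len : to_len;
  memcpy(to, from, n);                  // forward, sequential data access
  if (to_len > n) {
    memset(to + n, 0xb0, to_len - n);   // pad remainder of the destination
  }
}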
307 #if 0 // Waiting for gcc to support EXRL.
314 "1: MVC 0(%[len],%[to]),0(%[from]) \n\t" \
316 : [to] "+Q" (_to) /* outputs */ \
327 "3: MVC 0(%[len],%[to]),0(%[from]) \n\t" \
329 : [to] "+Q" (_to) /* outputs */ \
343 " LG %[toaddr],%[to] \n\t" \
350 : [to] "+Q" (_to) /* outputs */ \
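MVC itself copies at most 256 bytes, with the length encoded in the instruction; EX/EXRL patches that length at run time, which is what the disabled code above is waiting for. A plain-C++ stand-in for the chunked copy, where each memcpy represents one MVC:

#include <cstddef>
#include <cstring>

static void mvc_multi_sketch(char* to, const char* from, size_t len) {
  while (len >= 256) {
    memcpy(to, from, 256);   // full-length MVC
    to += 256; from += 256; len -= 256;
  }
  if (len > 0) {
    memcpy(to, from, len);   // residual MVC, length patched via EX/EXRL
  }
}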
362 #if 0 // code snippet to be used for debugging
384 // This macro needs to be gcc-compiled with -march=z990. Otherwise, the
393 " LG %[ta],%[to] \n\t" /* address of to area */ \
394 " BRC 8,1f \n\t" /* nothing to copy */ \
405 /* Since nby is <= 4096 on entry to this code, we do need */ \
408 " LA %[ta],1(%[nby],%[ta]) \n\t"/* adjust to addr */ \
412 " BRC 8,1f \n\t" /* nothing to copy */ \
419 " BC 15,0(%[nby],%[mta]) \n\t" /* branch to block #ncl */ \
472 : [to] "+Q" (_to) /* outputs */ \
486 " LG 0,%[to] \n\t" /* address of to area */ \
488 " LGR 1,%[len] \n\t" /* len of to area */ \
492 : [to] "+Q" (_to) /* outputs */ \
500 " LG 0,%[to] \n\t" /* address of to area */ \
501 " LGR 1,%[len] \n\t" /* len of to area */ \
505 : [to] "+Q" (_to) /* outputs */ \
512 " LG 0,%[to] \n\t" /* address of to area */ \
513 " LGR 1,%[len] \n\t" /* len of to area */ \
517 : [to] "+Q" (_to) /* outputs */ \
533 " LG %[toaddr],%[to] \n\t" /* addr of data area */ \
538 : [to] "+Q" (_to) /* outputs */ \
548 // XC_MEMZERO_256 may be used to clear shorter areas.
551 // - first zeroes a few bytes to align on a HeapWord.
553 // to have their data aligned on HeapWord boundaries.
554 // - then zeroes a few HeapWords to align on a cache line.
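A hedged sketch of that align-then-clear sequence (HeapWord assumed to be 8 bytes and the cache line 256 bytes, as on z/Architecture; each full-line memset stands in for one XC):

#include <cstddef>
#include <cstdint>
#include <cstring>

static void xc_memzero_sketch(void* to, size_t count) {
  char* p = (char*)to;
  // zero a few bytes to align on a HeapWord
  size_t head = (8 - ((uintptr_t)p & 7)) & 7;
  if (head > count) head = count;
  memset(p, 0, head); p += head; count -= head;
  // zero a few HeapWords to align on a cache line
  size_t mid = (256 - ((uintptr_t)p & 255)) & 255;
  if (mid > count) mid = count;
  memset(p, 0, mid); p += mid; count -= mid;
  // clear whole cache lines, then the residue
  while (count >= 256) { memset(p, 0, 256); p += 256; count -= 256; }
  memset(p, 0, count);
}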
565 " LG %[toaddr],%[to] \n\t" /* addr of data area */ \
592 : [to] "+Q" (_to) /* outputs */ \
610 " LG %[toaddr],%[to] \n\t" /* addr of data area */ \
647 : [to] "+Q" (_to) /* outputs */ \
665 static void pd_aligned_disjoint_words(HeapWord* from, HeapWord* to, size_t count) {
675 // 1) copy a short piece of memory to page-align the address(es)
678 // where len is an integral multiple of the page size. In that case, up to 4 cache lines are
683 jbyte* to_bytes = (jbyte*)to;
690 case 1: MOVE8_ATOMIC_1(to,from)
692 case 2: MOVE8_ATOMIC_2(to,from)
694 // case 3: MOVE8_ATOMIC_3(to,from)
696 // case 4: MOVE8_ATOMIC_4(to,from)
700 MVC_MULTI(to,from,len_bytes)
714 *to = *from;
718 *to++ = *from++;
719 *to = *from;
723 *to++ = *from++;
724 *to++ = *from++;
725 *to = *from;
729 *to++ = *from++;
730 *to++ = *from++;
731 *to++ = *from++;
732 *to = *from;
737 *(to++) = *(from++);
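The scattered assignments above suggest a count-specialized fallback: fully unrolled stores for counts 1 through 4 and a simple loop otherwise. A hedged reconstruction of that shape:

#include <cstddef>
#include <cstdint>
typedef uint64_t julong;   // stand-in for HotSpot's julong (one HeapWord)

static void small_disjoint_copy_sketch(julong* from, julong* to, size_t count) {
  switch (count) {
    case 1: *to = *from;
            break;
    case 2: *to++ = *from++;
            *to   = *from;
            break;
    case 3: *to++ = *from++;
            *to++ = *from++;
            *to   = *from;
            break;
    case 4: *to++ = *from++;
            *to++ = *from++;
            *to++ = *from++;
            *to   = *from;
            break;
    default:
      while (count-- > 0) {
        *(to++) = *(from++);   // one doubleword per iteration, each store atomic
      }
  }
}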
743 static void pd_disjoint_words_atomic(HeapWord* from, HeapWord* to, size_t count) {
745 assert(((((size_t)from) & 0x07L) | (((size_t)to) & 0x07L)) == 0, "No atomic copy w/o aligned data");
746 pd_aligned_disjoint_words(from, to, count); // Rare calls -> just delegate.
749 static void pd_disjoint_words(HeapWord* from, HeapWord* to, size_t count) {
751 pd_aligned_disjoint_words(from, to, count); // Rare calls -> just delegate.
759 static void pd_aligned_conjoint_words(HeapWord* from, HeapWord* to, size_t count) {
764 if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerLong)) {
766 case 4: COPY8_ATOMIC_4(to,from)
768 case 3: COPY8_ATOMIC_3(to,from)
770 case 2: COPY8_ATOMIC_2(to,from)
772 case 1: COPY8_ATOMIC_1(to,from)
777 to += count_in;
779 *(--to) = *(--from); // Copy backwards, areas overlap destructively.
784 jbyte* to_bytes = (jbyte*)to;
791 if (has_destructive_overlap((char*)from, (char*)to, count*BytesPerLong)) {
798 *to = *from;
803 *to = *from;
804 *(to+1) = t1;
810 *to = *from;
811 *(to+1) = t1;
812 *(to+2) = t2;
819 *to = *from;
820 *(to+1) = t1;
821 *(to+2) = t2;
822 *(to+3) = t3;
827 to += count;
829 *(--to) = *(--from); // Copy backwards, areas overlap destructively.
835 pd_aligned_disjoint_words(from, to, count);
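For small overlapping counts, the temporaries (t1 through t3 in the fragments above) are loaded before any store is issued, so no store can clobber a source word that is still unread; larger counts fall back to a plain backward loop. A hedged reconstruction:

#include <cstddef>
#include <cstdint>
typedef uint64_t julong;   // stand-in for HotSpot's julong (one HeapWord)

static void small_conjoint_copy_sketch(julong* from, julong* to, size_t count) {
  julong t1, t2, t3;
  switch (count) {
    case 1: *to = *from;
            break;
    case 2: t1 = *(from+1);              // read everything first ...
            *to     = *from;             // ... then write; overlap is harmless
            *(to+1) = t1;
            break;
    case 3: t1 = *(from+1); t2 = *(from+2);
            *to     = *from;
            *(to+1) = t1;
            *(to+2) = t2;
            break;
    case 4: t1 = *(from+1); t2 = *(from+2); t3 = *(from+3);
            *to     = *from;
            *(to+1) = t1;
            *(to+2) = t2;
            *(to+3) = t3;
            break;
    default:
      to   += count;                     // copy backwards,
      from += count;                     // areas overlap destructively
      while (count-- > 0) { *(--to) = *(--from); }
  }
}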
839 static void pd_conjoint_words(HeapWord* from, HeapWord* to, size_t count) {
842 pd_aligned_conjoint_words(from, to, count);
845 static void pd_conjoint_bytes(void* from, void* to, size_t count) {
849 if (has_destructive_overlap((char*)from, (char*)to, count_in))
850 (void)memmove(to, from, count_in);
852 jbyte* to_bytes = (jbyte*)to;
858 if (has_destructive_overlap((char*)from, (char*)to, count))
859 (void)memmove(to, from, count);
861 (void)memcpy(to, from, count);
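The byte variant simply selects the library routine based on the overlap test, since single-byte accesses are atomic by nature. A minimal sketch of that decision, with the predicate inlined from has_destructive_overlap() above:

#include <cstddef>
#include <cstring>

static void conjoint_bytes_sketch(const void* from, void* to, size_t count) {
  const char* f = (const char*)from;
  char*       t = (char*)to;
  if ((f < t) && ((t - f) < (ptrdiff_t)count)) {
    memmove(to, from, count);   // destructive overlap: memmove handles it
  } else {
    memcpy(to, from, count);    // disjoint or harmless overlap: memcpy suffices
  }
}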
869 static void pd_conjoint_bytes_atomic(void* from, void* to, size_t count) {
870 // Call arraycopy stubs to do the job.
871 pd_conjoint_bytes(from, to, count); // bytes are always accessed atomically.
874 static void pd_conjoint_jshorts_atomic(jshort* from, jshort* to, size_t count) {
878 if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerShort)) {
880 copy_conjoint_jshorts_atomic(from, to, count);
882 jbyte* to_bytes = (jbyte*)to;
889 copy_conjoint_jshorts_atomic(from, to, count);
893 static void pd_conjoint_jints_atomic(jint* from, jint* to, size_t count) {
897 if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerInt)) {
899 case 4: COPY4_ATOMIC_4(to,from)
901 case 3: COPY4_ATOMIC_3(to,from)
903 case 2: COPY4_ATOMIC_2(to,from)
905 case 1: COPY4_ATOMIC_1(to,from)
910 copy_conjoint_jints_atomic(from, to, count_in);
915 jbyte* to_bytes = (jbyte*)to;
921 copy_conjoint_jints_atomic(from, to, count);
925 static void pd_conjoint_jlongs_atomic(jlong* from, jlong* to, size_t count) {
929 if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerLong)) {
931 case 4: COPY8_ATOMIC_4(to,from) return;
932 case 3: COPY8_ATOMIC_3(to,from) return;
933 case 2: COPY8_ATOMIC_2(to,from) return;
934 case 1: COPY8_ATOMIC_1(to,from) return;
938 to += count_in;
939 while (count_in-- > 0) { *(--to) = *(--from); } // Copy backwards, areas overlap destructively.
944 jbyte* to_bytes = (jbyte*)to;
950 if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerLong)) {
953 to += count_in;
955 *(--to) = *(--from); // Copy backwards, areas overlap destructively.
960 to += count_in-1;
962 *(to--) = *(from--);
966 *to = *from;
967 *(to-1) = *(from-1);
968 to -= 2;
973 pd_aligned_disjoint_words((HeapWord*)from, (HeapWord*)to, count_in); // rare calls -> just delegate.
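The second backward variant visible above unrolls the loop by two, after peeling a single element when the count is odd. A hedged reconstruction (count assumed greater than zero):

#include <cstddef>
#include <cstdint>
typedef int64_t jlong;   // stand-in for HotSpot's jlong

static void conjoint_jlongs_backward_sketch(jlong* from, jlong* to, size_t count) {
  from += count - 1;               // start at the last element
  to   += count - 1;
  if (count & 1) {                 // odd count: peel one element
    *(to--) = *(from--);
  }
  count >>= 1;                     // remaining pairs
  while (count-- > 0) {
    *to     = *from;               // copy two doublewords per iteration,
    *(to-1) = *(from-1);           // still back to front
    to   -= 2;
    from -= 2;
  }
}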
977 static void pd_conjoint_oops_atomic(oop* from, oop* to, size_t count) {
981 if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerOop)) {
983 case 4: COPY8_ATOMIC_4(to,from) return;
984 case 3: COPY8_ATOMIC_3(to,from) return;
985 case 2: COPY8_ATOMIC_2(to,from) return;
986 case 1: COPY8_ATOMIC_1(to,from) return;
990 to += count_in;
991 while (count_in-- > 0) { *(--to) = *(--from); } // Copy backwards, areas overlap destructively.
996 jbyte* to_bytes = (jbyte*)to;
1002 if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerOop)) {
1004 to += count_in;
1005 while (count_in-- > 0) *(--to) = *(--from); // Copy backwards, areas overlap destructively.
1009 pd_aligned_disjoint_words((HeapWord*)from, (HeapWord*)to, count_in); // rare calls -> just delegate.
1014 static void pd_arrayof_conjoint_bytes(HeapWord* from, HeapWord* to, size_t count) {
1015 pd_conjoint_bytes_atomic(from, to, count);
1018 static void pd_arrayof_conjoint_jshorts(HeapWord* from, HeapWord* to, size_t count) {
1019 pd_conjoint_jshorts_atomic((jshort*)from, (jshort*)to, count);
1022 static void pd_arrayof_conjoint_jints(HeapWord* from, HeapWord* to, size_t count) {
1023 pd_conjoint_jints_atomic((jint*)from, (jint*)to, count);
1026 static void pd_arrayof_conjoint_jlongs(HeapWord* from, HeapWord* to, size_t count) {
1027 pd_conjoint_jlongs_atomic((jlong*)from, (jlong*)to, count);
1030 static void pd_arrayof_conjoint_oops(HeapWord* from, HeapWord* to, size_t count) {
1031 pd_conjoint_oops_atomic((oop*)from, (oop*)to, count);
1038 static void pd_fill_to_bytes(void* to, size_t count, jubyte value) {
1041 // Initialize storage to a given value. Use memset instead of a copy loop.
1043 // 1) initialize a short piece of memory to page-align the address
1046 // where len is an integral multiple of the page size. In that case, up to 4 cache lines are
1050 // If atomicity is a problem, we have to prevent gcc optimization. Best workaround: inline asm.
1052 jbyte* to_bytes = (jbyte*)to;
1061 (void)memset(to, value, count);
1076 julong* to = (julong*) tohw;
1079 *to++ = v;
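From these two fragments, pd_fill_to_words apparently replicates a 32-bit fill value into a doubleword and then stores whole HeapWords; a hedged sketch:

#include <cstddef>
#include <cstdint>
typedef uint32_t juint;    // stand-ins for HotSpot's juint/julong
typedef uint64_t julong;

static void fill_to_words_sketch(julong* tohw, size_t count, juint value) {
  julong v = ((julong)value << 32) | value;   // value in both halves
  julong* to = tohw;
  while (count-- > 0) {
    *to++ = v;                                // one aligned doubleword store each
  }
}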
1092 // Delegate to pd_zero_to_bytes. It is HeapWord-atomic as well.
1098 // Delegate to pd_zero_to_bytes. It is HeapWord-atomic as well.
1104 static void pd_zero_to_bytes(void* to, size_t count) {
1112 // a left-to-right direction and may appear to be stored
1114 // Therefore, the implementation was changed to use (multiple) XC instructions.
1117 jbyte* to_bytes = (jbyte*)to;
1130 (void)memset(to, 0, count);