/*
 * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2016 SAP SE. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

// Major contributions by LS

#ifndef CPU_S390_VM_COPY_S390_HPP
#define CPU_S390_VM_COPY_S390_HPP
// Inline functions for memory copy and fill.

// HeapWordSize (the size of class HeapWord) is 8 bytes (the size of a
// pointer variable), since we always run the _LP64 model. As a consequence,
// HeapWord* memory ranges are always assumed to be doubleword-aligned,
// with a size that is an integer multiple of HeapWordSize.
//
// Dealing only with doubleword-aligned doubleword units has important
// performance and data access benefits. Many of the move
// instructions perform particularly well under these circumstances.
// Data access is "doubleword-concurrent", except for MVC and XC.
// Furthermore, data access can be forced to be sequential (MVCL and MVCLE)
// by use of the special padding byte 0xb1, where required. For copying,
// we use padding byte 0xb0 to prevent the D-cache from being polluted.
//
// On z/Architecture, gcc optimizes memcpy into a series of MVC instructions.
// This is optimal, even if just one HeapWord is copied. However, MVC
// copying is not atomic, i.e. not "doubleword concurrent" by definition.
//
// If the -mmvcle compiler option is specified, memcpy translates into
// code such that the entire memory range is copied or preset with just
// one MVCLE instruction.
//
// *to = *from is transformed into an MVC instruction already with -O1.
// Thus, for atomic copy operations, (inline) assembler code is required
// to guarantee atomic data accesses.
//
// For large (len >= MVCLEThreshold) chunks of memory, we exploit
// special H/W support of z/Architecture:
// 1) copy short piece of memory to page-align address(es)
// 2) copy largest part (all contained full pages) of memory using mvcle instruction.
//    z/Architecture processors have special H/W support for page-aligned storage
//    where len is an integer multiple of page size. In that case, up to 4 cache lines are
//    processed in parallel and L1 cache is not polluted.
// 3) copy the remaining piece of memory.
//
//  Measurement classifications:
//  very rare - <=     10,000 calls AND <=     1,000 usec elapsed
//       rare - <=    100,000 calls AND <=    10,000 usec elapsed
//       some - <=  1,000,000 calls AND <=   100,000 usec elapsed
//       freq - <= 10,000,000 calls AND <= 1,000,000 usec elapsed
//  very freq - >  10,000,000 calls OR  >  1,000,000 usec elapsed

#undef USE_INLINE_ASM
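
// Note: USE_INLINE_ASM is left undefined here, so the pd_* routines below
// compile to their portable C++ fallback paths. Defining it would enable the
// z/Architecture inline-assembler variants guarded by #ifdef USE_INLINE_ASM.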

static void copy_conjoint_jshorts_atomic(jshort* from, jshort* to, size_t count) {
  if (from > to) {
    while (count-- > 0) {
      // Copy forwards
      *to++ = *from++;
    }
  } else {
    from += count - 1;
    to   += count - 1;
    while (count-- > 0) {
      // Copy backwards
      *to-- = *from--;
    }
  }
}

static void copy_conjoint_jints_atomic(jint* from, jint* to, size_t count) {
  if (from > to) {
    while (count-- > 0) {
      // Copy forwards
      *to++ = *from++;
    }
  } else {
    from += count - 1;
    to   += count - 1;
    while (count-- > 0) {
      // Copy backwards
      *to-- = *from--;
    }
  }
}

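// A copy overlaps destructively iff the destination starts inside the source
// range. Illustrative example: from == 0x1000, to == 0x1004, byte_count == 8
// gives to - from == 4 < 8, so a plain forward byte copy would overwrite
// source bytes before they are read; such cases must be copied backwards.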
static bool has_destructive_overlap(char* from, char* to, size_t byte_count) {
  return (from < to) && ((to-from) < (ptrdiff_t)byte_count);
}

#ifdef USE_INLINE_ASM

  //--------------------------------------------------------------
  // Atomic copying. Atomicity is given by the minimum of source
  // and target alignment. Refer to mail communication with
  // Tim Slegel/IBM. Only usable for disjoint source and target.
  //--------------------------------------------------------------
  #define MOVE8_ATOMIC_4(_to,_from) {                            \
    unsigned long toaddr;                                        \
    unsigned long fromaddr;                                      \
    asm(                                                         \
      "LG      %[toaddr],%[to]     \n\t" /* address of to area   */ \
      "LG      %[fromaddr],%[from] \n\t" /* address of from area */ \
      "MVC     0(32,%[toaddr]),0(%[fromaddr]) \n\t" /* move data */ \
      : [to]       "+Q"  (_to)          /* outputs   */          \
      , [from]     "+Q"  (_from)                                 \
      , [toaddr]   "=a"  (toaddr)                                \
      , [fromaddr] "=a"  (fromaddr)                              \
      :                                                          \
      : "cc"                            /* clobbered */          \
    );                                                           \
  }
  #define MOVE8_ATOMIC_3(_to,_from) {                            \
    unsigned long toaddr;                                        \
    unsigned long fromaddr;                                      \
    asm(                                                         \
      "LG      %[toaddr],%[to]     \n\t" /* address of to area   */ \
      "LG      %[fromaddr],%[from] \n\t" /* address of from area */ \
      "MVC     0(24,%[toaddr]),0(%[fromaddr]) \n\t" /* move data */ \
      : [to]       "+Q"  (_to)          /* outputs   */          \
      , [from]     "+Q"  (_from)                                 \
      , [toaddr]   "=a"  (toaddr)                                \
      , [fromaddr] "=a"  (fromaddr)                              \
      :                                                          \
      : "cc"                            /* clobbered */          \
    );                                                           \
  }
  #define MOVE8_ATOMIC_2(_to,_from) {                            \
    unsigned long toaddr;                                        \
    unsigned long fromaddr;                                      \
    asm(                                                         \
      "LG      %[toaddr],%[to]     \n\t" /* address of to area   */ \
      "LG      %[fromaddr],%[from] \n\t" /* address of from area */ \
      "MVC     0(16,%[toaddr]),0(%[fromaddr]) \n\t" /* move data */ \
      : [to]       "+Q"  (_to)          /* outputs   */          \
      , [from]     "+Q"  (_from)                                 \
      , [toaddr]   "=a"  (toaddr)                                \
      , [fromaddr] "=a"  (fromaddr)                              \
      :                                                          \
      : "cc"                            /* clobbered */          \
    );                                                           \
  }
  #define MOVE8_ATOMIC_1(_to,_from) {                            \
    unsigned long toaddr;                                        \
    unsigned long fromaddr;                                      \
    asm(                                                         \
      "LG      %[toaddr],%[to]     \n\t" /* address of to area   */ \
      "LG      %[fromaddr],%[from] \n\t" /* address of from area */ \
      "MVC     0(8,%[toaddr]),0(%[fromaddr]) \n\t"  /* move data */ \
      : [to]       "+Q"  (_to)          /* outputs   */          \
      , [from]     "+Q"  (_from)                                 \
      , [toaddr]   "=a"  (toaddr)                                \
      , [fromaddr] "=a"  (fromaddr)                              \
      :                                                          \
      : "cc"                            /* clobbered */          \
    );                                                           \
  }

  //--------------------------------------------------------------
  // Atomic copying of 8-byte entities.
  // Conjoint/disjoint property does not matter. Entities are first
  // loaded and then stored.
  // _to and _from must be 8-byte aligned.
  //--------------------------------------------------------------
  #define COPY8_ATOMIC_4(_to,_from) {                            \
    unsigned long toaddr;                                        \
    asm(                                                         \
      "LG      3,%[from]        \n\t" /* address of from area */ \
      "LG      %[toaddr],%[to]  \n\t" /* address of to area   */ \
      "LMG     0,3,0(3)         \n\t" /* load data            */ \
      "STMG    0,3,0(%[toaddr]) \n\t" /* store data           */ \
      : [to]     "+Q"  (_to)          /* outputs   */            \
      , [from]   "+Q"  (_from)        /* outputs   */            \
      , [toaddr] "=a"  (toaddr)       /* temp      */            \
      :                                                          \
      : "cc",  "r0", "r1", "r2", "r3" /* clobbered */            \
    );                                                           \
  }
  #define COPY8_ATOMIC_3(_to,_from) {                            \
    unsigned long toaddr;                                        \
    asm(                                                         \
      "LG      2,%[from]        \n\t" /* address of from area */ \
      "LG      %[toaddr],%[to]  \n\t" /* address of to area   */ \
      "LMG     0,2,0(2)         \n\t" /* load data            */ \
      "STMG    0,2,0(%[toaddr]) \n\t" /* store data           */ \
      : [to]     "+Q"  (_to)          /* outputs   */            \
      , [from]   "+Q"  (_from)        /* outputs   */            \
      , [toaddr] "=a"  (toaddr)       /* temp      */            \
      :                                                          \
      : "cc",  "r0", "r1", "r2"       /* clobbered */            \
    );                                                           \
  }
  #define COPY8_ATOMIC_2(_to,_from) {                            \
    unsigned long toaddr;                                        \
    asm(                                                         \
      "LG      1,%[from]        \n\t" /* address of from area */ \
      "LG      %[toaddr],%[to]  \n\t" /* address of to area   */ \
      "LMG     0,1,0(1)         \n\t" /* load data            */ \
      "STMG    0,1,0(%[toaddr]) \n\t" /* store data           */ \
      : [to]     "+Q"  (_to)          /* outputs   */            \
      , [from]   "+Q"  (_from)        /* outputs   */            \
      , [toaddr] "=a"  (toaddr)       /* temp      */            \
      :                                                          \
      : "cc",  "r0", "r1"             /* clobbered */            \
    );                                                           \
  }
  #define COPY8_ATOMIC_1(_to,_from) {                            \
    unsigned long addr;                                          \
    asm(                                                         \
      "LG      %[addr],%[from]  \n\t" /* address of from area */ \
      "LG      0,0(0,%[addr])   \n\t" /* load data            */ \
      "LG      %[addr],%[to]    \n\t" /* address of to area   */ \
      "STG     0,0(0,%[addr])   \n\t" /* store data           */ \
      : [to]     "+Q"  (_to)          /* outputs   */            \
      , [from]   "+Q"  (_from)        /* outputs   */            \
      , [addr]   "=a"  (addr)         /* temp      */            \
      :                                                          \
      : "cc",  "r0"                   /* clobbered */            \
    );                                                           \
  }

  //--------------------------------------------------------------
  // Atomic copying of 4-byte entities.
  // The macro suffix (_1 .. _4) gives the number of entities copied.
  // Conjoint/disjoint property does not matter. Entities are first
  // loaded and then stored.
  // _to and _from must be 4-byte aligned.
  //--------------------------------------------------------------
  #define COPY4_ATOMIC_4(_to,_from) {                            \
    unsigned long toaddr;                                        \
    asm(                                                         \
      "LG      3,%[from]        \n\t" /* address of from area */ \
      "LG      %[toaddr],%[to]  \n\t" /* address of to area   */ \
      "LM      0,3,0(3)         \n\t" /* load data            */ \
      "STM     0,3,0(%[toaddr]) \n\t" /* store data           */ \
      : [to]     "+Q"  (_to)          /* outputs   */            \
      , [from]   "+Q"  (_from)        /* outputs   */            \
      , [toaddr] "=a"  (toaddr)       /* temp      */            \
      :                                                          \
      : "cc",  "r0", "r1", "r2", "r3" /* clobbered */            \
    );                                                           \
  }
  #define COPY4_ATOMIC_3(_to,_from) {                            \
    unsigned long toaddr;                                        \
    asm(                                                         \
      "LG      2,%[from]        \n\t" /* address of from area */ \
      "LG      %[toaddr],%[to]  \n\t" /* address of to area   */ \
      "LM      0,2,0(2)         \n\t" /* load data            */ \
      "STM     0,2,0(%[toaddr]) \n\t" /* store data           */ \
      : [to]     "+Q"  (_to)          /* outputs   */            \
      , [from]   "+Q"  (_from)        /* outputs   */            \
      , [toaddr] "=a"  (toaddr)       /* temp      */            \
      :                                                          \
      : "cc",  "r0", "r1", "r2"       /* clobbered */            \
    );                                                           \
  }
  #define COPY4_ATOMIC_2(_to,_from) {                            \
    unsigned long toaddr;                                        \
    asm(                                                         \
      "LG      1,%[from]        \n\t" /* address of from area */ \
      "LG      %[toaddr],%[to]  \n\t" /* address of to area   */ \
      "LM      0,1,0(1)         \n\t" /* load data            */ \
      "STM     0,1,0(%[toaddr]) \n\t" /* store data           */ \
      : [to]     "+Q"  (_to)          /* outputs   */            \
      , [from]   "+Q"  (_from)        /* outputs   */            \
      , [toaddr] "=a"  (toaddr)       /* temp      */            \
      :                                                          \
      : "cc",  "r0", "r1"             /* clobbered */            \
    );                                                           \
  }
  #define COPY4_ATOMIC_1(_to,_from) {                            \
    unsigned long addr;                                          \
    asm(                                                         \
      "LG      %[addr],%[from]  \n\t" /* address of from area */ \
      "L       0,0(0,%[addr])   \n\t" /* load data            */ \
      "LG      %[addr],%[to]    \n\t" /* address of to area   */ \
      "ST      0,0(0,%[addr])   \n\t" /* store data           */ \
      : [to]     "+Q"  (_to)          /* outputs   */            \
      , [from]   "+Q"  (_from)        /* outputs   */            \
      , [addr]   "=a"  (addr)         /* temp      */            \
      :                                                          \
      : "cc",  "r0"                   /* clobbered */            \
    );                                                           \
  }

#if 0  // Waiting for gcc to support EXRL.
  #define MVC_MEMCOPY(_to,_from,_len)                                \
    if (VM_Version::has_ExecuteExtensions()) {                       \
      asm("\t"                                                       \
      "    LAY     1,-1(0,%[len])      \n\t" /* decr for MVC  */     \
      "    EXRL    1,1f                \n\t" /* execute MVC instr */ \
      "    BRC     15,2f               \n\t" /* skip template */     \
      "1:  MVC     0(%[len],%[to]),0(%[from]) \n\t"                  \
      "2:  BCR     0,0                 \n\t"                         \
      : [to]   "+Q"  (_to)             /* outputs   */               \
      , [from] "+Q"  (_from)           /* outputs   */               \
      : [len]  "r"   (_len)            /* inputs    */               \
      : "cc",  "r1"                    /* clobbered */               \
      );                                                             \
    } else {                                                         \
      asm("\t"                                                       \
      "    LARL    2,3f                \n\t"                         \
      "    LAY     1,-1(0,%[len])      \n\t" /* decr for MVC  */     \
      "    EX      1,0(2)              \n\t" /* execute MVC instr */ \
      "    BRC     15,4f               \n\t" /* skip template */     \
      "3:  MVC     0(%[len],%[to]),0(%[from])  \n\t"                 \
      "4:  BCR     0,0                 \n\t"                         \
      : [to]   "+Q"  (_to)             /* outputs   */               \
      , [from] "+Q"  (_from)           /* outputs   */               \
      : [len]  "r"   (_len)            /* inputs    */               \
      : "cc",  "r1", "r2"              /* clobbered */               \
      );                                                             \
    }
#else
  #define MVC_MEMCOPY(_to,_from,_len)                                \
  { unsigned long toaddr;   unsigned long tolen;                     \
    unsigned long fromaddr; unsigned long target;                    \
      asm("\t"                                                       \
      "    LTGR    %[tolen],%[len]     \n\t" /* copy, test len */    \
      "    BRC     8,2f                \n\t" /* do nothing for l=0*/ \
      "    AGHI    %[tolen],-1         \n\t" /* decr for EX MVC */   \
      "    LG      %[toaddr],%[to]     \n\t"                         \
      "    LG      %[fromaddr],%[from] \n\t"                         \
      "    LARL    %[target],1f        \n\t" /* addr of MVC instr */ \
      "    EX      %[tolen],0(%[target])         \n\t" /* execute MVC instr */ \
      "    BRC     15,2f                         \n\t" /* skip template */     \
      "1:  MVC     0(1,%[toaddr]),0(%[fromaddr]) \n\t"                         \
      "2:  BCR     0,0                 \n\t" /* nop a branch target*/\
      : [to]       "+Q"  (_to)         /* outputs   */               \
      , [from]     "+Q"  (_from)                                     \
      , [tolen]    "=a"  (tolen)                                     \
      , [toaddr]   "=a"  (toaddr)                                    \
      , [fromaddr] "=a"  (fromaddr)                                  \
      , [target]   "=a"  (target)                                    \
      : [len]       "r"  (_len)        /* inputs    */               \
      : "cc"                           /* clobbered */               \
      );                                                             \
  }
#endif

  #if 0  // code snippet to be used for debugging
      /* ASSERT code BEGIN */                                                \
      "    LARL    %[len],5f       \n\t"                                     \
      "    LARL    %[mta],4f       \n\t"                                     \
      "    SLGR    %[len],%[mta]   \n\t"                                     \
      "    CGHI    %[len],16       \n\t"                                     \
      "    BRC     7,9f            \n\t"      /* block size !=  16 */        \
                                                                             \
      "    LARL    %[len],1f       \n\t"                                     \
      "    SLGR    %[len],%[mta]   \n\t"                                     \
      "    CGHI    %[len],256      \n\t"                                     \
      "    BRC     7,9f            \n\t"      /* list len   != 256 */        \
                                                                             \
      "    LGR     0,0             \n\t"      /* artificial SIGILL */        \
      "9:  BRC     7,-2            \n\t"                                     \
      "    LARL    %[mta],1f       \n\t"      /* restore MVC table begin */  \
      /* ASSERT code END   */
  #endif

  // Optimized copying for data less than 4k
  // - no destructive overlap
  // - 0 <= _n_bytes <= 4096
  // This macro needs to be gcc-compiled with -march=z990. Otherwise, the
  // LAY instruction is not available.
  #define MVC_MULTI(_to,_from,_n_bytes)                                      \
  { unsigned long toaddr;                                                    \
    unsigned long fromaddr;                                                  \
    unsigned long movetable;                                                 \
    unsigned long len;                                                       \
      asm("\t"                                                               \
      "    LTGFR   %[len],%[nby]   \n\t"                                     \
      "    LG      %[ta],%[to]     \n\t"      /* address of to area   */     \
      "    BRC     8,1f            \n\t"      /* nothing to copy   */        \
                                                                             \
      "    NILL    %[nby],255      \n\t"      /* # bytes mod 256      */     \
      "    LG      %[fa],%[from]   \n\t"      /* address of from area */     \
      "    BRC     8,3f            \n\t"      /* no rest, skip copying */    \
                                                                             \
      "    LARL    %[mta],2f       \n\t"      /* MVC template addr */        \
      "    AHI     %[nby],-1       \n\t"      /* adjust for EX MVC  */       \
                                                                             \
      "    EX      %[nby],0(%[mta]) \n\t"     /* only rightmost */           \
                                              /* 8 bits of nby used */       \
      /* Since nby is <= 4096 on entry to this code, we need    */           \
      /* no zero extension before using it in addr calc.        */           \
      "    LA      %[fa],1(%[nby],%[fa]) \n\t"/* adjust from addr */         \
      "    LA      %[ta],1(%[nby],%[ta]) \n\t"/* adjust to   addr */         \
                                                                             \
      "3:  SRAG    %[nby],%[len],8 \n\t"      /* # cache lines     */        \
      "    LARL    %[mta],1f       \n\t"      /* MVC table begin   */        \
      "    BRC     8,1f            \n\t"      /* nothing to copy   */        \
                                                                             \
      /* Insert ASSERT code here if required. */                             \
                                                                             \
                                                                             \
      "    LNGFR   %[nby],%[nby]   \n\t"      /* negative offset into     */ \
      "    SLLG    %[nby],%[nby],4 \n\t"      /* MVC table 16-byte blocks */ \
      "    BC      15,0(%[nby],%[mta]) \n\t"  /* branch to block #ncl  */    \
                                                                             \
      "2:  MVC     0(1,%[ta]),0(%[fa]) \n\t"  /* MVC template */             \
                                                                             \
      "4:  MVC     0(256,%[ta]),0(%[fa])   \n\t" /* 4096 == l        */      \
      "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
      "    LA      %[fa],256(0,%[fa])      \n\t"                             \
      "5:  MVC     0(256,%[ta]),0(%[fa])   \n\t" /* 3840 <= l < 4096 */      \
      "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
      "    LA      %[fa],256(0,%[fa])      \n\t"                             \
      "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /* 3584 <= l < 3840 */      \
      "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
      "    LA      %[fa],256(0,%[fa])      \n\t"                             \
      "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /* 3328 <= l < 3584 */      \
      "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
      "    LA      %[fa],256(0,%[fa])      \n\t"                             \
      "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /* 3072 <= l < 3328 */      \
      "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
      "    LA      %[fa],256(0,%[fa])      \n\t"                             \
      "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /* 2816 <= l < 3072 */      \
      "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
      "    LA      %[fa],256(0,%[fa])      \n\t"                             \
      "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /* 2560 <= l < 2816 */      \
      "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
      "    LA      %[fa],256(0,%[fa])      \n\t"                             \
      "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /* 2304 <= l < 2560 */      \
      "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
      "    LA      %[fa],256(0,%[fa])      \n\t"                             \
      "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /* 2048 <= l < 2304 */      \
      "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
      "    LA      %[fa],256(0,%[fa])      \n\t"                             \
      "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /* 1792 <= l < 2048 */      \
      "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
      "    LA      %[fa],256(0,%[fa])      \n\t"                             \
      "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /* 1536 <= l < 1792 */      \
      "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
      "    LA      %[fa],256(0,%[fa])      \n\t"                             \
      "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /* 1280 <= l < 1536 */      \
      "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
      "    LA      %[fa],256(0,%[fa])      \n\t"                             \
      "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /* 1024 <= l < 1280 */      \
      "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
      "    LA      %[fa],256(0,%[fa])      \n\t"                             \
      "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /*  768 <= l < 1024 */      \
      "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
      "    LA      %[fa],256(0,%[fa])      \n\t"                             \
      "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /*  512 <= l <  768 */      \
      "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
      "    LA      %[fa],256(0,%[fa])      \n\t"                             \
      "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /*  256 <= l <  512 */      \
      "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
      "    LA      %[fa],256(0,%[fa])      \n\t"                             \
      "1:  BCR     0,0                     \n\t" /* nop as branch target */  \
      : [to]       "+Q"  (_to)          /* outputs   */          \
      , [from]     "+Q"  (_from)                                 \
      , [ta]       "=a"  (toaddr)                                \
      , [fa]       "=a"  (fromaddr)                              \
      , [mta]      "=a"  (movetable)                             \
      , [nby]      "+a"  (_n_bytes)                              \
      , [len]      "=a"  (len)                                   \
      :                                                          \
      : "cc"                            /* clobbered */          \
    );                                                           \
  }
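
  // MVCLE operates on even/odd register pairs: in the macros below, R0/R1
  // hold the destination address and length, R2/R3 the source address and
  // length. The low byte of the last operand supplies the padding byte;
  // 176 (0xb0) in MVCLE_MEMCOPY requests the cache-bypassing store behavior
  // mentioned in the file header.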

  #define MVCLE_MEMCOPY(_to,_from,_len)                           \
    asm(                                                          \
      "    LG      0,%[to]     \n\t"   /* address of to area   */ \
      "    LG      2,%[from]   \n\t"   /* address of from area */ \
      "    LGR     1,%[len]    \n\t"   /* len of to area       */ \
      "    LGR     3,%[len]    \n\t"   /* len of from area     */ \
      "1:  MVCLE   0,2,176     \n\t"   /* copy storage, bypass cache (0xb0) */ \
      "    BRC     1,1b        \n\t"   /* retry if interrupted */ \
      : [to]   "+Q"  (_to)             /* outputs   */            \
      , [from] "+Q"  (_from)           /* outputs   */            \
      : [len]  "r"   (_len)            /* inputs    */            \
      : "cc",  "r0", "r1", "r2", "r3"  /* clobbered */            \
    );

  #define MVCLE_MEMINIT(_to,_val,_len)                            \
    asm(                                                          \
      "    LG      0,%[to]       \n\t" /* address of to area   */ \
      "    LGR     1,%[len]      \n\t" /* len of to area       */ \
      "    XGR     3,3           \n\t" /* from area len = 0    */ \
      "1:  MVCLE   0,2,0(%[val]) \n\t" /* init storage         */ \
      "    BRC     1,1b          \n\t" /* retry if interrupted */ \
      : [to]   "+Q"  (_to)             /* outputs   */            \
      : [len]  "r"   (_len)            /* inputs    */            \
      , [val]  "r"   (_val)            /* inputs    */            \
      : "cc",  "r0", "r1", "r3"        /* clobbered */            \
    );
  #define MVCLE_MEMZERO(_to,_len)                                 \
    asm(                                                          \
      "    LG      0,%[to]       \n\t" /* address of to area   */ \
      "    LGR     1,%[len]      \n\t" /* len of to area       */ \
      "    XGR     3,3           \n\t" /* from area len = 0    */ \
      "1:  MVCLE   0,2,0         \n\t" /* clear storage        */ \
      "    BRC     1,1b          \n\t" /* retry if interrupted */ \
      : [to]   "+Q"  (_to)             /* outputs   */            \
      : [len]  "r"   (_len)            /* inputs    */            \
      : "cc",  "r0", "r1", "r3"        /* clobbered */            \
    );

  // Clear a stretch of memory, 0 <= _len <= 256.
  // There is no alignment prereq.
  // There is no test for len out of range specified above.
  #define XC_MEMZERO_256(_to,_len)                                 \
{ unsigned long toaddr;   unsigned long tolen;                     \
  unsigned long target;                                            \
    asm("\t"                                                       \
    "    LTGR    %[tolen],%[len]     \n\t" /* copy, test len */    \
    "    BRC     8,2f                \n\t" /* do nothing for l=0*/ \
    "    AGHI    %[tolen],-1         \n\t" /* adjust for EX XC  */ \
    "    LARL    %[target],1f        \n\t" /* addr of XC instr  */ \
    "    LG      %[toaddr],%[to]     \n\t" /* addr of data area */ \
    "    EX      %[tolen],0(%[target])       \n\t" /* execute XC instr  */ \
    "    BRC     15,2f                       \n\t" /* skip template */     \
    "1:  XC      0(1,%[toaddr]),0(%[toaddr]) \n\t"                         \
    "2:  BCR     0,0                 \n\t" /* nop a branch target*/\
    : [to]       "+Q"  (_to)         /* outputs   */               \
    , [tolen]    "=a"  (tolen)                                     \
    , [toaddr]   "=a"  (toaddr)                                    \
    , [target]   "=a"  (target)                                    \
    : [len]       "r"  (_len)        /* inputs    */               \
    : "cc"                           /* clobbered */               \
    );                                                             \
}
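
  // Note on the EX/XC idiom above: the length field of an SS-format
  // instruction encodes (byte count - 1), which is why _len is decremented
  // by one before being EXecuted into the XC template. XC of an area with
  // itself zeroes that area.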

  // Clear a stretch of memory, 256 < _len.
  // XC_MEMZERO_256 may be used to clear shorter areas.
  //
  // The code
  // - first zeroes a few bytes to align on a HeapWord.
  //   This step is currently inactive because all calls seem
  //   to have their data aligned on HeapWord boundaries.
  // - then zeroes a few HeapWords to align on a cache line.
  // - then zeroes entire cache lines in a loop.
  // - then zeroes the remaining (partial) cache line.
#if 1
  #define XC_MEMZERO_ANY(_to,_len)                                    \
{ unsigned long toaddr;   unsigned long tolen;                        \
  unsigned long len8;     unsigned long len256;                       \
  unsigned long target;   unsigned long lenx;                         \
    asm("\t"                                                          \
    "    LTGR    %[tolen],%[len]      \n\t" /*                   */   \
    "    BRC     8,2f                 \n\t" /* do nothing for l=0*/   \
    "    LG      %[toaddr],%[to]      \n\t" /* addr of data area */   \
    "    LARL    %[target],1f         \n\t" /* addr of XC instr  */   \
    " "                                                               \
    "    LCGR    %[len256],%[toaddr]  \n\t" /* cache line alignment */\
    "    NILL    %[len256],0xff       \n\t"                           \
    "    BRC     8,4f                 \n\t" /* already aligned     */ \
    "    NILH    %[len256],0x00       \n\t" /* zero extend         */ \
    "    LLGFR   %[len256],%[len256]  \n\t"                           \
    "    LAY     %[lenx],-1(,%[len256]) \n\t"                         \
    "    EX      %[lenx],0(%[target]) \n\t" /* execute XC instr    */ \
    "    LA      %[toaddr],0(%[len256],%[toaddr]) \n\t"               \
    "    SGR     %[tolen],%[len256]   \n\t" /* adjust len          */ \
    " "                                                               \
    "4:  SRAG    %[lenx],%[tolen],8   \n\t" /* # cache lines       */ \
    "    BRC     8,6f                 \n\t" /* no full cache lines */ \
    "5:  XC      0(256,%[toaddr]),0(%[toaddr]) \n\t"                  \
    "    LA      %[toaddr],256(,%[toaddr]) \n\t"                      \
    "    BRCTG   %[lenx],5b           \n\t" /* iterate             */ \
    " "                                                               \
    "6:  NILL    %[tolen],0xff        \n\t" /* leftover bytes      */ \
    "    BRC     8,2f                 \n\t" /* done if none        */ \
    "    LAY     %[lenx],-1(,%[tolen]) \n\t"                          \
    "    EX      %[lenx],0(%[target]) \n\t" /* execute XC instr    */ \
    "    BRC     15,2f                \n\t" /* skip template       */ \
    " "                                                               \
    "1:  XC      0(1,%[toaddr]),0(%[toaddr]) \n\t"                    \
    "2:  BCR     0,0                  \n\t" /* nop a branch target */ \
    : [to]       "+Q"  (_to)         /* outputs   */               \
    , [lenx]     "=a"  (lenx)                                      \
    , [len256]   "=a"  (len256)                                    \
    , [tolen]    "=a"  (tolen)                                     \
    , [toaddr]   "=a"  (toaddr)                                    \
    , [target]   "=a"  (target)                                    \
    : [len]       "r"  (_len)        /* inputs    */               \
    : "cc"                           /* clobbered */               \
    );                                                             \
}
#else
  #define XC_MEMZERO_ANY(_to,_len)                                    \
{ unsigned long toaddr;   unsigned long tolen;                        \
  unsigned long len8;     unsigned long len256;                       \
  unsigned long target;   unsigned long lenx;                         \
    asm("\t"                                                          \
    "    LTGR    %[tolen],%[len]      \n\t" /*                   */   \
    "    BRC     8,2f                 \n\t" /* do nothing for l=0*/   \
    "    LG      %[toaddr],%[to]      \n\t" /* addr of data area */   \
    "    LARL    %[target],1f         \n\t" /* addr of XC instr  */   \
    " "                                                               \
    "    LCGR    %[len8],%[toaddr]    \n\t" /* HeapWord alignment  */ \
    "    NILL    %[len8],0x07         \n\t"                           \
    "    BRC     8,3f                 \n\t" /* already aligned     */ \
    "    NILH    %[len8],0x00         \n\t" /* zero extend         */ \
    "    LLGFR   %[len8],%[len8]      \n\t"                           \
    "    LAY     %[lenx],-1(,%[len8]) \n\t"                           \
    "    EX      %[lenx],0(%[target]) \n\t" /* execute XC instr */    \
    "    LA      %[toaddr],0(%[len8],%[toaddr]) \n\t"                 \
    "    SGR     %[tolen],%[len8]     \n\t" /* adjust len          */ \
    " "                                                               \
    "3:  LCGR    %[len256],%[toaddr]  \n\t" /* cache line alignment */\
    "    NILL    %[len256],0xff       \n\t"                           \
    "    BRC     8,4f                 \n\t" /* already aligned     */ \
    "    NILH    %[len256],0x00       \n\t" /* zero extend         */ \
    "    LLGFR   %[len256],%[len256]  \n\t"                           \
    "    LAY     %[lenx],-1(,%[len256]) \n\t"                         \
    "    EX      %[lenx],0(%[target]) \n\t" /* execute XC instr    */ \
    "    LA      %[toaddr],0(%[len256],%[toaddr]) \n\t"               \
    "    SGR     %[tolen],%[len256]   \n\t" /* adjust len          */ \
    " "                                                               \
    "4:  SRAG    %[lenx],%[tolen],8   \n\t" /* # cache lines       */ \
    "    BRC     8,6f                 \n\t" /* no full cache lines */ \
    "5:  XC      0(256,%[toaddr]),0(%[toaddr]) \n\t"                  \
    "    LA      %[toaddr],256(,%[toaddr]) \n\t"                      \
    "    BRCTG   %[lenx],5b           \n\t" /* iterate             */ \
    " "                                                               \
    "6:  NILL    %[tolen],0xff        \n\t" /* leftover bytes      */ \
    "    BRC     8,2f                 \n\t" /* done if none        */ \
    "    LAY     %[lenx],-1(,%[tolen]) \n\t"                          \
    "    EX      %[lenx],0(%[target]) \n\t" /* execute XC instr    */ \
    "    BRC     15,2f                \n\t" /* skip template       */ \
    " "                                                               \
    "1:  XC      0(1,%[toaddr]),0(%[toaddr]) \n\t"                    \
    "2:  BCR     0,0                  \n\t" /* nop a branch target */ \
    : [to]       "+Q"  (_to)         /* outputs   */               \
    , [lenx]     "=a"  (lenx)                                      \
    , [len8]     "=a"  (len8)                                      \
    , [len256]   "=a"  (len256)                                    \
    , [tolen]    "=a"  (tolen)                                     \
    , [toaddr]   "=a"  (toaddr)                                    \
    , [target]   "=a"  (target)                                    \
    : [len]       "r"  (_len)        /* inputs    */               \
    : "cc"                           /* clobbered */               \
    );                                                             \
}
#endif
#endif // USE_INLINE_ASM

//*************************************//
//   D I S J O I N T   C O P Y I N G   //
//*************************************//

static void pd_aligned_disjoint_words(HeapWord* from, HeapWord* to, size_t count) {
  // JVM2008: very frequent; some tests frequent.

  // Copy HeapWord (=DW) aligned storage. Use MVCLE in inline-asm code.
  // MVCLE guarantees DW concurrent (i.e. atomic) accesses if both the addresses of the operands
  // are DW aligned and the length is an integer multiple of a DW. Should always be true here.
  //
  // No special exploit needed. H/W discovers suitable situations itself.
  //
  // For large chunks of memory, exploit special H/W support of z/Architecture:
  // 1) copy short piece of memory to page-align address(es)
  // 2) copy largest part (all contained full pages) of memory using mvcle instruction.
  //    z/Architecture processors have special H/W support for page-aligned storage
  //    where len is an integer multiple of page size. In that case, up to 4 cache lines are
  //    processed in parallel and L1 cache is not polluted.
  // 3) copy the remaining piece of memory.
  //
#ifdef USE_INLINE_ASM
  jbyte* to_bytes   = (jbyte*)to;
  jbyte* from_bytes = (jbyte*)from;
  size_t len_bytes  = count*HeapWordSize;

  // Optimized copying for data less than 4k
  switch (count) {
    case 0: return;
    case 1: MOVE8_ATOMIC_1(to,from)
            return;
    case 2: MOVE8_ATOMIC_2(to,from)
            return;
//  case 3: MOVE8_ATOMIC_3(to,from)
//          return;
//  case 4: MOVE8_ATOMIC_4(to,from)
//          return;
    default:
      if (len_bytes <= 4096) {
        MVC_MULTI(to,from,len_bytes)
        return;
      }
      // else
      MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
      return;
  }
#else
  // Fallback code.
  switch (count) {
    case 0:
      return;

    case 1:
      *to = *from;
      return;

    case 2:
      *to++ = *from++;
      *to = *from;
      return;

    case 3:
      *to++ = *from++;
      *to++ = *from++;
      *to = *from;
      return;

    case 4:
      *to++ = *from++;
      *to++ = *from++;
      *to++ = *from++;
      *to = *from;
      return;

    default:
      while (count-- > 0)
        *(to++) = *(from++);
      return;
  }
#endif
}

static void pd_disjoint_words_atomic(HeapWord* from, HeapWord* to, size_t count) {
  // JVM2008: < 4k calls.
  assert(((((size_t)from) & 0x07L) | (((size_t)to) & 0x07L)) == 0, "No atomic copy w/o aligned data");
  pd_aligned_disjoint_words(from, to, count); // Rare calls -> just delegate.
}

static void pd_disjoint_words(HeapWord* from, HeapWord* to, size_t count) {
  // JVM2008: very rare.
  pd_aligned_disjoint_words(from, to, count); // Rare calls -> just delegate.
}


//*************************************//
//   C O N J O I N T   C O P Y I N G   //
//*************************************//

static void pd_aligned_conjoint_words(HeapWord* from, HeapWord* to, size_t count) {
  // JVM2008: between some and lower end of frequent.

#ifdef USE_INLINE_ASM
  size_t  count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerLong)) {
    switch (count_in) {
      case 4: COPY8_ATOMIC_4(to,from)
              return;
      case 3: COPY8_ATOMIC_3(to,from)
              return;
      case 2: COPY8_ATOMIC_2(to,from)
              return;
      case 1: COPY8_ATOMIC_1(to,from)
              return;
      case 0: return;
      default:
        from += count_in;
        to   += count_in;
        while (count_in-- > 0)
          *(--to) = *(--from); // Copy backwards, areas overlap destructively.
        return;
    }
  }
  // else
  jbyte* to_bytes   = (jbyte*)to;
  jbyte* from_bytes = (jbyte*)from;
  size_t len_bytes  = count_in*BytesPerLong;
  MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
  return;
#else
  // Fallback code.
  if (has_destructive_overlap((char*)from, (char*)to, count*BytesPerLong)) {
    HeapWord t1, t2, t3;
    switch (count) {
      case 0:
        return;

      case 1:
        *to = *from;
        return;

      case 2:
        t1 = *(from+1);
        *to = *from;
        *(to+1) = t1;
        return;

      case 3:
        t1 = *(from+1);
        t2 = *(from+2);
        *to = *from;
        *(to+1) = t1;
        *(to+2) = t2;
        return;

      case 4:
        t1 = *(from+1);
        t2 = *(from+2);
        t3 = *(from+3);
        *to = *from;
        *(to+1) = t1;
        *(to+2) = t2;
        *(to+3) = t3;
        return;

      default:
        from += count;
        to   += count;
        while (count-- > 0)
          *(--to) = *(--from); // Copy backwards, areas overlap destructively.
        return;
    }
  }
  // else
  // Just delegate. HeapWords are optimally aligned anyway.
  pd_aligned_disjoint_words(from, to, count);
#endif
}

static void pd_conjoint_words(HeapWord* from, HeapWord* to, size_t count) {

  // Just delegate. HeapWords are optimally aligned anyway.
  pd_aligned_conjoint_words(from, to, count);
}

static void pd_conjoint_bytes(void* from, void* to, size_t count) {

#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in))
    (void)memmove(to, from, count_in);
  else {
    jbyte*  to_bytes   = (jbyte*)to;
    jbyte*  from_bytes = (jbyte*)from;
    size_t  len_bytes  = count_in;
    MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
  }
#else
  if (has_destructive_overlap((char*)from, (char*)to, count))
    (void)memmove(to, from, count);
  else
    (void)memcpy(to, from, count);
#endif
}

//**************************************************//
//   C O N J O I N T  A T O M I C   C O P Y I N G   //
//**************************************************//

static void pd_conjoint_bytes_atomic(void* from, void* to, size_t count) {
  // Delegate to pd_conjoint_bytes; bytes are always accessed atomically.
  pd_conjoint_bytes(from, to, count);
}

static void pd_conjoint_jshorts_atomic(jshort* from, jshort* to, size_t count) {

#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerShort)) {
    // Use optimizations from shared code where no z-specific optimization exists.
    copy_conjoint_jshorts_atomic(from, to, count);
  } else {
    jbyte* to_bytes   = (jbyte*)to;
    jbyte* from_bytes = (jbyte*)from;
    size_t len_bytes  = count_in*BytesPerShort;
    MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
  }
#else
  // Use optimizations from shared code where no z-specific optimization exists.
  copy_conjoint_jshorts_atomic(from, to, count);
#endif
}

static void pd_conjoint_jints_atomic(jint* from, jint* to, size_t count) {

#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerInt)) {
    switch (count_in) {
      case 4: COPY4_ATOMIC_4(to,from)
              return;
      case 3: COPY4_ATOMIC_3(to,from)
              return;
      case 2: COPY4_ATOMIC_2(to,from)
              return;
      case 1: COPY4_ATOMIC_1(to,from)
              return;
      case 0: return;
      default:
        // Use optimizations from shared code where no z-specific optimization exists.
        copy_conjoint_jints_atomic(from, to, count_in);
        return;
    }
  }
  // else
  jbyte* to_bytes   = (jbyte*)to;
  jbyte* from_bytes = (jbyte*)from;
  size_t len_bytes  = count_in*BytesPerInt;
  MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
#else
  // Use optimizations from shared code where no z-specific optimization exists.
  copy_conjoint_jints_atomic(from, to, count);
#endif
}

static void pd_conjoint_jlongs_atomic(jlong* from, jlong* to, size_t count) {

#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerLong)) {
    switch (count_in) {
      case 4: COPY8_ATOMIC_4(to,from) return;
      case 3: COPY8_ATOMIC_3(to,from) return;
      case 2: COPY8_ATOMIC_2(to,from) return;
      case 1: COPY8_ATOMIC_1(to,from) return;
      case 0: return;
      default:
        from += count_in;
        to   += count_in;
        while (count_in-- > 0) { *(--to) = *(--from); } // Copy backwards, areas overlap destructively.
        return;
    }
  }
  // else
  jbyte* to_bytes   = (jbyte*)to;
  jbyte* from_bytes = (jbyte*)from;
  size_t len_bytes  = count_in*BytesPerLong;
  MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
#else
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerLong)) {
    if (count_in < 8) {
      from += count_in;
      to   += count_in;
      while (count_in-- > 0)
         *(--to) = *(--from); // Copy backwards, areas overlap destructively.
      return;
    }
    // else
    from += count_in-1;
    to   += count_in-1;
    if (count_in&0x01) {
      *(to--) = *(from--);
      count_in--;
    }
    for (; count_in>0; count_in-=2) {
      *to     = *from;
      *(to-1) = *(from-1);
      to     -= 2;
      from   -= 2;
    }
  }
  else
    pd_aligned_disjoint_words((HeapWord*)from, (HeapWord*)to, count_in); // rare calls -> just delegate.
#endif
}

static void pd_conjoint_oops_atomic(oop* from, oop* to, size_t count) {

#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerOop)) {
    switch (count_in) {
      case 4: COPY8_ATOMIC_4(to,from) return;
      case 3: COPY8_ATOMIC_3(to,from) return;
      case 2: COPY8_ATOMIC_2(to,from) return;
      case 1: COPY8_ATOMIC_1(to,from) return;
      case 0: return;
      default:
        from += count_in;
        to   += count_in;
        while (count_in-- > 0) { *(--to) = *(--from); } // Copy backwards, areas overlap destructively.
        return;
    }
  }
  // else
  jbyte* to_bytes   = (jbyte*)to;
  jbyte* from_bytes = (jbyte*)from;
  size_t len_bytes  = count_in*BytesPerOop;
  MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
#else
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerOop)) {
    from += count_in;
    to   += count_in;
    while (count_in-- > 0) *(--to) = *(--from); // Copy backwards, areas overlap destructively.
    return;
  }
  // else
  pd_aligned_disjoint_words((HeapWord*)from, (HeapWord*)to, count_in); // rare calls -> just delegate.
  return;
#endif
}

static void pd_arrayof_conjoint_bytes(HeapWord* from, HeapWord* to, size_t count) {
  pd_conjoint_bytes_atomic(from, to, count);
}

static void pd_arrayof_conjoint_jshorts(HeapWord* from, HeapWord* to, size_t count) {
  pd_conjoint_jshorts_atomic((jshort*)from, (jshort*)to, count);
}

static void pd_arrayof_conjoint_jints(HeapWord* from, HeapWord* to, size_t count) {
  pd_conjoint_jints_atomic((jint*)from, (jint*)to, count);
}

static void pd_arrayof_conjoint_jlongs(HeapWord* from, HeapWord* to, size_t count) {
  pd_conjoint_jlongs_atomic((jlong*)from, (jlong*)to, count);
}

static void pd_arrayof_conjoint_oops(HeapWord* from, HeapWord* to, size_t count) {
  pd_conjoint_oops_atomic((oop*)from, (oop*)to, count);
}

//**********************************************//
//  M E M O R Y   I N I T I A L I S A T I O N   //
//**********************************************//

static void pd_fill_to_bytes(void* to, size_t count, jubyte value) {
  // JVM2008: very rare, only in some tests.
#ifdef USE_INLINE_ASM
  // Initialize storage to a given value. Use memset instead of a copy loop.
  // For large chunks of memory, exploit special H/W support of z/Architecture:
  // 1) init short piece of memory to page-align address
  // 2) init largest part (all contained full pages) of memory using mvcle instruction.
  //    z/Architecture processors have special H/W support for page-aligned storage
  //    where len is an integer multiple of page size. In that case, up to 4 cache lines are
  //    processed in parallel and L1 cache is not polluted.
  // 3) init the remaining piece of memory.
  // Atomicity cannot really be an issue since gcc implements the loop body with XC anyway.
  // If atomicity is a problem, we have to prevent gcc optimization. Best workaround: inline asm.

  jbyte*  to_bytes  = (jbyte*)to;
  size_t  len_bytes = count;

  MVCLE_MEMINIT(to_bytes, value, len_bytes)

#else
  // Memset does the best job possible: loop over 256-byte MVCs, with
  // the last MVC EXecuted. With the -mmvcle option, initialization
  // is done using MVCLE -> slight advantage for large areas.
  (void)memset(to, value, count);
#endif
}

static void pd_fill_to_words(HeapWord* tohw, size_t count, juint value) {
  // Occurs in dbg builds only. Usually memory poisoning with BAADBABE, DEADBEEF, etc.
  // JVM2008: < 4k calls.
  if (value == 0) {
    pd_zero_to_words(tohw, count);
    return;
  }
  if (value == ~(juint)(0)) {
    pd_fill_to_bytes(tohw, count*HeapWordSize, (jubyte)(~(juint)(0)));
    return;
  }
  julong* to = (julong*) tohw;
  julong  v  = ((julong) value << 32) | value;
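  // The 4-byte value is replicated into both halves of a doubleword, e.g.
  // value == 0xDEADBEEF yields v == 0xDEADBEEFDEADBEEF, so each HeapWord can
  // be filled with a single 8-byte store.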
  while (count-- > 0) {
    *to++ = v;
  }
}

static void pd_fill_to_aligned_words(HeapWord* tohw, size_t count, juint value) {
  // JVM2008: very frequent, but virtually all calls are with value == 0.
  pd_fill_to_words(tohw, count, value);
}

//**********************************//
//  M E M O R Y   C L E A R I N G   //
//**********************************//

// Delegate to pd_zero_to_bytes. It is HeapWord-atomic as well.
// Distinguish between simple and large zero_to_words.
static void pd_zero_to_words(HeapWord* tohw, size_t count) {
  pd_zero_to_bytes(tohw, count*HeapWordSize);
}

// Delegate to pd_zero_to_bytes. It is HeapWord-atomic as well.
static void pd_zero_to_words_large(HeapWord* tohw, size_t count) {
  // JVM2008: generally frequent, some tests show very frequent calls.
  pd_zero_to_bytes(tohw, count*HeapWordSize);
}

static void pd_zero_to_bytes(void* to, size_t count) {
  // JVM2008: some calls in general; frequent in some tests.
#ifdef USE_INLINE_ASM
  // Even zero_to_bytes() requires HeapWord-atomic, or at least sequential,
  // zeroing of the memory. MVCLE is not fit for that job:
  //   "As observed by other CPUs and by the channel subsystem,
  //    that portion of the first operand which is filled
  //    with the padding byte is not necessarily stored into in
  //    a left-to-right direction and may appear to be stored
  //    into more than once."
  // Therefore, the implementation was changed to use (multiple) XC instructions.

  const long line_size = 256;
  jbyte* to_bytes  = (jbyte*)to;
  size_t len_bytes = count;

  if (len_bytes <= line_size) {
    XC_MEMZERO_256(to_bytes, len_bytes);
  } else {
    XC_MEMZERO_ANY(to_bytes, len_bytes);
  }

#else
  // Memset does the best job possible: loop over 256-byte MVCs, with
  // the last MVC EXecuted. With the -mmvcle option, initialization
  // is done using MVCLE -> slight advantage for large areas.
  (void)memset(to, 0, count);
#endif
}

#endif // CPU_S390_VM_COPY_S390_HPP