/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * The bcopy/memcpy loops, tuned for Pentium-M class processors with
 * Supplemental SSE3 and 64-byte cache lines.  This is the 64-bit version.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort  80			// too short to bother with SSE (must be >=80)
#define kVeryLong   (500*1024)          // large enough for non-temporal stores (>=8192 and <2GB)
#define kFastUCode  ((16*1024)-15)	// cutoff for microcode fastpath for "rep/movsl"


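// As a rough, illustrative C-level sketch of the strategy selection implemented
// below (the helper names here are hypothetical and exist only in this comment):
//
//	if (overlaps_destructively(dst, src, len))      // (dst - src) < len, unsigned
//		copy_in_reverse(dst, src, len);         // LReverse
//	else if (len <= kShort)
//		copy_words_then_bytes(dst, src, len);   // LShort
//	else if (len >= kVeryLong)
//		commpage_longcopy(dst, src, len);       // LVeryLong (non-temporal stores)
//	else if (mutually_8byte_aligned(dst, src) && len >= kFastUCode)
//		rep_movsl(dst, src, len);               // Lfastpath (microcode)
//	else
//		copy_64byte_chunks_sse(dst, src, len);  // LMod0..LMod15
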
// void bcopy(const void *src, void *dst, size_t len);

        .text
	.code64
        .align 5, 0x90
LZero:
Lbcopy_sse3x_64:				// void bcopy(const void *src, void *dst, size_t len)
	pushq	%rbp			// set up a frame for backtraces
	movq	%rsp,%rbp
	movq	%rsi,%rax		// copy dest ptr
	movq	%rdi,%rsi		// exchange source and dest ptrs
	movq	%rax,%rdi
        subq    %rsi,%rax               // (dest - source)
        cmpq    %rdx,%rax               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        jbe     LShort			// no
	jmp	LNotShort
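
// Note on the overlap test above (repeated in memcpy/memmove below): the
// compare is unsigned, so a single "cmpq %rdx,%rax / jb" suffices.  When the
// destination is below the source, (dest - source) wraps to a huge unsigned
// value and the branch falls through to the forward path; when the destination
// is above the source, the branch is taken only if it lies within the first
// "length" bytes of the source, which is exactly the destructive-overlap case
// that must be copied backwards.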

//
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//
// NB: These need to be 32 bytes from bcopy():
//

        .align	5, 0x90
Lmemcpy:				// void *memcpy(void *dst, const void *src, size_t len)
Lmemmove:				// void *memmove(void *dst, const void *src, size_t len)
	pushq	%rbp			// set up a frame for backtraces
	movq	%rsp,%rbp
	movq	%rdi,%r11		// save return value here
        movq    %rdi,%rax
        subq    %rsi,%rax               // (dest - source)
        cmpq    %rdx,%rax               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        ja      LNotShort               // yes

// Handle short forward copies.  As the most common case, this is the fall-through path.
//      rdx = length (<= kShort)
//      rsi = source ptr
//      rdi = dest ptr
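//
// For example, a 23-byte copy runs the doubleword loop 5 times (20 bytes) and
// then the byte loop 3 times; a length below 4 skips straight to LLeftovers.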

LShort:
	movl    %edx,%ecx		// copy length using 32-bit operation
	shrl	$2,%ecx			// get #doublewords
	jz	LLeftovers
2:					// loop copying doublewords
	movl	(%rsi),%eax
	addq	$4,%rsi
	movl	%eax,(%rdi)
	addq	$4,%rdi
	decl	%ecx
	jnz	2b
LLeftovers:				// handle leftover bytes (0..3) in last word
	andl	$3,%edx			// any leftover bytes?
	jz	5f
4:					// loop copying bytes
	movb	(%rsi),%al
	incq	%rsi
	movb	%al,(%rdi)
	incq	%rdi
	decl	%edx
	jnz	4b
5:
        movq	%r11,%rax		// get return value (dst ptr) for memcpy/memmove
	popq	%rbp
        ret


LReverseIsland:				// keep the "jb" above a short branch...
	jmp	LReverse		// ...because reverse moves are uncommon


// Handle forward moves that are long enough to justify use of SSE.
// First, 16-byte align the destination.
//      rdx = length (> kShort)
//      rsi = source ptr
//      rdi = dest ptr

LNotShort:
        cmpq    $(kVeryLong),%rdx       // long enough to justify heavyweight loops?
        jae     LVeryLong		// use very-long-operand path
        movl    %edi,%ecx               // copy low half of destination ptr
        negl    %ecx
        andl    $15,%ecx                // get #bytes to align destination
	jz	LDestAligned		// already aligned
        subl    %ecx,%edx               // decrement length
	rep				// align destination
	movsb


// Destination is now aligned.  Dispatch to one of sixteen loops over 64-byte chunks,
// based on the alignment of the source.  All vector loads and stores are aligned.
// Even though this means we have to shift and repack vectors, doing so is much faster
// than unaligned loads.  Since kShort>=80 and we've moved at most 15 bytes already,
// there is at least one chunk.  When we enter the copy loops, the following registers
// are set up:
//      rdx = residual length (0..63)
//	rcx = -(length to move), a multiple of 64 less than 2GB
//      rsi = ptr to 1st source byte not to move (unaligned)
//      rdi = ptr to 1st dest byte not to move (aligned)
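//
// Worked example: if 200 bytes remain once the destination is aligned, then
// rcx = -192 and rdx = 8; rsi and rdi are advanced past the 192-byte region,
// so the loops below index with the negative rcx (e.g. (%rsi,%rcx)) and count
// it up toward zero, leaving the 8 residual bytes for LShort.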

LDestAligned:
        movq    %rdx,%rcx               // copy length
	movl	%esi,%eax		// copy low half of source address
        andl    $63,%edx                // get remaining bytes for LShort
	andl	$15,%eax		// mask to low 4 bits of source address
        andq    $-64,%rcx               // get number of bytes we will copy in inner loop
// We'd like to use lea with rip-relative addressing, but cannot in a .code64 block.
//	lea	LTable(%rip),%r8	// point to dispatch table
	movq	$(_COMM_PAGE_32_TO_64(_COMM_PAGE_BCOPY)),%r8 // work around 4586528
	addq	$(LTable-LZero),%r8	// work around 4586528
        addq    %rcx,%rsi               // point to 1st byte not copied
        addq    %rcx,%rdi
	movl	(%r8,%rax,4),%eax	// get offset of routine
        negq    %rcx                    // now generate offset to 1st byte to be copied
	addq	%r8,%rax		// generate address of copy loop
	jmp	*%rax			// enter copy loop, selected by source alignment
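
// For example, if the source address ends in ...0101 after the destination has
// been aligned, %rax is 5; the table entry LTable[5] holds the 32-bit offset
// (LMod5 - LTable), so adding %r8 (which points at LTable) yields the address
// of LMod5.  Storing offsets rather than absolute addresses keeps the table
// independent of where the code is placed.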

	.align	2
LTable:					// table of copy loop addresses
	.long	(LMod0 - LTable)
	.long	(LMod1 - LTable)
	.long	(LMod2 - LTable)
	.long	(LMod3 - LTable)
	.long	(LMod4 - LTable)
	.long	(LMod5 - LTable)
	.long	(LMod6 - LTable)
	.long	(LMod7 - LTable)
	.long	(LMod8 - LTable)
	.long	(LMod9 - LTable)
	.long	(LMod10 - LTable)
	.long	(LMod11 - LTable)
	.long	(LMod12 - LTable)
	.long	(LMod13 - LTable)
	.long	(LMod14 - LTable)
	.long	(LMod15 - LTable)


// Very long forward moves.  These are at least several pages.  They are special cased
// and aggressively optimized, not so much because they are common or useful, but
// because they are subject to benchmarks.  There isn't enough room for them in the
// area reserved on the commpage for bcopy, so we put them elsewhere.  We call
// the longcopy routine using the normal ABI:
//      rdi = dest
//      rsi = source
//      rdx = length (>= kVeryLong bytes)

LVeryLong:
	pushq	%r11			// save return value
	movq	$_COMM_PAGE_32_TO_64(_COMM_PAGE_LONGCOPY),%rax
	call	*%rax			// call very long operand routine
	popq	%rax			// pop return value
	popq	%rbp
	ret


// On Pentium-M, the microcode for "rep/movsl" is faster than SSE for 16-byte
// aligned operands from about 32KB up to kVeryLong for the hot cache case, and from
// about 256 bytes up to kVeryLong for cold caches.  This is because the microcode
// avoids having to read destination cache lines that will be completely overwritten.
// The cutoff we use (i.e., kFastUCode) must balance the two cases, since we do not
// know whether the destination is in cache or not.
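//
// Note for the code below: %ecx arrives as the negated chunk count (a multiple
// of 64) and %edx holds the 0..63 residual, so after the "negl" the "orl" is
// equivalent to an add when reassembling the total number of bytes to move.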

Lfastpath:
        addq    %rcx,%rsi               // restore ptrs to 1st byte of source and dest
        addq    %rcx,%rdi
	negl	%ecx			// make length positive (known to be < 2GB)
	orl	%edx,%ecx		// restore total #bytes remaining to move
	cld				// we'll move forward
	shrl	$2,%ecx			// compute #words to move
	rep				// the u-code will optimize this
	movsl
	jmp	LLeftovers		// handle 0..3 leftover bytes


// Forward loop for medium length operands in which low four bits of %rsi == 0000

LMod0:
	cmpl	$(-kFastUCode),%ecx	// %rcx == -length, where (length < kVeryLong)
	jle	Lfastpath		// long enough for fastpath in microcode
	jmp	1f
	.align	4,0x90			// 16-byte align inner loops
1:					// loop over 64-byte chunks
        movdqa  (%rsi,%rcx),%xmm0
        movdqa  16(%rsi,%rcx),%xmm1
        movdqa  32(%rsi,%rcx),%xmm2
        movdqa  48(%rsi,%rcx),%xmm3

        movdqa  %xmm0,(%rdi,%rcx)
        movdqa  %xmm1,16(%rdi,%rcx)
        movdqa  %xmm2,32(%rdi,%rcx)
        movdqa  %xmm3,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0001
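//
// Worked example of the palignr repacking used here (and, with larger shift
// counts, in the other palignr-based LModN loops): at loop index c, %xmm5
// holds the aligned source bytes [c-1 .. c+14] and %xmm1 holds [c+15 .. c+30].
// "palignr $1,%xmm5,%xmm1" shifts the 32-byte concatenation right by one byte,
// leaving exactly the unaligned bytes [c .. c+15] in %xmm1, ready for an
// aligned store to the destination.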

LMod1:
	movdqa	-1(%rsi,%rcx),%xmm0	// prime the loop by loading 1st quadword
1:					// loop over 64-byte chunks
        movdqa  15(%rsi,%rcx),%xmm1
        movdqa  31(%rsi,%rcx),%xmm2
        movdqa  47(%rsi,%rcx),%xmm3
        movdqa  63(%rsi,%rcx),%xmm4

	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$1,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$1,%xmm2,%xmm3
	palignr	$1,%xmm1,%xmm2
	palignr	$1,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0010

LMod2:
	movdqa	-2(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  14(%rsi,%rcx),%xmm1
        movdqa  30(%rsi,%rcx),%xmm2
        movdqa  46(%rsi,%rcx),%xmm3
        movdqa  62(%rsi,%rcx),%xmm4

	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$2,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$2,%xmm2,%xmm3
	palignr	$2,%xmm1,%xmm2
	palignr	$2,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0011

LMod3:
	movdqa	-3(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  13(%rsi,%rcx),%xmm1
        movdqa  29(%rsi,%rcx),%xmm2
        movdqa  45(%rsi,%rcx),%xmm3
        movdqa  61(%rsi,%rcx),%xmm4

	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$3,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$3,%xmm2,%xmm3
	palignr	$3,%xmm1,%xmm2
	palignr	$3,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0100
// We use the single-precision floating-point data type in order to use "movss" to merge vectors.
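//
// Worked example: with the source 4 bytes past a 16-byte boundary, %xmm0 holds
// doublewords [D-1 D0 D1 D2] from the aligned load at -4, and %xmm1 holds
// [D3 D4 D5 D6] from the load at +12.  "movss %xmm1,%xmm0" replaces the low
// lane to give [D3 D0 D1 D2], and the pshufd with mask 0x39 rotates that right
// one lane to [D0 D1 D2 D3], the 16 unaligned source bytes wanted next.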

LMod4:
	movaps	-4(%rsi,%rcx),%xmm0	// 4-byte aligned: prime the loop
	jmp	1f
	.align	4,0x90
1:					// loop over 64-byte chunks
        movaps  12(%rsi,%rcx),%xmm1
        movaps  28(%rsi,%rcx),%xmm2
	movss	%xmm1,%xmm0		// copy low 4 bytes of source into destination
	pshufd	$(0x39),%xmm0,%xmm0	// rotate right 4 bytes (mask -- 00 11 10 01)
        movaps  44(%rsi,%rcx),%xmm3
	movss	%xmm2,%xmm1
	pshufd	$(0x39),%xmm1,%xmm1
	movaps	60(%rsi,%rcx),%xmm4
	movss	%xmm3,%xmm2
	pshufd	$(0x39),%xmm2,%xmm2

        movaps  %xmm0,(%rdi,%rcx)
	movss	%xmm4,%xmm3
	pshufd	$(0x39),%xmm3,%xmm3
        movaps  %xmm1,16(%rdi,%rcx)
        movaps  %xmm2,32(%rdi,%rcx)
	movaps	%xmm4,%xmm0
        movaps  %xmm3,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0101

LMod5:
	movdqa	-5(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  11(%rsi,%rcx),%xmm1
        movdqa  27(%rsi,%rcx),%xmm2
        movdqa  43(%rsi,%rcx),%xmm3
        movdqa  59(%rsi,%rcx),%xmm4

	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$5,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$5,%xmm2,%xmm3
	palignr	$5,%xmm1,%xmm2
	palignr	$5,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0110

LMod6:
	movdqa	-6(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  10(%rsi,%rcx),%xmm1
        movdqa  26(%rsi,%rcx),%xmm2
        movdqa  42(%rsi,%rcx),%xmm3
        movdqa  58(%rsi,%rcx),%xmm4

	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$6,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$6,%xmm2,%xmm3
	palignr	$6,%xmm1,%xmm2
	palignr	$6,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0111

LMod7:
	movdqa	-7(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  9(%rsi,%rcx),%xmm1
        movdqa  25(%rsi,%rcx),%xmm2
        movdqa  41(%rsi,%rcx),%xmm3
        movdqa  57(%rsi,%rcx),%xmm4

	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$7,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$7,%xmm2,%xmm3
	palignr	$7,%xmm1,%xmm2
	palignr	$7,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1000
// We use the double-precision floating-point data type in order to use "shufpd" to shift by 8 bytes.
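//
// Worked example: with the source 8 bytes past a 16-byte boundary, %xmm0 holds
// quadwords [Q-1 Q0] from the aligned load at -8 and %xmm1 holds [Q1 Q2] from
// the load at +8.  "shufpd $01,%xmm1,%xmm0" selects the high quadword of %xmm0
// and the low quadword of %xmm1, producing [Q0 Q1], the 16 unaligned source
// bytes wanted next.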

LMod8:
	cmpl	$(-kFastUCode),%ecx	// %rcx == -length, where (length < kVeryLong)
	jle	Lfastpath		// long enough for fastpath in microcode
	movapd	-8(%rsi,%rcx),%xmm0	// 8-byte aligned: prime the loop
	jmp	1f
	.align	4,0x90
1:					// loop over 64-byte chunks
        movapd  8(%rsi,%rcx),%xmm1
        movapd  24(%rsi,%rcx),%xmm2
	shufpd	$01,%xmm1,%xmm0		// %xmm0 <- shr( %xmm0 || %xmm1, 8 bytes)
        movapd  40(%rsi,%rcx),%xmm3
	shufpd	$01,%xmm2,%xmm1
	movapd	56(%rsi,%rcx),%xmm4
	shufpd	$01,%xmm3,%xmm2

        movapd  %xmm0,(%rdi,%rcx)
	shufpd	$01,%xmm4,%xmm3
        movapd  %xmm1,16(%rdi,%rcx)
        movapd  %xmm2,32(%rdi,%rcx)
	movapd	%xmm4,%xmm0
        movapd  %xmm3,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1001

LMod9:
	movdqa	-9(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  7(%rsi,%rcx),%xmm1
        movdqa  23(%rsi,%rcx),%xmm2
        movdqa  39(%rsi,%rcx),%xmm3
        movdqa  55(%rsi,%rcx),%xmm4

	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$9,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$9,%xmm2,%xmm3
	palignr	$9,%xmm1,%xmm2
	palignr	$9,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1010

LMod10:
	movdqa	-10(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  6(%rsi,%rcx),%xmm1
        movdqa  22(%rsi,%rcx),%xmm2
        movdqa  38(%rsi,%rcx),%xmm3
        movdqa  54(%rsi,%rcx),%xmm4

	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$10,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$10,%xmm2,%xmm3
	palignr	$10,%xmm1,%xmm2
	palignr	$10,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1011

LMod11:
	movdqa	-11(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  5(%rsi,%rcx),%xmm1
        movdqa  21(%rsi,%rcx),%xmm2
        movdqa  37(%rsi,%rcx),%xmm3
        movdqa  53(%rsi,%rcx),%xmm4

	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$11,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$11,%xmm2,%xmm3
	palignr	$11,%xmm1,%xmm2
	palignr	$11,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1100
// We use the single-precision floating-point data type in order to use "movss" to merge vectors.
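//
// Worked example: with the source 12 bytes past a 16-byte boundary, the
// aligned load at +4 is pre-rotated by pshufd (mask 0x93) from [D1 D2 D3 D4]
// to [D4 D1 D2 D3]; "movss" then drops in D0, carried in the low lane of the
// previous vector, giving [D0 D1 D2 D3], the 16 unaligned source bytes wanted
// next.  The displaced D4 becomes the low-lane patch for the following store.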

LMod12:
	movss	(%rsi,%rcx),%xmm0	// prefetch 1st four bytes of source, right justified
	jmp	1f
	.align	4,0x90
1:					// loop over 64-byte chunks
	pshufd	$(0x93),4(%rsi,%rcx),%xmm1 // load and rotate right 12 bytes (mask -- 10 01 00 11)
	pshufd	$(0x93),20(%rsi,%rcx),%xmm2
	pshufd	$(0x93),36(%rsi,%rcx),%xmm3
	pshufd	$(0x93),52(%rsi,%rcx),%xmm4

	movaps	%xmm4,%xmm5
	movss	%xmm3,%xmm4		// copy low 4 bytes of source into destination
	movss	%xmm2,%xmm3
	movss	%xmm1,%xmm2
	movss	%xmm0,%xmm1

        movaps  %xmm1,(%rdi,%rcx)
        movaps  %xmm2,16(%rdi,%rcx)
	movaps	%xmm5,%xmm0
        movaps  %xmm3,32(%rdi,%rcx)
        movaps  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1101

LMod13:
	movdqa	-13(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  3(%rsi,%rcx),%xmm1
        movdqa  19(%rsi,%rcx),%xmm2
        movdqa  35(%rsi,%rcx),%xmm3
        movdqa  51(%rsi,%rcx),%xmm4

	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$13,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$13,%xmm2,%xmm3
	palignr	$13,%xmm1,%xmm2
	palignr	$13,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1110

LMod14:
	movdqa	-14(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  2(%rsi,%rcx),%xmm1
        movdqa  18(%rsi,%rcx),%xmm2
        movdqa  34(%rsi,%rcx),%xmm3
        movdqa  50(%rsi,%rcx),%xmm4

	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$14,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$14,%xmm2,%xmm3
	palignr	$14,%xmm1,%xmm2
	palignr	$14,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1111

LMod15:
	movdqa	-15(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  1(%rsi,%rcx),%xmm1
        movdqa  17(%rsi,%rcx),%xmm2
        movdqa  33(%rsi,%rcx),%xmm3
        movdqa  49(%rsi,%rcx),%xmm4

	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$15,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$15,%xmm2,%xmm3
	palignr	$15,%xmm1,%xmm2
	palignr	$15,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Reverse moves.  These are not optimized as aggressively as their forward
// counterparts, as they are only used with destructive overlap.
//      rdx = length
//      rsi = source ptr
//      rdi = dest ptr
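//
// We get here only when the destination starts within the source region
// (0 <= dest - source < length), so copying from the top down is the only
// order that does not clobber source bytes before they have been read.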

LReverse:
        addq    %rdx,%rsi               // point to end of strings
        addq    %rdx,%rdi
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        ja      LReverseNotShort        // yes

// Handle reverse short copies.
//      edx = length (<= kShort)
//      rsi = one byte past end of source
//      rdi = one byte past end of dest

LReverseShort:
	movl    %edx,%ecx		// copy length
	shrl	$3,%ecx			// #quadwords
	jz	3f
1:
	subq	$8,%rsi
	movq	(%rsi),%rax
	subq	$8,%rdi
	movq	%rax,(%rdi)
	decl	%ecx
	jnz	1b
3:
	andl	$7,%edx			// bytes?
	jz	5f
4:
	decq	%rsi
	movb	(%rsi),%al
	decq	%rdi
	movb	%al,(%rdi)
	decl	%edx
	jnz	4b
5:
        movq	%r11,%rax		// get return value (dst ptr) for memcpy/memmove
	popq	%rbp
        ret

// Handle a reverse move long enough to justify using SSE.
//      rdx = length (> kShort)
//      rsi = one byte past end of source
//      rdi = one byte past end of dest

LReverseNotShort:
        movl    %edi,%ecx               // copy destination
        andl    $15,%ecx                // get #bytes to align destination
        je      LReverseDestAligned     // already aligned
        subq	%rcx,%rdx		// adjust length
1:					// loop copying 1..15 bytes
	decq	%rsi
	movb	(%rsi),%al
	decq	%rdi
	movb	%al,(%rdi)
	decl	%ecx
	jnz	1b

// Destination is now aligned.  Prepare for reverse loops.

LReverseDestAligned:
        movq    %rdx,%rcx               // copy length
        andl    $63,%edx                // get remaining bytes for LReverseShort
        andq    $-64,%rcx               // get number of bytes we will copy in inner loop
        subq    %rcx,%rsi               // point to endpoint of copy
        subq    %rcx,%rdi
	testl	$15,%esi		// is source aligned too?
        jnz     LReverseUnalignedLoop   // no

LReverseAlignedLoop:                    // loop over 64-byte chunks
        movdqa  -16(%rsi,%rcx),%xmm0
        movdqa  -32(%rsi,%rcx),%xmm1
        movdqa  -48(%rsi,%rcx),%xmm2
        movdqa  -64(%rsi,%rcx),%xmm3

        movdqa  %xmm0,-16(%rdi,%rcx)
        movdqa  %xmm1,-32(%rdi,%rcx)
        movdqa  %xmm2,-48(%rdi,%rcx)
        movdqa  %xmm3,-64(%rdi,%rcx)

        subq    $64,%rcx
        jne     LReverseAlignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


// Reverse, unaligned loop.  LDDQU==MOVDQU on these machines.

LReverseUnalignedLoop:                  // loop over 64-byte chunks
        movdqu  -16(%rsi,%rcx),%xmm0
        movdqu  -32(%rsi,%rcx),%xmm1
        movdqu  -48(%rsi,%rcx),%xmm2
        movdqu  -64(%rsi,%rcx),%xmm3

        movdqa  %xmm0,-16(%rdi,%rcx)
        movdqa  %xmm1,-32(%rdi,%rcx)
        movdqa  %xmm2,-48(%rdi,%rcx)
        movdqa  %xmm3,-64(%rdi,%rcx)

        subq    $64,%rcx
        jne     LReverseUnalignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done

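// Register this variant with the commpage descriptor table: the third argument
// gives the capability bits the processor must have (SSE2, Supplemental SSE3,
// and 64-byte cache lines) and the fourth the bits it must not have, so that
// processors reporting SSE4.2 are given a different bcopy variant instead.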
	COMMPAGE_DESCRIPTOR(bcopy_sse3x_64,_COMM_PAGE_BCOPY,kHasSSE2+kHasSupplementalSSE3+kCache64,kHasSSE4_2)