/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * The bcopy/memcpy loops, tuned for Pentium-M class processors with
 * Supplemental SSE3 and 64-byte cache lines.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort  80			// too short to bother with SSE (must be >=80)
#define kVeryLong   (500*1024)          // large enough for non-temporal stores (must be >= 8192)
#define kFastUCode  ((16*1024)-15)	// cutoff for microcode fastpath for "rep/movsl"
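
// In rough C terms, the forward-copy dispatch implemented below is (illustrative
// sketch only -- the names here are not labels in this file, and the destination
// 16-byte alignment prologue is omitted):
//
//	if (len <= kShort)          copy doublewords, then 0..3 leftover bytes    (Lshort)
//	else if (len >= kVeryLong)  call the commpage longcopy routine            (LVeryLong)
//	else if (src/dst mutually 8-byte aligned && len >= kFastUCode)
//	                            let the "rep/movsl" microcode do the copy     (Lfastpath)
//	else                        run one of the sixteen 64-byte SSE loops      (LMod0..LMod15)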


// void bcopy(const void *src, void *dst, size_t len);

        .text
        .align	5, 0x90
LZero:
Lbcopy_sse3x:				// void bcopy(const void *src, void *dst, size_t len)
	pushl	%ebp			// set up a frame for backtraces
	movl	%esp,%ebp
        pushl   %esi
        pushl   %edi
        movl    8(%ebp),%esi		// get source ptr
        movl    12(%ebp),%edi           // get dest ptr
        movl    16(%ebp),%ecx           // get length
        movl    %edi,%edx
        subl    %esi,%edx               // (dest - source)
        cmpl    %ecx,%edx               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        jbe     Lshort			// no
	jmp	LNotShort

//
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//
// NB: These need to be 32 bytes from bcopy():
//

        .align	5, 0x90
Lmemcpy:				// void *memcpy(void *dst, const void *src, size_t len)
Lmemmove:				// void *memmove(void *dst, const void *src, size_t len)
	pushl	%ebp			// set up a frame for backtraces
	movl	%esp,%ebp
        pushl   %esi
        pushl   %edi
        movl    8(%ebp),%edi		// get dest ptr
        movl    12(%ebp),%esi           // get source ptr
        movl    16(%ebp),%ecx           // get length
        movl    %edi,%edx
        subl    %esi,%edx               // (dest - source)
        cmpl    %ecx,%edx               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        ja      LNotShort               // yes

// Handle short forward copies.  As the most common case, this is the fall-through path.
//      ecx = length (<= kShort)
//      esi = source ptr
//      edi = dest ptr
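//
// In C terms this path is roughly (illustrative only):
//	for (n = len >> 2; n > 0; n--) { *(uint32_t *)dst = *(uint32_t *)src; src += 4; dst += 4; }
//	for (n = len & 3;  n > 0; n--) *dst++ = *src++;
// i.e. doublewords first, then the 0..3 leftover bytes, with no alignment fixups
// (misaligned 4-byte accesses are cheap at these lengths).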

Lshort:
	movl    %ecx,%edx		// copy length
	shrl	$2,%ecx			// get #doublewords
	jz	LLeftovers
2:					// loop copying doublewords
	movl	(%esi),%eax
	addl	$4,%esi
	movl	%eax,(%edi)
	addl	$4,%edi
	dec	%ecx
	jnz	2b
LLeftovers:				// handle leftover bytes (0..3) in last word
	andl	$3,%edx			// any leftover bytes?
	jz	Lexit
4:					// loop copying bytes
	movb	(%esi),%al
	inc	%esi
	movb	%al,(%edi)
	inc	%edi
	dec	%edx
	jnz	4b
Lexit:
        movl    8(%ebp),%eax		// get return value (dst ptr) for memcpy/memmove
        popl    %edi
        popl    %esi
	popl	%ebp
        ret


LReverseIsland:				// keep the "jb" above a short branch...
	jmp	LReverse		// ...because reverse moves are uncommon


// Handle forward moves that are long enough to justify use of SSE3.
// First, 16-byte align the destination.
//      ecx = length (> kShort)
//      esi = source ptr
//      edi = dest ptr

LNotShort:
        cmpl    $(kVeryLong),%ecx       // long enough to justify heavyweight loops?
        movl    %edi,%edx               // copy destination
        jae     LVeryLong		// use very-long-operand path
        negl    %edx
        andl    $15,%edx                // get #bytes to align destination
	jz	LDestAligned		// already aligned
        subl    %edx,%ecx               // decrement length
1:					// loop copying 1..15 bytes
	movb	(%esi),%al
	inc	%esi
	movb	%al,(%edi)
	inc	%edi
	dec	%edx
	jnz	1b

// Destination is now aligned.  Dispatch to one of sixteen loops over 64-byte chunks,
// based on the alignment of the source.  All vector loads and stores are aligned.
// Even though this means we have to shift and repack vectors, doing so is much faster
// than unaligned loads.  Since kShort>=80 and we've moved at most 15 bytes already,
// there is at least one chunk.  When we enter the copy loops, the following registers
// are set up:
//      ecx = residual length (0..63)
//	edx = -(length to move), a multiple of 64
//      esi = ptr to 1st source byte not to move (unaligned)
//      edi = ptr to 1st dest byte not to move (aligned)
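//
// In rough C terms the setup and dispatch below are (names illustrative only):
//	index  = src & 15;			// source misalignment selects LMod0..LMod15
//	chunks = len & ~63;   rem = len & 63;
//	src += chunks;  dst += chunks;  edx = -chunks;	// inner loops count %edx up to zero
//	goto *LTable[index];			// table entries are absolute commpage addresses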

LDestAligned:
        movl    %ecx,%edx               // copy length
	movl	%esi,%eax		// copy source address
        andl    $63,%ecx                // get remaining bytes for Lshort
        andl    $-64,%edx               // get number of bytes we will copy in inner loop
	andl	$15,%eax		// mask to low 4 bits of source address
        addl    %edx,%esi               // point to 1st byte not copied
        addl    %edx,%edi
        negl    %edx                    // now generate offset to 1st byte to be copied
	movl	(_COMM_PAGE_BCOPY+LTable-LZero)(,%eax,4),%eax
	jmp	*%eax

	.align	2
LTable:					// table of copy loop addresses
	.long	LMod0 + _COMM_PAGE_BCOPY - LZero
	.long	LMod1 + _COMM_PAGE_BCOPY - LZero
	.long	LMod2 + _COMM_PAGE_BCOPY - LZero
	.long	LMod3 + _COMM_PAGE_BCOPY - LZero
	.long	LMod4 + _COMM_PAGE_BCOPY - LZero
	.long	LMod5 + _COMM_PAGE_BCOPY - LZero
	.long	LMod6 + _COMM_PAGE_BCOPY - LZero
	.long	LMod7 + _COMM_PAGE_BCOPY - LZero
	.long	LMod8 + _COMM_PAGE_BCOPY - LZero
	.long	LMod9 + _COMM_PAGE_BCOPY - LZero
	.long	LMod10 + _COMM_PAGE_BCOPY - LZero
	.long	LMod11 + _COMM_PAGE_BCOPY - LZero
	.long	LMod12 + _COMM_PAGE_BCOPY - LZero
	.long	LMod13 + _COMM_PAGE_BCOPY - LZero
	.long	LMod14 + _COMM_PAGE_BCOPY - LZero
	.long	LMod15 + _COMM_PAGE_BCOPY - LZero


// Very long forward moves.  These are at least several pages.  They are special cased
// and aggressively optimized, not so much because they are common or useful, but
// because they are subject to benchmark.  There isn't enough room for them in the
// area reserved on the commpage for bcopy, so we put them elsewhere.  We call
// the longcopy routine using the normal ABI.
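//
// The call below follows the ordinary cdecl convention, i.e. it is roughly
// longcopy(dst, src, len), with the arguments pushed right to left and the
// 12 bytes of arguments popped by the caller on return.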

LVeryLong:
	pushl	%ecx			// length (>= kVeryLong)
	pushl	%esi			// source ptr
	pushl	%edi			// dest ptr
	movl	$(_COMM_PAGE_LONGCOPY),%eax
	call	*%eax			// do the long copy
	addl	$12,%esp		// pop off our parameters
	jmp	Lexit


// On Pentium-M, the microcode for "rep/movsl" is faster than SSE for 8-byte
// aligned operands from about 32KB up to kVeryLong for the hot cache case, and from
// about 256 bytes up to kVeryLong for cold caches.  This is because the microcode
// avoids having to read destination cache lines that will be completely overwritten.
// The cutoff we use (ie, kFastUCode) must somehow balance the two cases, since
// we do not know if the destination is in cache or not.
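//
// The fastpath below first undoes the LDestAligned setup: it backs %esi/%edi up
// to the first unmoved byte, negates %edx to recover the chunked length, and ORs
// the 0..63 residue back in (equivalent to adding, since the chunked length is a
// multiple of 64).  "rep/movsl" then copies length/4 doublewords, and LLeftovers
// finishes the last 0..3 bytes.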

Lfastpath:
        addl    %edx,%esi               // restore ptrs to 1st byte of source and dest
        addl    %edx,%edi
	negl	%edx			// make length positive
	orl	%edx,%ecx		// restore total #bytes remaining to move
	cld				// we'll move forward
	movl	%ecx,%edx		// copy total length to move
	shrl	$2,%ecx			// compute #words to move
	rep				// the u-code will optimize this
	movsl
	jmp	LLeftovers		// handle 0..3 leftover bytes


// Forward loop for medium length operands in which low four bits of %esi == 0000

LMod0:
	cmpl	$(-kFastUCode),%edx	// %edx == -length, where (length < kVeryLong)
	jle	Lfastpath		// long enough for fastpath in microcode
	jmp	1f
	.align	4,0x90			// 16-byte align inner loops
1:					// loop over 64-byte chunks
        movdqa  (%esi,%edx),%xmm0
        movdqa  16(%esi,%edx),%xmm1
        movdqa  32(%esi,%edx),%xmm2
        movdqa  48(%esi,%edx),%xmm3

        movdqa  %xmm0,(%edi,%edx)
        movdqa  %xmm1,16(%edi,%edx)
        movdqa  %xmm2,32(%edi,%edx)
        movdqa  %xmm3,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0001
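// Because the source sits 1 byte past a 16-byte boundary, every aligned load below
// straddles the data we want: the priming load holds source bytes [-1..14] of the
// chunk, 15(%esi,%edx) holds [15..30], and so on.  Each "palignr $1" concatenates
// two adjacent blocks and shifts right one byte, discarding the stale leading byte
// and producing 16 correctly ordered source bytes per store; %xmm0 carries the last
// aligned block into the next iteration.  The other palignr loops (LMod2..LMod15,
// except the 4/8/12 cases) differ only in their load offsets and shift counts.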

LMod1:
	movdqa	-1(%esi,%edx),%xmm0	// prime the loop by loading 1st quadword
1:					// loop over 64-byte chunks
        movdqa  15(%esi,%edx),%xmm1
        movdqa  31(%esi,%edx),%xmm2
        movdqa  47(%esi,%edx),%xmm3
        movdqa  63(%esi,%edx),%xmm4

	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$1,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$1,%xmm2,%xmm3
	palignr	$1,%xmm1,%xmm2
	palignr	$1,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0010

LMod2:
	movdqa	-2(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  14(%esi,%edx),%xmm1
        movdqa  30(%esi,%edx),%xmm2
        movdqa  46(%esi,%edx),%xmm3
        movdqa  62(%esi,%edx),%xmm4

	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$2,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$2,%xmm2,%xmm3
	palignr	$2,%xmm1,%xmm2
	palignr	$2,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0011

LMod3:
	movdqa	-3(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  13(%esi,%edx),%xmm1
        movdqa  29(%esi,%edx),%xmm2
        movdqa  45(%esi,%edx),%xmm3
        movdqa  61(%esi,%edx),%xmm4

	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$3,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$3,%xmm2,%xmm3
	palignr	$3,%xmm1,%xmm2
	palignr	$3,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0100
// We use the float single data type in order to use "movss" to merge vectors.
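// Concretely (dwords named illustratively): each movaps picks up its aligned block
// one dword early, e.g. [D-1 D0 D1 D2]; movss overwrites the stale low dword with
// the first dword of the following block, giving [D3 D0 D1 D2]; and pshufd $0x39
// rotates right 4 bytes to restore source order, [D0 D1 D2 D3].  %xmm0 carries the
// final aligned block of each chunk into the next iteration.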

LMod4:
	movaps	-4(%esi,%edx),%xmm0	// 4-byte aligned: prime the loop
	jmp	1f
	.align	4,0x90
1:					// loop over 64-byte chunks
        movaps  12(%esi,%edx),%xmm1
        movaps  28(%esi,%edx),%xmm2
	movss	%xmm1,%xmm0		// copy low 4 bytes of source into destination
	pshufd	$(0x39),%xmm0,%xmm0	// rotate right 4 bytes (mask -- 00 11 10 01)
        movaps  44(%esi,%edx),%xmm3
	movss	%xmm2,%xmm1
	pshufd	$(0x39),%xmm1,%xmm1
	movaps	60(%esi,%edx),%xmm4
	movss	%xmm3,%xmm2
	pshufd	$(0x39),%xmm2,%xmm2

        movaps  %xmm0,(%edi,%edx)
	movss	%xmm4,%xmm3
	pshufd	$(0x39),%xmm3,%xmm3
        movaps  %xmm1,16(%edi,%edx)
        movaps  %xmm2,32(%edi,%edx)
	movaps	%xmm4,%xmm0
        movaps  %xmm3,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0101

LMod5:
	movdqa	-5(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  11(%esi,%edx),%xmm1
        movdqa  27(%esi,%edx),%xmm2
        movdqa  43(%esi,%edx),%xmm3
        movdqa  59(%esi,%edx),%xmm4

	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$5,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$5,%xmm2,%xmm3
	palignr	$5,%xmm1,%xmm2
	palignr	$5,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0110

LMod6:
	movdqa	-6(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  10(%esi,%edx),%xmm1
        movdqa  26(%esi,%edx),%xmm2
        movdqa  42(%esi,%edx),%xmm3
        movdqa  58(%esi,%edx),%xmm4

	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$6,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$6,%xmm2,%xmm3
	palignr	$6,%xmm1,%xmm2
	palignr	$6,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0111

LMod7:
	movdqa	-7(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  9(%esi,%edx),%xmm1
        movdqa  25(%esi,%edx),%xmm2
        movdqa  41(%esi,%edx),%xmm3
        movdqa  57(%esi,%edx),%xmm4

	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$7,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$7,%xmm2,%xmm3
	palignr	$7,%xmm1,%xmm2
	palignr	$7,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1000
// We use the float double data type in order to use "shufpd" to shift by 8 bytes.
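// Concretely (qwords named illustratively): "shufpd $01" builds its result from the
// high quadword of the destination and the low quadword of the source, so an early
// block [Q-1 Q0] combined with the next block [Q1 Q2] yields [Q0 Q1] -- an 8-byte
// shift done entirely with aligned loads.  %xmm0 carries the last block forward.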

LMod8:
	cmpl	$(-kFastUCode),%edx	// %edx == -length, where (length < kVeryLong)
	jle	Lfastpath		// long enough for fastpath in microcode
	movapd	-8(%esi,%edx),%xmm0	// 8-byte aligned: prime the loop
	jmp	1f
	.align	4,0x90
1:					// loop over 64-byte chunks
        movapd  8(%esi,%edx),%xmm1
        movapd  24(%esi,%edx),%xmm2
	shufpd	$01,%xmm1,%xmm0		// %xmm0 <- shr( %xmm0 || %xmm1, 8 bytes)
        movapd  40(%esi,%edx),%xmm3
	shufpd	$01,%xmm2,%xmm1
	movapd	56(%esi,%edx),%xmm4
	shufpd	$01,%xmm3,%xmm2

        movapd  %xmm0,(%edi,%edx)
	shufpd	$01,%xmm4,%xmm3
        movapd  %xmm1,16(%edi,%edx)
        movapd  %xmm2,32(%edi,%edx)
	movapd	%xmm4,%xmm0
        movapd  %xmm3,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1001

LMod9:
	movdqa	-9(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  7(%esi,%edx),%xmm1
        movdqa  23(%esi,%edx),%xmm2
        movdqa  39(%esi,%edx),%xmm3
        movdqa  55(%esi,%edx),%xmm4

	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$9,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$9,%xmm2,%xmm3
	palignr	$9,%xmm1,%xmm2
	palignr	$9,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1010

LMod10:
	movdqa	-10(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  6(%esi,%edx),%xmm1
        movdqa  22(%esi,%edx),%xmm2
        movdqa  38(%esi,%edx),%xmm3
        movdqa  54(%esi,%edx),%xmm4

	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$10,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$10,%xmm2,%xmm3
	palignr	$10,%xmm1,%xmm2
	palignr	$10,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1011

LMod11:
	movdqa	-11(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  5(%esi,%edx),%xmm1
        movdqa  21(%esi,%edx),%xmm2
        movdqa  37(%esi,%edx),%xmm3
        movdqa  53(%esi,%edx),%xmm4

	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$11,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$11,%xmm2,%xmm3
	palignr	$11,%xmm1,%xmm2
	palignr	$11,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1100
// We use the float single data type in order to use "movss" to merge vectors.
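// This is the same merge-and-rotate idea as LMod4, applied in the opposite order:
// pshufd rotates each aligned block as it is loaded, leaving that block's final
// dword in the low position, and the movss chain (run last block to first so no
// dword is clobbered before it is passed on) hands each wrapped dword to the next
// block, restoring source order before the aligned stores.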

LMod12:
	movss	(%esi,%edx),%xmm0	// prefetch 1st four bytes of source, right justified
	jmp	1f
	.align	4,0x90
1:					// loop over 64-byte chunks
	pshufd	$(0x93),4(%esi,%edx),%xmm1 // load and rotate right 12 bytes (mask -- 10 01 00 11)
	pshufd	$(0x93),20(%esi,%edx),%xmm2
	pshufd	$(0x93),36(%esi,%edx),%xmm3
	pshufd	$(0x93),52(%esi,%edx),%xmm4

	movaps	%xmm4,%xmm5
	movss	%xmm3,%xmm4		// copy low 4 bytes of source into destination
	movss	%xmm2,%xmm3
	movss	%xmm1,%xmm2
	movss	%xmm0,%xmm1

        movaps  %xmm1,(%edi,%edx)
        movaps  %xmm2,16(%edi,%edx)
	movaps	%xmm5,%xmm0
        movaps  %xmm3,32(%edi,%edx)
        movaps  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1101

LMod13:
	movdqa	-13(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  3(%esi,%edx),%xmm1
        movdqa  19(%esi,%edx),%xmm2
        movdqa  35(%esi,%edx),%xmm3
        movdqa  51(%esi,%edx),%xmm4

	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$13,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$13,%xmm2,%xmm3
	palignr	$13,%xmm1,%xmm2
	palignr	$13,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1110

LMod14:
	movdqa	-14(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  2(%esi,%edx),%xmm1
        movdqa  18(%esi,%edx),%xmm2
        movdqa  34(%esi,%edx),%xmm3
        movdqa  50(%esi,%edx),%xmm4

	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$14,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$14,%xmm2,%xmm3
	palignr	$14,%xmm1,%xmm2
	palignr	$14,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1111

LMod15:
	movdqa	-15(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  1(%esi,%edx),%xmm1
        movdqa  17(%esi,%edx),%xmm2
        movdqa  33(%esi,%edx),%xmm3
        movdqa  49(%esi,%edx),%xmm4

	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$15,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$15,%xmm2,%xmm3
	palignr	$15,%xmm1,%xmm2
	palignr	$15,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Reverse moves.  These are not optimized as aggressively as their forward
// counterparts, as they are only used with destructive overlap.
//      ecx = length
//      esi = source ptr
//      edi = dest ptr

LReverse:
        addl    %ecx,%esi               // point to end of strings
        addl    %ecx,%edi
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        ja      LReverseNotShort        // yes

// Handle reverse short copies.
//      ecx = length
//      esi = one byte past end of source
//      edi = one byte past end of dest

LReverseShort:
	movl    %ecx,%edx		// copy length
	shrl	$2,%ecx			// #words
	jz	3f
1:
	subl	$4,%esi
	movl	(%esi),%eax
	subl	$4,%edi
	movl	%eax,(%edi)
	dec	%ecx
	jnz	1b
3:
	andl	$3,%edx			// bytes?
	jz	5f
4:
	dec	%esi
	movb	(%esi),%al
	dec	%edi
	movb	%al,(%edi)
	dec	%edx
	jnz	4b
5:
        movl    8(%ebp),%eax		// get return value (dst ptr) for memcpy/memmove
        popl    %edi
        popl    %esi
	popl	%ebp
        ret

// Handle a reverse move long enough to justify using SSE.
//      ecx = length
//      esi = one byte past end of source
//      edi = one byte past end of dest

LReverseNotShort:
        movl    %edi,%edx               // copy destination
        andl    $15,%edx                // get #bytes to align destination
        je      LReverseDestAligned     // already aligned
        subl	%edx,%ecx		// adjust length
1:					// loop copying 1..15 bytes
	dec	%esi
	movb	(%esi),%al
	dec	%edi
	movb	%al,(%edi)
	dec	%edx
	jnz	1b

// Destination is now aligned.  Prepare for reverse loops.

LReverseDestAligned:
        movl    %ecx,%edx               // copy length
        andl    $63,%ecx                // get remaining bytes for Lshort
        andl    $-64,%edx               // get number of bytes we will copy in inner loop
        subl    %edx,%esi               // point to endpoint of copy
        subl    %edx,%edi
	testl	$15,%esi		// is source aligned too?
        jnz     LReverseUnalignedLoop   // no

LReverseAlignedLoop:                    // loop over 64-byte chunks
        movdqa  -16(%esi,%edx),%xmm0
        movdqa  -32(%esi,%edx),%xmm1
        movdqa  -48(%esi,%edx),%xmm2
        movdqa  -64(%esi,%edx),%xmm3

        movdqa  %xmm0,-16(%edi,%edx)
        movdqa  %xmm1,-32(%edi,%edx)
        movdqa  %xmm2,-48(%edi,%edx)
        movdqa  %xmm3,-64(%edi,%edx)

        subl    $64,%edx
        jne     LReverseAlignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


// Reverse, unaligned loop.  LDDQU==MOVDQU on these machines.

LReverseUnalignedLoop:                  // loop over 64-byte chunks
        movdqu  -16(%esi,%edx),%xmm0
        movdqu  -32(%esi,%edx),%xmm1
        movdqu  -48(%esi,%edx),%xmm2
        movdqu  -64(%esi,%edx),%xmm3

        movdqa  %xmm0,-16(%edi,%edx)
        movdqa  %xmm1,-32(%edi,%edx)
        movdqa  %xmm2,-48(%edi,%edx)
        movdqa  %xmm3,-64(%edi,%edx)

        subl    $64,%edx
        jne     LReverseUnalignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


	COMMPAGE_DESCRIPTOR(bcopy_sse3x,_COMM_PAGE_BCOPY,kHasSSE2+kHasSupplementalSSE3+kCache64,kHasSSE4_2)
