/*
 * Copyright (c) 2008 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * The bcopy/memcpy loops, tuned for Nehalem.  This is the 64-bit version.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort  80			// too short to bother with SSE (must be >=80)

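// As a rough guide to the control flow implemented below, the top-level
// dispatch is approximately the following C (an illustrative sketch only;
// reverse_copy/short_copy/sse_forward_copy are hypothetical helper names,
// not symbols in this file):
//
//	void *memmove_sketch(void *dst, const void *src, size_t len) {
//		if ((uintptr_t)dst - (uintptr_t)src < len)	// destructive overlap
//			return reverse_copy(dst, src, len);	// copy backwards
//		if (len <= kShort)
//			return short_copy(dst, src, len);	// simple 4-byte/1-byte loops
//		return sse_forward_copy(dst, src, len);		// 64 bytes per iteration
//	}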

// void bcopy(const void *src, void *dst, size_t len);

        .text
	.code64
        .align 5, 0x90
Lbcopy_sse42_64:				// void bcopy(const void *src, void *dst, size_t len)
	pushq	%rbp			// set up a frame for backtraces
	movq	%rsp,%rbp
	movq	%rsi,%rax		// copy dest ptr
	movq	%rdi,%rsi		// exchange source and dest ptrs
	movq	%rax,%rdi
        subq    %rsi,%rax               // (dest - source)
        cmpq    %rdx,%rax               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        jbe     LShort			// no
	jmp	LNotShort

//
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//
// NB: These need to be 32 bytes from bcopy():
//

        .align	5, 0x90
Lmemcpy:				// void *memcpy(void *dst, const void *src, size_t len)
Lmemmove:				// void *memmove(void *dst, const void *src, size_t len)
	pushq	%rbp			// set up a frame for backtraces
	movq	%rsp,%rbp
	movq	%rdi,%r11		// save return value here
        movq    %rdi,%rax
        subq    %rsi,%rax               // (dest - source)
        cmpq    %rdx,%rax               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        ja      LNotShort               // yes

// Handle short forward copies.  As the most common case, this is the fall-through path.
//      rdx = length (<= kShort)
//      rsi = source ptr
//      rdi = dest ptr

LShort:
	movl    %edx,%ecx		// copy length using 32-bit operation
	shrl	$2,%ecx			// get #doublewords
	jz	3f
2:					// loop copying doublewords
	movl	(%rsi),%eax
	addq	$4,%rsi
	movl	%eax,(%rdi)
	addq	$4,%rdi
	decl	%ecx
	jnz	2b
3:					// handle leftover bytes (0..3) in last word
	andl	$3,%edx			// any leftover bytes?
	jz	5f
4:					// loop copying bytes
	movb	(%rsi),%al
	incq	%rsi
	movb	%al,(%rdi)
	incq	%rdi
	decl	%edx
	jnz	4b
5:
        movq	%r11,%rax		// get return value (dst ptr) for memcpy/memmove
	popq	%rbp
        ret
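
// For reference, the short forward path above is roughly this C (an
// illustrative sketch only; the assembly performs the unaligned 32-bit
// accesses directly, which x86 permits):
//
//	for (size_t n = len >> 2; n != 0; n--) {	// copy doublewords
//		*(uint32_t *)d = *(const uint32_t *)s;
//		s += 4;  d += 4;
//	}
//	for (size_t n = len & 3; n != 0; n--)		// 0..3 leftover bytes
//		*d++ = *s++;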


LReverseIsland:				// keep the "jb" above a short branch...
	jmp	LReverse		// ...because reverse moves are uncommon


// Handle forward moves that are long enough to justify use of SSE.
// First, 16-byte align the destination.
//      rdx = length (> kShort)
//      rsi = source ptr
//      rdi = dest ptr

LNotShort:
        movl    %edi,%ecx               // copy low half of destination ptr
        negl    %ecx
        andl    $15,%ecx                // get #bytes to align destination
	jz	LDestAligned		// already aligned
        subl    %ecx,%edx               // decrement length
1:					// loop copying 1..15 bytes
	movb	(%rsi),%al
	inc	%rsi
	movb	%al,(%rdi)
	inc	%rdi
	dec	%ecx
	jnz	1b


// Destination is now aligned.  Nehalem does a great job with unaligned SSE loads,
// so we use MOVDQU rather than aligned loads and shifts.  Since kShort>=80, we
// know there is at least one 64-byte chunk to move.
// When we enter the copy loops, the following registers are set up:
//      rdx = residual length (0..63)
//	rcx = -(length to move), a multiple of 64 less than 2GB
//      rsi = ptr to 1st source byte not to move (unaligned)
//      rdi = ptr to 1st dest byte not to move (aligned)

LDestAligned:
        movq    %rdx,%rcx               // copy length
        andl    $63,%edx                // get remaining bytes for LShort
        andq    $-64,%rcx               // get number of bytes we will copy in inner loop
        addq    %rcx,%rsi               // point to 1st byte not copied
        addq    %rcx,%rdi
        negq    %rcx                    // now generate offset to 1st byte to be copied
	testl	$15,%esi		// source also aligned?
	jnz	LUnalignedLoop
	jmp	LAlignedLoop


// Forward loop for aligned operands.

	.align	4,0x90			// 16-byte align inner loops
LAlignedLoop:				// loop over 64-byte chunks
        movdqa  (%rsi,%rcx),%xmm0
        movdqa  16(%rsi,%rcx),%xmm1
        movdqa  32(%rsi,%rcx),%xmm2
        movdqa  48(%rsi,%rcx),%xmm3

        movdqa  %xmm0,(%rdi,%rcx)
        movdqa  %xmm1,16(%rdi,%rcx)
        movdqa  %xmm2,32(%rdi,%rcx)
        movdqa  %xmm3,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     LAlignedLoop

        jmp     LShort                  // copy remaining 0..63 bytes and done
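
// The negative-index idiom above (rcx counts up from -(chunked length) to
// zero while rsi/rdi point just past the region) is roughly equivalent to
// this C (illustrative sketch only):
//
//	size_t chunk = len & ~(size_t)63;		// whole 64-byte chunks
//	const char *s_end = s + chunk;
//	char *d_end = d + chunk;
//	for (intptr_t c = -(intptr_t)chunk; c != 0; c += 64)
//		memcpy(d_end + c, s_end + c, 64);	// one 64-byte chunk
//	len &= 63;					// residue goes through LShort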


// Forward loop for unaligned operands.

	.align	4,0x90			// 16-byte align inner loops
LUnalignedLoop:				// loop over 64-byte chunks
        movdqu  (%rsi,%rcx),%xmm0
        movdqu  16(%rsi,%rcx),%xmm1
        movdqu  32(%rsi,%rcx),%xmm2
        movdqu  48(%rsi,%rcx),%xmm3

        movdqa  %xmm0,(%rdi,%rcx)
        movdqa  %xmm1,16(%rdi,%rcx)
        movdqa  %xmm2,32(%rdi,%rcx)
        movdqa  %xmm3,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     LUnalignedLoop

        jmp     LShort                  // copy remaining 0..63 bytes and done
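
// One iteration of the unaligned loop above corresponds roughly to these
// SSE2 intrinsics (illustrative only; requires <emmintrin.h>, and assumes
// dst is 16-byte aligned as established at LNotShort):
//
//	__m128i x0 = _mm_loadu_si128((const __m128i *)(src +  0));	// movdqu
//	__m128i x1 = _mm_loadu_si128((const __m128i *)(src + 16));
//	__m128i x2 = _mm_loadu_si128((const __m128i *)(src + 32));
//	__m128i x3 = _mm_loadu_si128((const __m128i *)(src + 48));
//	_mm_store_si128((__m128i *)(dst +  0), x0);	// movdqa, aligned store
//	_mm_store_si128((__m128i *)(dst + 16), x1);
//	_mm_store_si128((__m128i *)(dst + 32), x2);
//	_mm_store_si128((__m128i *)(dst + 48), x3);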


// Reverse moves.  These are only used with destructive overlap.
//      rdx = length
//      rsi = source ptr
//      rdi = dest ptr

LReverse:
        addq    %rdx,%rsi               // point to end of strings
        addq    %rdx,%rdi
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        ja      LReverseNotShort        // yes

// Handle reverse short copies.
//      edx = length (<= kShort)
//      rsi = one byte past end of source
//      rdi = one byte past end of dest

LReverseShort:
	movl    %edx,%ecx		// copy length
	shrl	$3,%ecx			// #quadwords
	jz	3f
1:
	subq	$8,%rsi
	movq	(%rsi),%rax
	subq	$8,%rdi
	movq	%rax,(%rdi)
	decl	%ecx
	jnz	1b
3:
	andl	$7,%edx			// bytes?
	jz	5f
4:
	decq	%rsi
	movb	(%rsi),%al
	decq	%rdi
	movb	%al,(%rdi)
	decl	%edx
	jnz	4b
5:
        movq	%r11,%rax		// get return value (dst ptr) for memcpy/memmove
	popq	%rbp
        ret
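
// The short reverse path above is roughly this C (illustrative sketch only;
// s and d start one byte past the end of their buffers, as set up at LReverse):
//
//	for (size_t n = len >> 3; n != 0; n--) {	// copy quadwords backwards
//		s -= 8;  d -= 8;
//		*(uint64_t *)d = *(const uint64_t *)s;
//	}
//	for (size_t n = len & 7; n != 0; n--)		// 0..7 leftover bytes
//		*--d = *--s;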

// Handle a reverse move long enough to justify using SSE.
//      rdx = length (> kShort)
//      rsi = one byte past end of source
//      rdi = one byte past end of dest

LReverseNotShort:
        movl    %edi,%ecx               // copy destination
        andl    $15,%ecx                // get #bytes to align destination
        jz      LReverseDestAligned     // already aligned
        subq	%rcx,%rdx		// adjust length
1:					// loop copying 1..15 bytes
	decq	%rsi
	movb	(%rsi),%al
	decq	%rdi
	movb	%al,(%rdi)
	decl	%ecx
	jnz	1b

// Destination is now aligned.  Prepare for reverse loops.

LReverseDestAligned:
        movq    %rdx,%rcx               // copy length
        andl    $63,%edx                // get remaining bytes for LReverseShort
        andq    $-64,%rcx               // get number of bytes we will copy in inner loop
        subq    %rcx,%rsi               // point to endpoint of copy
        subq    %rcx,%rdi
	testl	$15,%esi		// is source aligned too?
        jnz     LReverseUnalignedLoop   // no

LReverseAlignedLoop:                    // loop over 64-byte chunks
        movdqa  -16(%rsi,%rcx),%xmm0
        movdqa  -32(%rsi,%rcx),%xmm1
        movdqa  -48(%rsi,%rcx),%xmm2
        movdqa  -64(%rsi,%rcx),%xmm3

        movdqa  %xmm0,-16(%rdi,%rcx)
        movdqa  %xmm1,-32(%rdi,%rcx)
        movdqa  %xmm2,-48(%rdi,%rcx)
        movdqa  %xmm3,-64(%rdi,%rcx)

        subq    $64,%rcx
        jne     LReverseAlignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


// Reverse, unaligned loop.  LDDQU==MOVDQU on these machines.

LReverseUnalignedLoop:                  // loop over 64-byte chunks
        movdqu  -16(%rsi,%rcx),%xmm0
        movdqu  -32(%rsi,%rcx),%xmm1
        movdqu  -48(%rsi,%rcx),%xmm2
        movdqu  -64(%rsi,%rcx),%xmm3

        movdqa  %xmm0,-16(%rdi,%rcx)
        movdqa  %xmm1,-32(%rdi,%rcx)
        movdqa  %xmm2,-48(%rdi,%rcx)
        movdqa  %xmm3,-64(%rdi,%rcx)

        subq    $64,%rcx
        jne     LReverseUnalignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


	COMMPAGE_DESCRIPTOR(bcopy_sse42_64,_COMM_PAGE_BCOPY,kHasSSE4_2,0)