/*
 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */


// ***************
// * S T R C P Y *
// ***************
//
// char *strcpy(char *dst, const char *src);
//
// We optimize the copy by moving 16 bytes at a time with vector (SSE)
// instructions.  This introduces a complication: if we blindly did vector
// loads and stores until finding a 0, we might take a spurious page fault
// by touching bytes past it.  To avoid this, we never do a load that
// crosses a page boundary, and never store a byte we don't have to.
//
// We align the destination, because unaligned vector stores are slow.
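//
// A rough C-level sketch of the control flow below (illustrative only;
// strcpy_sketch, and the use of memchr/memcpy to stand in for the SSE
// instructions, are not part of this file):
//
//	#include <stdint.h>
//	#include <string.h>
//
//	char *strcpy_sketch(char *dst, const char *src) {
//		char *const ret = dst;
//		size_t n = (-(uintptr_t)dst) & 15;	// bytes until dst is 16-byte aligned
//		for (;;) {
//			while (n) {			// LLoopOverBytes
//				n--;
//				if ((*dst++ = *src++) == '\0')
//					return ret;
//			}
//			// LNextChunk: whole 16-byte loads left in the source page
//			size_t chunks = (4096 - ((uintptr_t)src & 4095)) >> 4;
//			while (chunks--) {		// LLoopOverChunks
//				if (memchr(src, '\0', 16)) {	// pcmpeqb/pmovmskb
//					while ((*dst++ = *src++) != '\0')
//						;
//					return ret;
//				}
//				memcpy(dst, src, 16);	// movdqu load + movdqa store
//				dst += 16;
//				src += 16;
//			}
//			n = 16;				// cross the page byte-by-byte
//		}
//	}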

        .text
        .globl _strcpy

        .align	4
_strcpy:				// char *strcpy(char *dst, const char *src);
	pushl	%edi
	movl	8(%esp),%edi		// get dest ptr
	movl	12(%esp),%ecx		// get source ptr
	movl	%edi,%edx		// copy dest ptr
	negl	%edx
	andl	$15,%edx		// how many bytes to align dest ptr?
	jnz	LLoopOverBytes		// not aligned, so go do so
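// Note: the negl/andl pair above computes, in C terms (a sketch, not in
// the original): n = (-(uintptr_t)dst) & 15, i.e. the 0..15 bytes needed
// to reach the next 16-byte boundary; when 0, we fall through to LNextChunk.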


// In order to avoid spurious page faults, we loop over 16-byte chunks only
// until nearing the end of the source page.  Then we revert to a byte-by-byte
// loop for 16 bytes until the page is crossed, and resume the vector loop.
//	%ecx = source ptr (unaligned)
//	%edi = dest ptr (aligned)

LNextChunk:
	movl	%ecx,%eax		// copy source ptr
	movl	$4096,%edx
	andl	$4095,%eax		// get offset into source page
	subl	%eax,%edx		// get #bytes remaining in source page
	shrl	$4,%edx			// get #chunks till end of page
	jnz	LLoopOverChunks		// enter vector loop
	movl	$16,%edx		// move 16 bytes to cross page but keep dest aligned
	jmp	LLoopOverBytes
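// In C terms, the computation above is (a sketch; 4096 is the i386 page
// size assumed by this file):
//	chunks = (4096 - ((uintptr_t)src & 4095)) >> 4;
// A full 16-byte load is safe whenever chunks != 0, since it then cannot
// extend past the last byte of the source page.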


// Loop over bytes.
//	%ecx = source ptr
//	%edi = dest ptr
//	%edx = byte count

	.align	4,0x90			// align inner loops to optimize I-fetch
LLoopOverBytes:
	movzb	(%ecx),%eax		// get source byte
	inc	%ecx
	movb	%al,(%edi)		// pack into dest
	inc	%edi
	testl	%eax,%eax		// 0?
	jz	LDone			// yes, we're done
	dec	%edx			// more to go?
	jnz	LLoopOverBytes

	jmp	LNextChunk		// we've come to end of page
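// C equivalent of the loop above (a sketch, not in the original; note the
// 0 byte, if any, is stored before the test, and the count is checked last):
//	do {
//		unsigned char b = *src++;
//		*dst++ = b;
//		if (b == 0) goto done;
//	} while (--n);
//	// falls through here only at a page boundary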


// Loop over 16-byte chunks.
//	%ecx = source ptr (unaligned)
//	%edi = dest ptr (aligned)
//	%edx = chunk count

	.align	4,0x90			// align inner loops to optimize I-fetch
LLoopOverChunks:
	movdqu	(%ecx),%xmm1		// get source
	pxor	%xmm0,%xmm0		// get some 0s
	addl	$16,%ecx
	pcmpeqb	%xmm1,%xmm0		// compare source to 0s
	pmovmskb %xmm0,%eax		// get result mask for 0 check
	testl	%eax,%eax		// any 0s?
	jnz	LFound0			// yes, exit loop
	movdqa	%xmm1,(%edi)		// no 0s so do aligned store into destination
	addl	$16,%edi
	dec	%edx			// more to go?
	jnz	LLoopOverChunks

	movl	$16,%edx		// move 16 bytes
	jmp	LLoopOverBytes		// cross page but keep dest aligned
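// One iteration of LLoopOverChunks, written with SSE2 intrinsics from
// <emmintrin.h> (a sketch for illustration; these names are not used by
// this file):
//	__m128i v    = _mm_loadu_si128((const __m128i *)src);	// movdqu
//	__m128i zcmp = _mm_cmpeq_epi8(v, _mm_setzero_si128());	// pxor + pcmpeqb
//	int     mask = _mm_movemask_epi8(zcmp);			// pmovmskb
//	if (mask)						// testl/jnz
//		goto found0;			// bit i set => source byte i is 0
//	_mm_store_si128((__m128i *)dst, v);	// movdqa (dst is aligned)
//	src += 16;
//	dst += 16;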


// Found a zero in the vector.  Figure out where it is, and store the bytes
// up to it.
//	%edi = dest ptr (aligned)
//	%eax = result mask
//	%xmm1 = source vector

LFound0:
	bsf	%eax,%edx		// find first 0
	inc	%edx			// we need to store the 0 too
	test	$16,%dl			// was 0 last byte?
	jz	8f			// no
	movdqa	%xmm1,(%edi)		// yes, store entire vector
	jmp	LDone
8:
	test	$8,%dl			// 8-byte store required?
	jz	4f			// no
	movq	%xmm1,(%edi)		// pack in 8 low bytes
	psrldq	$8,%xmm1		// then shift vector down 8 bytes
	addl	$8,%edi
4:
	test	$4,%dl			// 4-byte store required?
	jz	3f			// no
	movd	%xmm1,(%edi)		// pack in 4 low bytes
	psrldq	$4,%xmm1		// then shift vector down 4 bytes
	addl	$4,%edi
3:
	andl	$3,%edx			// more to go?
	jz	LDone			// no
	movd	%xmm1,%eax		// move remainders out of vector into %eax
1:					// loop on up to three bytes
	movb	%al,(%edi)		// pack in next byte
	shrl	$8,%eax			// shift next byte into position
	inc	%edi
	dec	%edx
	jnz	1b
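// In C terms, the store sequence above decomposes the count n = 1..16 by
// its binary digits (a sketch, not in the original; buf holds the 16
// loaded source bytes, p and n are illustrative names):
//	const unsigned char *p = buf;
//	if (n & 16) {
//		memcpy(dst, p, 16);			// movdqa: 0 was byte 15
//	} else {
//		if (n & 8) { memcpy(dst, p, 8); p += 8; dst += 8; }	// movq + psrldq
//		if (n & 4) { memcpy(dst, p, 4); p += 4; dst += 4; }	// movd + psrldq
//		for (n &= 3; n; n--)			// up to 3 bytes from %eax
//			*dst++ = *p++;
//	}
// At most one store of each size runs, so no byte past the terminating 0
// is ever written.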

LDone:
	movl	8(%esp),%eax		// original dest ptr is return value
	popl	%edi
	ret