/*
 * Copyright (c) 2007 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */


// *****************
// * S T R L C P Y *
// *****************
//
// size_t  strlcpy(char *dst, const char *src, size_t size);
//
// We optimize the move by doing it word parallel.  This introduces
// a complication: if we blindly did word load/stores until finding
// a 0, we might get a spurious page fault by touching bytes past it.
// To avoid this, we never do a load that crosses a page boundary,
// or store unnecessary bytes.
//
// The test for 0s relies on the following non-obvious but very efficient
// word-parallel test:
//		x =  dataWord + 0xFEFEFEFF
//		y = ~dataWord & 0x80808080
//		if (x & y) == 0 then no zero found
// The test maps any non-zero byte to zero, and any zero byte to 0x80,
// with one exception: a run of 0x01 bytes immediately following a zero
// byte in memory is also mapped to 0x80, because on little-endian x86
// the borrow out of the zero byte propagates into the next higher bytes.
// This can only happen when the word really does contain a zero, so the
// zero/no-zero answer for the word as a whole is always exact.
//
// On Core2 class machines, this word-parallel implementation seems to
// be slightly faster than using SSE up to about 100 bytes.
// It is faster than the naive byte-by-byte implementation for
// operands longer than about 8 bytes.

// Target: 32-bit x86 (i386), AT&T syntax.
//
// size_t strlcpy(char *dst, const char *src, size_t size);
//
//	In:	arguments on the stack; after the three pushes below they are at
//		16(%esp) = dst, 20(%esp) = src, 24(%esp) = size
//	Out:	%eax = length of the source string (0-byte not counted),
//		whether or not the whole string fit in the buffer
//	Saved:	%edi, %esi, %ebx are pushed on entry and popped on exit
//	Scratch: %eax, %ecx, %edx, flags

        .text
        .globl _strlcpy

        .align	4
_strlcpy:				// size_t strlcpy(char *dst, const char *src, size_t size);
	pushl	%edi
	pushl	%esi
	pushl	%ebx
	movl	16(%esp),%edi		// get dest ptr
	movl	20(%esp),%esi		// get source ptr
	movl	24(%esp),%ecx		// get length of buffer
	movl	%esi,%edx		// copy source ptr
	negl	%edx
	andl	$3,%edx			// (-src) & 3 = how many bytes to align source ptr?
	jz	LAligned		// already aligned


// Loop over bytes.
//	%edi = dest ptr
//	%esi = source ptr
//	%ecx = length remaining in buffer
//	%edx = number of bytes to copy (>0, may not fit in buffer)
// The source byte is loaded before the buffer-full check; L0NotFound
// re-reads it because %esi has not been advanced yet at that point.

LLoopOverBytes:
	movzbl	(%esi),%eax		// get source byte before checking buffer length
	testl	%ecx,%ecx		// buffer full?
	jz	L0NotFound		// yes
	incl	%esi
	decl	%ecx
	movb	%al,(%edi)		// pack into dest
	incl	%edi
	testl	%eax,%eax		// 0?
	jz	LDone			// yes, done
	decl	%edx			// more to go?
	jnz	LLoopOverBytes


// Source is aligned.  Loop over words until end of buffer.  We
// align the source, rather than the dest, to avoid getting spurious page faults.
//	%edi = dest ptr (unaligned)
//	%esi = source ptr (word aligned)
//	%ecx = length remaining in buffer

LAligned:
	// With fewer than 4 bytes of room the byte loop runs at most 4 times
	// before it leaves through L0NotFound or LDone, so a count of 5 is
	// never decremented to zero and cannot bring us back here.
	movl	$5,%edx			// if buffer almost exhausted, prepare to copy rest byte-by-byte
	cmpl	$4,%ecx			// enough for at least one word?
	jb	LLoopOverBytes


// Loop over words.
//	%edi = dest ptr (unaligned)
//	%esi = source ptr (word aligned)
//	%ecx = length remaining in buffer (>=4)

LLoopOverWords:
	movl	(%esi),%eax		// get next 4 bytes of source
	subl	$4,%ecx
	addl	$4,%esi
	movl	%eax,%edx		// make 2 copies of word
	movl	%eax,%ebx
	notl	%edx			// use magic word-parallel test for 0s
	addl	$0xFEFEFEFF,%ebx	// x =  dataWord + 0xFEFEFEFF
	andl	$0x80808080,%edx	// y = ~dataWord & 0x80808080
	testl	%ebx,%edx		// (x & y) != 0  <=>  word contains a 0-byte
	jnz	L0Found			// one of the bytes of %eax is a 0
	movl	%eax,(%edi)		// pack 4 bytes into destination
	addl	$4,%edi
	cmpl	$4,%ecx			// room in buffer for another word?
	jae	LLoopOverWords		// yes

	// 0..3 bytes of room left.  If it is 0 the byte loop exits through its
	// buffer-full check before %edx is ever examined.
	movl	%ecx,%edx		// copy leftovers in byte loop
	jmp	LLoopOverBytes

// Found a 0-byte in the word of source.  Store a byte at a time until the 0.
// The buffer had room for all 4 bytes of this word, so these stores fit.
//	%edi = dest ptr (unaligned)
//	%eax = last word of source, known to have a 0-byte

LNextByte:
	shrl	$8,%eax			// next byte
L0Found:
	movb	%al,(%edi)		// pack in next byte
	incl	%edi
	testb	%al,%al			// 0?
	jnz	LNextByte

// Done storing string.
//	%edi = ptr to byte after 0-byte

LDone:
	subl	16(%esp),%edi		// subtract original dest ptr to get length stored
	decl	%edi			// don't count the 0-byte
	movl	%edi,%eax		// copy to return value
LExit:
	popl	%ebx
	popl	%esi
	popl	%edi
	ret

// Buffer filled but 0-byte not found.  We return the length of the source string.
// This is not optimized, as it is an error condition.
//	%edi = dest ptr (ie, 1 past end of buffer)
//	%esi = source ptr (ptr to 1st byte that does not fit)
// %eax starts at the buffer length, which is also the number of source
// bytes consumed so far, and is then advanced once per remaining source byte.

L0NotFound:
	movl	24(%esp),%eax		// reload buffer length
	testl	%eax,%eax		// null?
	jz	1f			// yes, cannot store a 0
	xorl	%edx,%edx		// get a 0
	movb	%dl,-1(%edi)		// store a 0 at end of buffer to delimit string
1:
	movzbl	(%esi),%edx		// get next byte of source
	incl	%esi
	incl	%eax
	testl	%edx,%edx		// 0?
	jnz	1b
	decl	%eax			// don't count the 0-byte
	jmp	LExit
