1/*
2 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23
24#include <machine/cpu_capabilities.h>
25
26
// *****************
// * S T R N C P Y *
// *****************
//
// char  *strncpy(char *dst, const char *src, size_t n);
//
// We optimize the move by doing it vector parallel.  This introduces
// a complication: if we blindly did vector load/stores until finding
// a 0, we might get a spurious page fault by touching bytes past it.
// To avoid this, we never do a vector load that crosses a page boundary,
// and never store a byte we don't have to.
//
// We align the destination, because unaligned vector stores are slow.
//
// Recall that strncpy() zero fills the remainder of the dest buffer,
// and does not terminate the string if its length is greater than or
// equal to n.
44
#define	kShort		31		// too short to bother with vector loop
#define	kPageSize	4096		// source page size used to avoid loads that cross a page
#define	kVecSize	16		// bytes per vector ("chunk")
#define	kVecShift	4		// log2(kVecSize)

// ---------------------------------------------------------------------------
// char *strncpy(char *dst, const char *src, size_t n);
//
// Syntax:  AT&T, i386, preprocessed with cpp.
// In:      arguments are read from the stack: dst, src, n at 4/8/12(%esp)
//          on entry (12/16/20(%esp) after the two register saves below).
// Out:     %eax = original dst.
// Saved:   %edi, %esi (pushed on entry, popped on exit).
// Uses:    %eax, %ecx, %edx, %xmm0, %xmm1, flags.
// Calls:   _bzero, to zero-fill whatever is left of the buffer once the
//          terminating 0 has been copied.
// Stack:   the realignment before the _bzero call assumes %esp was 0 mod 16
//          immediately before the call that entered this routine.
//
// Strategy: align dst with the byte loop, then move 16-byte chunks with an
// unaligned load / aligned store until either the buffer or the current
// source page is nearly exhausted; use the byte loop to step over a source
// page boundary, then resume the chunk loop.
// ---------------------------------------------------------------------------

        .text
        .globl _strncpy

        .align 	4
_strncpy:				// char  *strncpy(char *dst, const char *src, size_t n);
	pushl	%edi
	pushl	%esi
	movl	12(%esp),%edi		// get dest ptr
	movl	16(%esp),%esi		// get source ptr
	movl	20(%esp),%ecx		// get length
	movl	%edi,%edx		// copy dest ptr
	negl	%edx
	andl	$(kVecSize-1),%edx	// %edx = (-dst) mod 16 = #bytes needed to align dest ptr
	jnz	LCheckShortCopy		// dest is unaligned: align it first with the byte loop


// In order to avoid spurious page faults, we loop until nearing the source page
// end.  Then we revert to a byte-by-byte loop for 16 bytes until the page is crossed,
// then resume the vector loop.
//	%esi = source ptr (unaligned)
//	%edi = dest ptr (aligned)
//	%ecx = buffer length remaining

LNextChunk:				// NB: can drop down to here
	movl	%esi,%eax		// copy source ptr
	movl	$(kPageSize),%edx
	andl	$(kPageSize-1),%eax	// get offset into source page
	subl	%eax,%edx		// get #bytes remaining in source page
	cmpl	%ecx,%edx		// will buffer run out before the page end?
	cmova	%ecx,%edx		// get min(length remaining, bytes to page end)
	shrl	$(kVecShift),%edx	// get #whole chunks we may load without leaving page or buffer
	jnz	LLoopOverChunks		// enter vector loop

// We can't use the chunk loop yet.  Check for short and empty buffers, then use byte loop.

LCrossPage:				// if buffer is large enough, cross source page
	movl	$(kVecSize),%edx	// move 16 bytes to cross page but keep dest aligned
LCheckShortCopy:			// we propose to copy %edx bytes in byte loop
	cmpl	$(kShort),%ecx		// much left?
	ja	LLoopOverBytes		// yes, loop over bytes then more chunks
	movl	%ecx,%edx		// no, use the byte loop for everything
	testl	%ecx,%ecx		// have we filled buffer?
	jnz	LLoopOverBytes		// no
	jmp	LDone			// yes (also taken when n == 0): nothing left to store


// Loop over bytes.
//	%esi = source ptr
//	%edi = dest ptr
//	%ecx = buffer length remaining
//	%edx = count of bytes to loop over (<= buffer length)

	.align	4,0x90			// align inner loops to optimize I-fetch
LLoopOverBytes:
	movzb	(%esi),%eax		// get source byte
	inc	%esi
	dec	%ecx			// decrement length
	movb	%al,(%edi)		// pack into dest
	inc	%edi
	testl	%eax,%eax		// 0?
	jz	LZeroBuffer		// yes, we're done copying string; the 0 itself is already stored
	dec	%edx			// more to go?
	jnz	LLoopOverBytes

	testl	%ecx,%ecx		// at end of buffer?
	jnz	LNextChunk		// no, xfer chunks
	jmp	LDone			// yes


// Loop over 16-byte chunks.
//	%esi = source ptr (unaligned)
//	%edi = dest ptr (aligned)
//	%ecx = buffer length remaining (>= 16 on every iteration, by construction
//	       of the chunk count in LNextChunk)
//	%edx = chunk count

	.align	4,0x90			// align inner loops to optimize I-fetch
LLoopOverChunks:
	movdqu	(%esi),%xmm1		// get source
	pxor	%xmm0,%xmm0		// get some 0s
	addl	$(kVecSize),%esi
	pcmpeqb	%xmm1,%xmm0		// compare source to 0s
	pmovmskb %xmm0,%eax		// get result mask for 0 check
	testl	%eax,%eax		// any 0s?
	jnz	LFound0			// yes, exit loop
	movdqa	%xmm1,(%edi)		// no 0s so do aligned store into destination
	addl	$(kVecSize),%edi
	subl	$(kVecSize),%ecx	// decrement length remaining
	dec	%edx			// more to go?
	jnz	LLoopOverChunks

	jmp	LCrossPage		// cross page but keep dest aligned


// Found a zero in the vector.  Figure out where it is, and store the bytes
// up to it.  It is possible that we should check to be sure (%ecx >= 16), and
// just do an aligned store of %xmm1 if so.  But if we did, we'd be doing byte
// stores into the same double quadword in bzero(), which might hit a hazard.
// Experimentation needed.
//
// Only the bytes that precede the 0 are stored here (8, then 4, then up to 3
// single bytes, selected by the bits of the 0's index).  The 0 itself is
// written by the zero fill below, which is why %ecx is reduced only by the
// index of the 0 and not by index+1.
//	%edi = dest ptr (aligned)
//	%eax = result mask
//	%ecx = buffer length remaining
//	%xmm1 = source vector

LFound0:
	bsf	%eax,%edx		// find first 0: %edx = its byte index (0..15)
	subl	%edx,%ecx		// decrement remaining buffer length
	test	$8,%dl			// 8-byte store required?
	jz	4f			// no
	movq	%xmm1,(%edi)		// pack in 8 low bytes
	psrldq	$8,%xmm1		// then shift vector down 8 bytes
	addl	$8,%edi
4:
	test	$4,%dl			// 4-byte store required?
	jz	3f			// no
	movd	%xmm1,(%edi)		// pack in 4 low bytes
	psrldq	$4,%xmm1		// then shift vector down 4 bytes
	addl	$4,%edi
3:
	andl	$3,%edx			// more to go?
	jz	LZeroBuffer		// no
	movd	%xmm1,%eax		// move remainders out of vector into %eax
1:					// loop on up to three bytes
	movb	%al,(%edi)		// pack in next byte
	shrl	$8,%eax			// shift next byte into position
	inc	%edi
	dec	%edx
	jnz	1b

// We've copied the string.  Now zero the rest of the buffer by calling bzero().
//	%edi = dest ptr
//	%ecx = buffer length remaining

LZeroBuffer:
//	The stack currently is aligned to 4 mod 16 (it was 0 mod 16 at the time of
//	the call, and the return address, edi, and esi have been pushed).  It needs
//	to be aligned 0 mod 16 when we call bzero, so we subtract 20 from esp (not 4
//	because we need to have 8 bytes for the arguments to bzero).
	subl	$20,%esp
	movl	%ecx,4(%esp)	// remaining buffer size
	movl	%edi, (%esp)	// pointer to first unstored byte
	call	_bzero
	addl	$20,%esp

LDone:
	movl	12(%esp),%eax		// original dest ptr is return value (reloaded from the caller's argument slot)
	popl	%esi			// restore in reverse order of the pushes on entry
	popl	%edi
	ret
195