1/*
2 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23
24#include <machine/cpu_capabilities.h>
25
26
27// *****************
28// * S T R N C P Y *
29// *****************
30//
// char  *strncpy(char *dst, const char *src, size_t n);
32//
33// We optimize the move by doing it vector parallel.  This introduces
34// a complication: if we blindly did vector load/stores until finding
35// a 0, we might get a spurious page fault by touching bytes past it.
36// To avoid this, we never do a load that crosses a page boundary,
37// and never store a byte we don't have to.
38//
39// We align the destination, because unaligned vector stores are slow.
40//
41// Recall that strncpy() zero fills the remainder of the dest buffer,
42// and does not terminate the string if its length is greater than or
43// equal to n.
44
#define	kShort		31		// too short to bother with vector loop
#define	kPageSize	4096		// page size assumed when avoiding loads that cross a source page

//-----------------------------------------------------------------------
// char *strncpy(char *dst, const char *src, size_t n);
//
// Syntax: AT&T.  ABI: System V AMD64.
// In:     %rdi = dst, %rsi = src, %rdx = n
// Out:    %rax = original dst
// Clobb:  %rcx, %rdx, %rsi, %rdi, %r8, %xmm0, %xmm1, flags, plus
//         whatever _bzero clobbers.
//
// %r8 carries the original dst for the return value.  It is a
// caller-saved register, so it is pushed around the call to _bzero
// rather than assumed to survive it.
//-----------------------------------------------------------------------

        .text
        .globl _strncpy

        .align 	4
_strncpy:				// char  *strncpy(char *dst, const char *src, size_t n);
	movq	%rdi,%r8		// preserve destination pointer so we can return it
	movl	%edi,%ecx		// copy low 4 bytes of dest ptr
	negl	%ecx
	andl	$15,%ecx		// how many bytes to align dest ptr?
	jnz	LCheckShortCopy		// align destination first


// In order to avoid spurious page faults, we loop until nearing the source page
// end.  Then we revert to a byte-by-byte loop for 16 bytes until the page is crossed,
// then resume the vector loop.
//	%rsi = source ptr (unaligned)
//	%rdi = dest ptr (aligned)
//	%rdx = buffer length remaining

LNextChunk:				// NB: can drop down to here
	movl	%esi,%eax		// copy the low 4 bytes of the source ptr
	movl	$(kPageSize),%ecx
	andl	$(kPageSize-1),%eax	// get offset into source page
	subl	%eax,%ecx		// get #bytes remaining in source page
	cmpq	%rdx,%rcx		// will buffer run out before the page end?
	cmova	%rdx,%rcx		// get min(length remaining, bytes to page end)
	shrl	$4,%ecx			// get #chunks till end of page
	jnz	LLoopOverChunks		// enter vector loop

// We can't use the chunk loop yet.  Check for short and empty buffers, then use byte loop.

LCrossPage:				// if buffer is large enough, cross source page
	movl	$16,%ecx		// move 16 bytes to cross page but keep dest aligned
LCheckShortCopy:			// we propose to copy %ecx bytes in byte loop
	cmpq	$(kShort),%rdx		// much left?
	ja	LLoopOverBytes		// yes, loop over bytes then more chunks
	movl	%edx,%ecx		// no, use the byte loop for everything
	testl	%edx,%edx		// have we filled buffer? (%rdx <= kShort here, so 32 bits suffice)
	jnz	LLoopOverBytes		// no
	jmp	LDone


// Loop over bytes.
//	%rsi = source ptr
//	%rdi = dest ptr
//	%rdx = buffer length remaining
//	%rcx = count of bytes to loop over (<= buffer length)

	.align	4,0x90			// align inner loops to optimize I-fetch
LLoopOverBytes:
	movzbl	(%rsi),%eax		// get source byte
	addq	$1,%rsi
	subq	$1,%rdx			// decrement length
	movb	%al,(%rdi)		// pack into dest
	addq	$1,%rdi
	testl	%eax,%eax		// 0?
	jz	LZeroBuffer		// yes, we're done copying string
	subq	$1,%rcx			// more to go?
	jnz	LLoopOverBytes

	testq	%rdx,%rdx		// at end of buffer?
	jnz	LNextChunk		// no, xfer chunks
	jmp	LDone			// yes


// Loop over 16-byte chunks.
//	%rsi = source ptr (unaligned)
//	%rdi = dest ptr (aligned)
//	%rdx = buffer length remaining
//	%ecx = chunk count

	.align	4,0x90			// align inner loops to optimize I-fetch
LLoopOverChunks:
	movdqu	(%rsi),%xmm1		// get source
	pxor	%xmm0,%xmm0		// get some 0s
	addq	$16,%rsi
	pcmpeqb	%xmm1,%xmm0		// compare source to 0s
	pmovmskb %xmm0,%eax		// get result mask for 0 check
	testl	%eax,%eax		// any 0s?
	jnz	LFound0			// yes, exit loop
	movdqa	%xmm1,(%rdi)		// no 0s so do aligned store into destination
	addq	$16,%rdi
	subq	$16,%rdx		// decrement length remaining
	subl	$1,%ecx			// more to go?
	jnz	LLoopOverChunks

	jmp	LCrossPage		// cross page but keep dest aligned


// Found a zero in the vector.  Figure out where it is, and store the bytes
// up to it.  It is possible that we should check to be sure (%rdx >= 16), and
// just do an aligned store of %xmm1 if so.  But if we did, we'd be doing byte
// stores into the same double quadword in bzero(), which might hit a hazard.
// Experimentation needed.
//	%rdi = dest ptr (aligned)
//	%eax = result mask
//	%rdx = buffer length remaining
//	%xmm1 = source vector

LFound0:
	bsf	%eax,%ecx		// find first 0 (= number of nonzero bytes before it)
	subq	%rcx,%rdx		// decrement remaining buffer length
	testb	$8,%cl			// 8-byte store required?
	jz	4f			// no
	movq	%xmm1,(%rdi)		// pack in 8 low bytes
	psrldq	$8,%xmm1		// then shift vector down 8 bytes
	addq	$8,%rdi
4:
	testb	$4,%cl			// 4-byte store required?
	jz	3f			// no
	movd	%xmm1,(%rdi)		// pack in 4 low bytes
	psrldq	$4,%xmm1		// then shift vector down 4 bytes
	addq	$4,%rdi
3:
	andl	$3,%ecx			// more to go?
	jz	LZeroBuffer		// no
	movd	%xmm1,%eax		// move remainders out of vector into %eax
1:					// loop on up to three bytes
	movb	%al,(%rdi)		// pack in next byte
	shrl	$8,%eax			// shift next byte into position
	addq	$1,%rdi
	subl	$1,%ecx
	jnz	1b

// We've copied the string.  Now zero the rest of the buffer, using bzero().
//	%rdi = dest ptr (1st argument)
//	%rdx = buffer length remaining
//	%r8  = original dest ptr (must survive the call)

LZeroBuffer:
	movq	%rdx,%rsi		// remaining buffer size (2nd argument)
	pushq	%r8			// save return value (caller-saved reg); push also aligns stack to 16B
	call	_bzero
	popq	%r8			// recover original dest ptr and restore stack

LDone:
	movq	%r8,%rax		// original dest ptr is return value
	ret
184