/*
 * Copyright (c) 2008 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * Bzero, tuned for processors with SSE4.2 and 64-byte cache lines, i.e. Nehalem.
 * We don't actually use SSE4.2, but rather use it to identify Nehalem.
 *
 * We do not use nontemporal operations, but use MOVDQA in preference to REP/STOS.
 *
 * This routine is also used for memset(p,0,n), which is a common case
 * since gcc sometimes silently maps bzero() into memset().  As a result,
 * we always load the original ptr into %eax before returning.
 */

#define kShort		80		// too short to bother with SSE (must be >=80)


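/*
 * Illustrative C-level sketch only (not assembled, and not Apple's reference
 * code): roughly the strategy the assembly below implements.  bzero_sketch
 * and its loops are hypothetical stand-ins for the hand-scheduled code.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	void bzero_sketch(void *b, size_t len) {
 *		unsigned char *p = b;
 *		if (len > 80) {				// kShort
 *			while ((uintptr_t)p & 15) {	// align dest to 16 bytes
 *				*p++ = 0; len--;	// (done below with byte, then dword stores)
 *			}
 *			for (; len >= 64; len -= 64, p += 64)
 *				for (int i = 0; i < 64; i++)	// done below with 4 MOVDQA stores
 *					p[i] = 0;
 *		}
 *		while (len--)		// short operands and the 0..63 byte residual
 *			*p++ = 0;	// (the real code also uses dword stores here)
 *	}
 */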
        .text
        .align  5, 0x90
Lbzero_sse42:                           // void	bzero(void *b, size_t len);
	pushl	%ebp			// set up a frame for backtraces
	movl	%esp,%ebp
        pushl   %edi
        movl    8(%ebp),%edi            // get ptr
        movl    12(%ebp),%edx           // get length

        xorl    %eax,%eax               // set fill data to 0
        cmpl    $(kShort),%edx          // long enough for SSE?
        jg	LNotShort               // yes

// Here for short operands or the end of long ones.
//      %edx = length
//      %edi = ptr
//      %eax = zero

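// Strategy for this path: if the length is at least 12, zero single bytes
// until the pointer is 4-byte aligned, store aligned doublewords, then finish
// the 0..3 trailing bytes; otherwise just use a simple byte loop.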
Lshort:
	cmpl	$12,%edx		// long enough to word align?
	jge	3f			// yes
	test	%edx,%edx		// length==0?
	jz	6f
1:
	movb	%al,(%edi)		// zero a byte
	inc	%edi
	dec	%edx
	jnz	1b
	jmp	6f
2:
	movb	%al,(%edi)		// zero a byte
	inc	%edi
	dec	%edx
3:
	test	$3,%edi			// is ptr doubleword aligned?
	jnz	2b			// no
	movl	%edx,%ecx		// copy length
	shrl	$2,%edx			// #doublewords to store
4:
	movl	%eax,(%edi)		// zero an aligned doubleword
	addl	$4,%edi
	dec	%edx
	jnz	4b
	andl	$3,%ecx			// mask down to #bytes at end (0..3)
	jz	6f			// none
5:
	movb	%al,(%edi)		// zero a byte
	inc	%edi
	dec	%ecx
	jnz	5b
6:
	movl	8(%ebp),%eax		// get return value in case this was a call of memset()
        popl    %edi
	popl	%ebp
        ret


// We will be using SSE, so align ptr.
//      %edx = length
//      %edi = ptr
//      %eax = zero

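// Zero one byte at a time until the pointer is 4-byte aligned, then one
// doubleword at a time until it is 16-byte aligned.  Since %edx > kShort
// here, at most 15 bytes are consumed and the length cannot underflow.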
LNotShort:
	testl	$3,%edi			// 4-byte aligned?
	jz	2f			// yes
	movb	%al,(%edi)		// zero another byte
	incl	%edi
	decl	%edx
	jmp	LNotShort
1:					// zero doublewords until 16-byte aligned
	movl	%eax,(%edi)
	addl	$4,%edi
	subl	$4,%edx
2:
	testl	$15,%edi		// 16-byte aligned?
	jnz	1b			// no


// Destination is now 16-byte aligned.  Prepare to loop over 64-byte chunks.
//      %edx = length
//      %edi = ptr
//      %eax = zero

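// %ecx is loaded with minus the number of bytes the 64-byte loop will zero,
// and %edi is advanced past that region, so (%edi,%ecx) walks forward as
// %ecx counts up toward zero; the ADDL then serves as both pointer update
// and loop test.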
LDestAligned:
        movl    %edx,%ecx
        andl    $63,%edx                // mask down to residual length (0..63)
        andl    $-64,%ecx               // get #bytes we will zero in this loop
        pxor    %xmm0,%xmm0             // zero an SSE register
        addl    %ecx,%edi               // increment ptr by length to move
        negl    %ecx			// negate length to move
	jmp	1f

// Loop over 64-byte chunks, storing into cache.

	.align	4,0x90			// keep inner loops 16-byte aligned
1:
        movdqa  %xmm0,(%edi,%ecx)
        movdqa  %xmm0,16(%edi,%ecx)
        movdqa  %xmm0,32(%edi,%ecx)
        movdqa  %xmm0,48(%edi,%ecx)
        addl    $64,%ecx
        jne     1b

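// The 0..63 residual bytes left in %edx are finished by the Lshort code,
// which also reloads the original pointer into %eax for the memset() return value.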
	jmp	Lshort


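// The descriptor below registers this routine for the commpage slot
// _COMM_PAGE_BZERO; it is selected only on processors whose capability
// bits include kHasSSE4_2 (used here purely as a Nehalem identifier).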
	COMMPAGE_DESCRIPTOR(bzero_sse42,_COMM_PAGE_BZERO,kHasSSE4_2,0)
